Compare commits

..

57 Commits

Author SHA1 Message Date
ee1dcfea25 Test bug similar to 307c1731c1
2023-06-02 17:36:56 +03:00
150968070f Slightly improve some debug prints
2023-05-29 01:04:16 +03:00
cdfc74665b Close client FDs only when destroying the client, after handling all async reads/writes
Fixes "Client XX command out of sync" sometimes happening on reconnections
2023-05-25 00:52:43 +03:00
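The fix above defers closing a client's socket until the client object is destroyed, after all queued async reads and writes have been handled. The general pattern — never close an FD while operations are still in flight, because the kernel may reuse the FD number for a new connection — can be sketched as follows. This is a hypothetical illustration, not Vitastor's actual client code:

```cpp
#include <unistd.h>

// Sketch of deferred FD close: the FD is closed only when the connection
// is being destroyed AND no async operations remain in flight.
struct Connection
{
    int fd = -1;
    int pending_ops = 0;
    bool stopped = false;

    void start_op() { pending_ops++; }

    void finish_op()
    {
        pending_ops--;
        // Only destroy (and close the FD) after the last in-flight
        // operation completes. Closing earlier would let the FD number
        // be reused while old completions are still being processed,
        // mixing up replies between connections.
        if (stopped && pending_ops == 0)
            destroy();
    }

    void stop()
    {
        stopped = true;
        if (pending_ops == 0)
            destroy();
    }

    void destroy()
    {
        if (fd >= 0)
        {
            close(fd);
            fd = -1;
        }
    }
};
```

With this structure, a reconnect that races with in-flight I/O cannot observe replies on a recycled FD, which is the class of "command out of sync" bug the commit message describes.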
3f60fecd7c Fix typo 2023-05-21 18:37:01 +03:00
3b4cf29e65 Release 0.9.0
New features:
- Scrubbing! Check documentation: [auto_scrub](src/branch/master/docs/config/osd.en.md#auto_scrub)
- Document online-updatable configuration parameters

Bug fixes:
- Fix NaN during PG optimisation if there are non-existent OSDs in node_placement
- Fix monitor crash on pool deletion
- Clear journal_device and meta_device before initialising the next OSD in automatic mode
- Sync unsynced deletes before overwriting them with a lower version
  (reproduced mostly or only after scrubbing)
2023-05-21 15:07:14 +03:00
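The auto_scrub parameter linked above is an online-updatable OSD setting. As a hedged sketch (assuming Vitastor's JSON configuration file, typically /etc/vitastor/vitastor.conf), enabling automatic scrubbing might look like:

```json
{
  "auto_scrub": true
}
```

See the linked osd.en.md documentation for the authoritative parameter list and defaults.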
eeaba11ebd Use fio 3.27-8 for alma9 2023-05-21 14:48:26 +03:00
aea567cfbd Slightly improve scrub docs
2023-05-21 12:52:30 +03:00
ce02f47de6 Allow to disable scrub_find_best 2023-05-21 12:33:38 +03:00
5fd3208616 Add version archive link to docs 2023-05-21 11:47:33 +03:00
5997b76535 Remove -runtime=10 from fio params in test_scrub, it was breaking the test in CI :D
2023-05-21 11:03:59 +03:00
f1961157f0 Fix brute-force error locator for EC n+k with k > 2
2023-05-21 00:57:14 +03:00
88c1ba0790 Fix compile errors with gcc 10
2023-05-20 23:20:09 +03:00
b5bd611683 Add scrub tests to CI 2023-05-20 23:20:09 +03:00
fa90b5a4e7 Schedule automatic scrubs correctly (not just after previous scrub) 2023-05-20 23:20:09 +03:00
8d40ad99a6 Add scrub documentation 2023-05-20 23:19:39 +03:00
3475772b07 Add configuration online update documentation 2023-05-20 23:19:39 +03:00
25fcedf6e7 Enable vitastor-cli fix in test 2023-05-20 23:19:39 +03:00
6ca20aa194 Allow scrub to fix corrupted object states 2023-05-20 23:19:39 +03:00
4bfd994341 Sync unsynced deletes before overwriting them with a lower version 2023-05-20 23:19:39 +03:00
59e959dcbb Do not die when "different versions are returned from subops" 2023-05-20 23:19:39 +03:00
a9581f0739 Handle dirty deletes during read correctly O_o 2023-05-20 23:19:39 +03:00
105a405b0a Implement vitastor-cli fix 2023-05-20 23:19:39 +03:00
d55d7d5326 Add scrub test 2023-05-20 23:19:39 +03:00
0e5d0e02a9 Add "vitastor-cli describe" command 2023-05-20 23:19:39 +03:00
0439981a66 Implement "describe object(s)" operation
Required to implement fixing inconsistent objects in vitastor-cli
2023-05-20 23:19:39 +03:00
6648f6bb6e Implement ambiguity detection during scrub 2023-05-20 23:19:39 +03:00
281be547eb Implement brute-force error locator for EC 2023-05-20 23:19:39 +03:00
0c78dd7178 Add no_scrub flag 2023-05-20 23:19:39 +03:00
3c924397e7 Store next scrub timestamp instead of last scrub timestamp 2023-05-20 23:19:39 +03:00
c3bd26193d Implement PG scrub runner 2023-05-20 23:19:39 +03:00
43b77d7619 Implement scrubbing "data path" - OSD_OP_SCRUB 2023-05-20 23:19:39 +03:00
a6d846863b Add min/max stripe and limit to OP_LIST 2023-05-20 23:19:39 +03:00
8dc427b43c Retry failed reads (including chained and RMW) from other replicas 2023-05-20 23:19:39 +03:00
bf2112653b Refcount object_states 2023-05-20 23:19:39 +03:00
0538a484b3 Add corrupted object state 2023-05-20 23:19:39 +03:00
97720fa6b4 Remove unused capture
2023-05-20 22:58:51 +03:00
e60e352df6 Improve vitastor-nbd documentation 2023-05-20 22:58:51 +03:00
98077a1712 Remove unused dependencies from CSI 2023-05-18 11:54:47 +03:00
1c7d53996d Reweight only 2 OSDs to zero in test_rebalance_verify, otherwise the test does not pass with EC 3+2
2023-05-18 00:42:40 +03:00
2ca07b1ea7 Raise timeout in test_rebalance_verify
2023-05-17 01:58:01 +03:00
022176aa98 Fix NaN during PG optimisation if there are nonexisting OSDs in node_placement
2023-05-17 01:20:30 +03:00
120e3fa7bc Fix pool deletion
2023-05-17 00:45:59 +03:00
629999f789 Clear journal_device and meta_device before initialising the next OSD in automatic mode 2023-05-15 23:58:55 +03:00
93eca11ba2 Fix rhel 9 installation docs 2023-05-15 13:09:18 +03:00
5a9e1ede52 Release 0.8.9
- The tests are now stable and run in a CI system based on Gitea CI
- The release includes final bug fixes for EC:
  - Implement missing EC recovery of allocation bitmap when built with ISA-L
  - Fix broken snapshot export with EC (allocation bitmap reads were giving incorrect results previously)
- Also fixed bugs manifesting under heavy load:
  - Fix monitor possibly applying incorrect PG history on retries
- Fix monitor incorrectly changing PG count when last_clean_pgs contains fewer PGs than the new number
  - Allow writes to wait for free space again, but now correctly (previously dropped in 0.8.2)
  - Fix a rare segfault in client (handle client stop during incoming stream handling in 1 more place)
  - Make monitor correctly handle etcd connection errors - it could die instead of connecting to another etcd
  - Fix OSD rarely being unable to report PG states after a PG was taken over by another OSD
- Fixed return code for incomplete EC objects (now EIO) and made cluster client retry this error
- Made other small changes for tests: timeouts, nice/ionice for etcd, waiting conditions, NBD device checks and so on
2023-05-14 01:25:09 +03:00
1c9a188600 Add tests to CI
2023-05-14 00:06:09 +03:00
de3e609166 Add a FIXME about QEMU driver thread safety 2023-05-14 00:06:09 +03:00
11481170f5 Add a FIXME about ENOSPC 2023-05-13 23:59:44 +03:00
e69d459d43 Allow rebalance to start in test_interrupted_rebalance, raise etcd start timeout 2023-05-13 15:16:28 +03:00
da82754baa Wait for conditions in test_move_reappear instead of waiting a fixed amount of time 2023-05-12 23:18:07 +03:00
d356aca030 Add missing $NO_SAME OSD argument to test_splitbrain 2023-05-12 23:18:07 +03:00
04a273d213 Raise NBD timeout in tests 2023-05-12 23:18:07 +03:00
6442010f93 Skip offline PGs during state reporting when the state is already deleted or taken over by another OSD
This fixes OSDs being unable to report PG states in rare conditions
2023-05-12 23:17:45 +03:00
6f4dc16c59 Handle etcd connection errors correctly in mon (unhandled error events) 2023-05-11 11:02:44 +03:00
ce4a8067b5 Handle client stop during incoming stream handling in 1 more place 2023-05-11 01:53:41 +03:00
e431ecb715 Make tests more stable in CI 2023-05-11 01:53:41 +03:00
8cac795445 Return EIO instead of EINVAL for incomplete EC objects 2023-05-11 01:15:23 +03:00
113 changed files with 4673 additions and 606 deletions


@@ -0,0 +1,36 @@
FROM node:16-bullseye
WORKDIR /root
ADD ./docker/vitastor.gpg /etc/apt/trusted.gpg.d
RUN echo 'deb http://deb.debian.org/debian bullseye-backports main' >> /etc/apt/sources.list; \
    echo 'deb http://vitastor.io/debian bullseye main' >> /etc/apt/sources.list; \
    echo >> /etc/apt/preferences; \
    echo 'Package: *' >> /etc/apt/preferences; \
    echo 'Pin: release a=bullseye-backports' >> /etc/apt/preferences; \
    echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
    echo >> /etc/apt/preferences; \
    echo 'Package: *' >> /etc/apt/preferences; \
    echo 'Pin: origin "vitastor.io"' >> /etc/apt/preferences; \
    echo 'Pin-Priority: 1000' >> /etc/apt/preferences; \
    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
    liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get -y install jq lp-solve sudo
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \
    mkdir qemu-build; \
    cd qemu-build; \
    dpkg-source -x /root/qemu*.dsc; \
    cd qemu*/; \
    debian/rules configure-qemu || debian/rules b/configure-stamp; \
    cd b/qemu; \
    make -j8 config-poison.h || true; \
    make -j8 qapi/qapi-builtin-types.h


@@ -0,0 +1,16 @@
FROM git.yourcmc.ru/vitalif/vitastor/buildenv
ADD . /root/vitastor
RUN set -e -x; \
    mkdir -p /root/fio-build/; \
    cd /root/fio-build/; \
    dpkg-source -x /root/fio*.dsc; \
    cd /root/vitastor; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    ln -s /root/qemu-build/qemu-*/ ./qemu; \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
    mkdir build; \
    cd build; \
    cmake .. -DWITH_ASAN=yes -DWITH_QEMU=yes; \
    make -j16

.gitea/workflows/test.yml Normal file

@@ -0,0 +1,678 @@
name: Test
on:
  push:
    branches:
    - '*'
    paths:
    - '.gitea/**'
    - 'src/**'
    - 'mon/**'
    - 'json11'
    - 'cpp-btree'
    - 'tests/**'
env:
  BUILDENV_IMAGE: git.yourcmc.ru/vitalif/vitastor/buildenv
  TEST_IMAGE: git.yourcmc.ru/vitalif/vitastor/test
  OSD_ARGS: '--etcd_quick_timeout 2000'
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true
jobs:
  buildenv:
    runs-on: ubuntu-latest
    container: git.yourcmc.ru/vitalif/gitea-ci-dind
    steps:
    - uses: actions/checkout@v3
    - name: Build and push
      run: |
        set -ex
        if ! docker manifest inspect $BUILDENV_IMAGE >/dev/null; then
          docker build -t $BUILDENV_IMAGE -f .gitea/workflows/buildenv.Dockerfile .
          docker login git.yourcmc.ru -u vitalif -p "${{secrets.TOKEN}}"
          docker push $BUILDENV_IMAGE
        fi
  build:
    runs-on: ubuntu-latest
    needs: buildenv
    container: git.yourcmc.ru/vitalif/gitea-ci-dind
    steps:
    - uses: actions/checkout@v3
      with:
        submodules: true
    - name: Build and push
      run: |
        set -ex
        if ! docker manifest inspect $TEST_IMAGE:$GITHUB_SHA >/dev/null; then
          docker build -t $TEST_IMAGE:$GITHUB_SHA -f .gitea/workflows/test.Dockerfile .
          docker login git.yourcmc.ru -u vitalif -p "${{secrets.TOKEN}}"
          docker push $TEST_IMAGE:$GITHUB_SHA
        fi
  make_test:
    runs-on: ubuntu-latest
    needs: build
    container: ${{env.TEST_IMAGE}}:${{github.sha}}
    steps:
    # leak sanitizer sometimes crashes
    - run: cd /root/vitastor/build && ASAN_OPTIONS=detect_leaks=0 make -j16 test
  test_add_osd:
    runs-on: ubuntu-latest
    needs: build
    container: ${{env.TEST_IMAGE}}:${{github.sha}}
    steps:
    - name: Run test
      id: test
      timeout-minutes: 3
      run: /root/vitastor/tests/test_add_osd.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
      run: |
        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
          echo "-------- $i --------"
          cat $i
          echo ""
        done
  test_cas:
    runs-on: ubuntu-latest
    needs: build
    container: ${{env.TEST_IMAGE}}:${{github.sha}}
    steps:
    - name: Run test
      id: test
      timeout-minutes: 3
      run: /root/vitastor/tests/test_cas.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
      run: |
        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
          echo "-------- $i --------"
          cat $i
          echo ""
        done
  test_change_pg_count:
    runs-on: ubuntu-latest
    needs: build
    container: ${{env.TEST_IMAGE}}:${{github.sha}}
    steps:
    - name: Run test
      id: test
      timeout-minutes: 3
      run: /root/vitastor/tests/test_change_pg_count.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
      run: |
        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
          echo "-------- $i --------"
          cat $i
          echo ""
        done
  test_change_pg_count_ec:
    runs-on: ubuntu-latest
    needs: build
    container: ${{env.TEST_IMAGE}}:${{github.sha}}
    steps:
    - name: Run test
      id: test
      timeout-minutes: 3
      run: SCHEME=ec /root/vitastor/tests/test_change_pg_count.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
      run: |
        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
          echo "-------- $i --------"
          cat $i
          echo ""
        done
  test_change_pg_size:
    runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_change_pg_size.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_create_nomaxid:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_create_nomaxid.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_etcd_fail:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: /root/vitastor/tests/test_etcd_fail.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_failure_domain:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_failure_domain.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: SCHEME=ec /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance_ec_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_interrupted_rebalance.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_minsize_1:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_minsize_1.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_move_reappear:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_move_reappear.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rebalance_verify_ec_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_rebalance_verify.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_rm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_rm.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_snapshot.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec /root/vitastor/tests/test_snapshot.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_splitbrain:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_splitbrain.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=xor /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write_no_same:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_write_no_same.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_pg_size_2:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: PG_SIZE=2 /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: SCHEME=ec /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_imm_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: IMMEDIATE_COMMIT=1 SCHEME=ec /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_scrub.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub_zero_osd_2:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: ZERO_OSD=2 /root/vitastor/tests/test_scrub.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=xor /root/vitastor/tests/test_scrub.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub_pg_size_3:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: PG_SIZE=3 /root/vitastor/tests/test_scrub.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec /root/vitastor/tests/test_scrub.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub_ec:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=ec /root/vitastor/tests/test_scrub.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done


@@ -0,0 +1,68 @@
#!/usr/bin/perl
use strict;
for my $line (<>)
{
if ($line =~ /\.\/(test_[^\.]+)/s)
{
chomp $line;
my $test_name = $1;
my $timeout = 3;
if ($test_name eq 'test_etcd_fail' || $test_name eq 'test_heal' || $test_name eq 'test_interrupted_rebalance')
{
$timeout = 10;
}
while ($line =~ /([^\s=]+)=(\S+)/gs)
{
if ($1 eq 'SCHEME' && $2 eq 'ec')
{
$test_name .= '_ec';
}
elsif ($1 eq 'SCHEME' && $2 eq 'xor')
{
$test_name .= '_xor';
}
elsif ($1 eq 'IMMEDIATE_COMMIT')
{
$test_name .= '_imm';
}
else
{
$test_name .= '_'.lc($1).'_'.$2;
}
}
$line =~ s!\./test_!/root/vitastor/tests/test_!;
# Gitea CI doesn't support artifacts yet, lol
#- name: Upload results
# uses: actions/upload-artifact\@v3
# if: always()
# with:
# name: ${test_name}_result
# path: |
# /root/vitastor/testdata
# !/root/vitastor/testdata/*.bin
# retention-days: 5
print <<"EOF"
$test_name:
runs-on: ubuntu-latest
needs: build
container: \${{env.TEST_IMAGE}}:\${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: $timeout
run: $line
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- \$i --------"
cat \$i
echo ""
done
EOF
;
}
}
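The job-name derivation performed by the generator above (suffixing `_ec`/`_xor` for `SCHEME`, `_imm` for `IMMEDIATE_COMMIT`, and `_var_value` otherwise) can be sketched in Python — an illustration of the rule only, not part of the repository:

```python
import re

def job_name(cmdline: str) -> str:
    # Extract the base test name, e.g. "./test_scrub.sh" -> "test_scrub".
    name = re.search(r'\./(test_[^.]+)', cmdline).group(1)
    # Append a suffix for each VAR=value prefix, mirroring the Perl generator.
    for var, val in re.findall(r'([^\s=]+)=(\S+)', cmdline):
        if var == 'SCHEME' and val in ('ec', 'xor'):
            name += '_' + val
        elif var == 'IMMEDIATE_COMMIT':
            name += '_imm'
        else:
            name += '_' + var.lower() + '_' + val
    return name

print(job_name('SCHEME=ec ./test_scrub.sh'))   # test_scrub_ec
print(job_name('PG_SIZE=2 ./test_heal.sh'))    # test_heal_pg_size_2
```

This reproduces the job names seen in the workflow above, e.g. `test_heal_imm_ec` for `IMMEDIATE_COMMIT=1 SCHEME=ec ./test_heal.sh`.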


@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "0.8.8")
set(VERSION "0.9.0")
add_subdirectory(src)


@@ -1,4 +1,4 @@
VERSION ?= v0.8.8
VERSION ?= v0.9.0
all: build push


@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.8
image: vitalif/vitastor-csi:v0.9.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"


@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.8
image: vitalif/vitastor-csi:v0.9.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"


@@ -4,25 +4,10 @@ go 1.15
require (
github.com/container-storage-interface/spec v1.4.0
github.com/coreos/bbolt v0.0.0-00010101000000-000000000000 // indirect
github.com/coreos/etcd v3.3.25+incompatible // indirect
github.com/coreos/go-semver v0.3.0 // indirect
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf // indirect
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f // indirect
github.com/dustin/go-humanize v1.0.0 // indirect
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
github.com/gorilla/websocket v1.4.2 // indirect
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 // indirect
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect
github.com/jonboulle/clockwork v0.2.2 // indirect
github.com/kubernetes-csi/csi-lib-utils v0.9.1
github.com/soheilhy/cmux v0.1.5 // indirect
github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 // indirect
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 // indirect
go.etcd.io/bbolt v0.0.0-00010101000000-000000000000 // indirect
go.etcd.io/etcd v3.3.25+incompatible
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/grpc v1.33.1
k8s.io/klog v1.0.0
k8s.io/utils v0.0.0-20210305010621-2afb4311ab10


@@ -31,14 +31,11 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
@@ -46,25 +43,12 @@ github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMn
github.com/container-storage-interface/spec v1.2.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
github.com/container-storage-interface/spec v1.4.0 h1:ozAshSKxpJnYUfmkpZCTYyF/4MYeYlhdXbAvPvfGmkg=
github.com/container-storage-interface/spec v1.4.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
github.com/coreos/bbolt v1.3.5 h1:XFv7xaq7701j8ZSEzR28VohFYSlyakMyqNMU5FQH6Ac=
github.com/coreos/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ=
github.com/coreos/etcd v3.3.25+incompatible h1:0GQEw6h3YnuOVdtwygkIfJ+Omx0tZ8/QkVyXI4LkbeY=
github.com/coreos/etcd v3.3.25+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmfM=
github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU=
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f h1:lBNOc5arjvs8E5mO2tbpBpLoyyu8B6e44T7hJy6potg=
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM=
github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc=
github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
@@ -73,7 +57,6 @@ github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLi
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
@@ -88,14 +71,10 @@ github.com/go-openapi/spec v0.0.0-20160808142527-6aced65f8501/go.mod h1:J8+jY1nA
github.com/go-openapi/swag v0.0.0-20160704191624-1d0bd113de87/go.mod h1:DXUve3Dpr1UfpPtxFw+EFuQ41HhCWZfha5jSVRG7C7I=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls=
github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7 h1:5ZkaAPbicIKTF2I64qf5Fh8Aa83Q/dnOafMYV0OMwjA=
github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
@@ -113,7 +92,6 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
@@ -127,38 +105,24 @@ github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OI
github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY=
github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg=
github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc=
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 h1:+9834+KizmvFV7pXQGSXQTsaWhq2GjuNUt0aUU0YBYw=
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y=
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 h1:Ovs26xHkKqVztRpIrF/92BcuyuQ/YW4NSIpoGtfXNho=
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo=
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9qUBdQ=
github.com/jonboulle/clockwork v0.2.2/go.mod h1:Pkfl5aHPm1nk2H9h0bjmnJD/BcgbGXUBGnn1kMkgxc8=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
@@ -171,14 +135,11 @@ github.com/kubernetes-csi/csi-lib-utils v0.9.1 h1:sGq6ifVujfMSkfTsMZip44Ttv8SDXv
github.com/kubernetes-csi/csi-lib-utils v0.9.1/go.mod h1:8E2jVUX9j3QgspwHXa6LwyN7IHQDjW9jX3kwoWnSC+M=
github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 h1:I0XW9+e1XWDxdcEniV4rQAIOPUGDq67JSCiRCgGCZLI=
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/moby/term v0.0.0-20200312100748-672ec06f55cd/go.mod h1:DdlQx2hp0Ss5/fLikoLlEeIYiATotOjgB//nb973jeo=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI=
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
@@ -188,38 +149,28 @@ github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+W
github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.7.1 h1:NTGy1Ja9pByO+xAeH/qiWnLrKtr3hJPNjaVUwnjpdpA=
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M=
github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/common v0.10.0 h1:RyRA7RzGXQZiW+tGMr7sxa85G1z0yOpM1qq5c8lNawc=
github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.1.3 h1:F0+tqvhOksq22sc6iCHF5WGlWjdwj92p0udFh1VFBS8=
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js=
github.com/soheilhy/cmux v0.1.5/go.mod h1:T7TcVDs9LWfQgPlPsdngu6I6QIoyIFZDDC6sNE1GqG0=
github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk=
github.com/spf13/pflag v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
@@ -231,24 +182,11 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 h1:uruHq4dN7GR16kFc5fp3d1RIYzJW5onx8Ybykw2YQFA=
github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8=
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0=
go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ=
go.etcd.io/etcd v3.3.25+incompatible h1:V1RzkZJj9LqsJRy+TUBgpWSbZXITLB819lstuTFoZOY=
go.etcd.io/etcd v3.3.25+incompatible/go.mod h1:yaeTdrJi5lOmYerz05bd8+V7KubZs8YSFZfzsF9A6aI=
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.uber.org/atomic v1.4.0 h1:cxzIVoETapQEqDhQu3QfnvXAV4AlzcvUCxkVUFw3+EU=
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/multierr v1.1.0 h1:HoEmRHQPVSqub6w2z2d2EOVs2fjyFRGyofhKuyDq0QI=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/zap v1.10.0 h1:ORx85nbTijNz8ljznvCMR1ZBIPKFn3jQrag10X2AsuM=
go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
@@ -256,7 +194,6 @@ golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8U
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 h1:psW17arqaxU48Z5kZ0CQnkZWQJsqcURM6tKiBApRjXI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -276,8 +213,6 @@ golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCc
golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc=
golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -291,26 +226,20 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR
golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200707034311-ab3426394381 h1:VXak5I6aEWmAXeQjA+QSZzlgNrpq9mjcfDemuexIKsU=
golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb h1:eBmm0M9fYhWpKZLjQUUKka/LtIxf46G4fxeEz5KJr9U=
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -326,11 +255,9 @@ golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4 h1:5/PjkGUjvEU5Gl6BxmvKRPpqo2uNMv4rcHBMwzk/st8=
golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -341,7 +268,6 @@ golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20181011042414-1f849cf54d09/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -360,14 +286,10 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
@@ -388,8 +310,6 @@ google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8=
google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc=
google.golang.org/genproto v0.0.0-20200423170343-7949de9c1215/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013 h1:+kGHl1aib/qcwaRi1CbqBZ1rk19r85MNUf8HaBghugY=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/grpc v1.25.1 h1:wdKvqQk7IttEw92GoRyKG2IDrUIpgpj6H6m81yfeMW0=
@@ -415,7 +335,6 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
@@ -444,5 +363,4 @@ k8s.io/utils v0.0.0-20210305010621-2afb4311ab10/go.mod h1:jPW/WVKK9YHAvNhRxK0md/
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
sigs.k8s.io/structured-merge-diff/v4 v4.0.1/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw=
sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o=
sigs.k8s.io/yaml v1.2.0 h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q=
sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc=


@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.8"
vitastorCSIDriverVersion = "0.9.0"
)
// Config struct fills the parameters of request or user input

debian/changelog vendored

@@ -1,10 +1,10 @@
vitastor (0.8.8-1) unstable; urgency=medium
vitastor (0.9.0-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.8-1) unstable; urgency=medium
vitastor (0.9.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation


@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.8; \
cd vitastor-0.8.8; \
cp -r /root/vitastor vitastor-0.9.0; \
cd vitastor-0.9.0; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.8.orig.tar.xz vitastor-0.8.8; \
cd vitastor-0.8.8; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.0.orig.tar.xz vitastor-0.9.0; \
cd vitastor-0.9.0; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

Binary file not shown.


@@ -25,11 +25,16 @@ running if required parameters are specified.
## etcd_address
- Type: string or array of strings
- Can be changed online: yes
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
Note that https is not supported for etcd connections yet.
etcd connection endpoints can be changed online by updating the global
configuration in etcd itself - this allows switching the cluster to new
etcd addresses without downtime.
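Since `etcd_address` accepts either a comma-delimited string or a JSON array, a client has to normalize both forms. A minimal sketch of such normalization (the helper name is hypothetical, this is not Vitastor's actual parser):

```python
import json

def parse_etcd_address(value):
    """Normalize an etcd_address value into a list of endpoint strings.
    Accepts a JSON array, a comma-delimited string, or an existing list."""
    if isinstance(value, str):
        stripped = value.strip()
        if stripped.startswith("["):
            return json.loads(stripped)  # JSON array form
        return [ep.strip() for ep in stripped.split(",") if ep.strip()]
    return list(value)  # already a list of endpoints

parse_etcd_address('10.0.115.10:2379/v3,10.0.115.11:2379/v3')
# → ['10.0.115.10:2379/v3', '10.0.115.11:2379/v3']
```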
## etcd_prefix
- Type: string
@@ -42,5 +47,6 @@ example, use a single etcd cluster for multiple Vitastor clusters.
- Type: integer
- Default: 0
- Can be changed online: yes
Log level. Raise if you want more verbose output.


@@ -24,10 +24,14 @@
## etcd_address
- Тип: строка или массив строк
- Можно менять на лету: да
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
самом etcd - это позволяет переключить кластер на новые etcd без остановки.
## etcd_prefix
- Тип: строка
@@ -41,5 +45,6 @@
- Тип: целое число
- Значение по умолчанию: 0
- Можно менять на лету: да
Уровень логгирования. Повысьте, если хотите более подробный вывод.


@@ -153,6 +153,7 @@ operations.
- Type: seconds
- Default: 5
- Minimum: 1
- Can be changed online: yes
Interval before attempting to reconnect to an unavailable OSD.
@@ -161,6 +162,7 @@ Interval before attempting to reconnect to an unavailable OSD.
- Type: seconds
- Default: 5
- Minimum: 1
- Can be changed online: yes
Timeout for OSD connection attempts.
@@ -169,6 +171,7 @@ Timeout for OSD connection attempts.
- Type: seconds
- Default: 5
- Minimum: 1
- Can be changed online: yes
OSD connection inactivity time after which clients and other OSDs send
keepalive requests to check state of the connection.
@@ -178,6 +181,7 @@ keepalive requests to check state of the connection.
- Type: seconds
- Default: 5
- Minimum: 1
- Can be changed online: yes
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
within this time, the connection to it is dropped and a reconnection attempt
@@ -188,6 +192,7 @@ is scheduled.
- Type: milliseconds
- Default: 500
- Minimum: 50
- Can be changed online: yes
OSDs respond to clients with a special error code when they receive I/O
requests for a PG that's not synchronized and started. This parameter sets
@@ -197,6 +202,7 @@ the time for the clients to wait before re-attempting such I/O requests.
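The client-side behaviour described above can be sketched as a simple retry loop (illustrative only - the exception type and I/O callback are assumptions, not Vitastor's actual client code):

```python
import time

class PGNotUpError(Exception):
    """Stand-in for the special 'PG not started' error code described above."""

def submit_with_retry(do_io, up_wait_retry_interval_ms=500):
    """Re-attempt an I/O request after the configured pause while the
    target PG isn't synchronized and started yet."""
    while True:
        try:
            return do_io()
        except PGNotUpError:
            time.sleep(up_wait_retry_interval_ms / 1000)
```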
- Type: integer
- Default: 5
- Can be changed online: yes
Maximum number of attempts for etcd requests which can't be retried
indefinitely.
@@ -205,6 +211,7 @@ indefinitely.
- Type: milliseconds
- Default: 1000
- Can be changed online: yes
Timeout for etcd requests which should complete quickly, like lease refresh.
@@ -212,6 +219,7 @@ Timeout for etcd requests which should complete quickly, like lease refresh.
- Type: milliseconds
- Default: 5000
- Can be changed online: yes
Timeout for etcd requests which are allowed to wait for some time.
@@ -219,6 +227,7 @@ Timeout for etcd requests which are allowed to wait for some time.
- Type: seconds
- Default: max(30, etcd_report_interval*2)
- Can be changed online: yes
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
etcd_report_interval to guarantee that keepalive actually works.
@@ -227,6 +236,7 @@ etcd_report_interval to guarantee that keepalive actually works.
- Type: seconds
- Default: 30
- Can be changed online: yes
etcd websocket ping interval required to keep the connection alive and
detect disconnections quickly.
@@ -235,6 +245,7 @@ detect disconnections quickly.
- Type: integer
- Default: 33554432
- Can be changed online: yes
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an


@@ -161,6 +161,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: секунды
- Значение по умолчанию: 5
- Минимальное значение: 1
- Можно менять на лету: да
Время ожидания перед повторной попыткой соединиться с недоступным OSD.
@@ -169,6 +170,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: секунды
- Значение по умолчанию: 5
- Минимальное значение: 1
- Можно менять на лету: да
Максимальное время ожидания попытки соединения с OSD.
@@ -177,6 +179,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: секунды
- Значение по умолчанию: 5
- Минимальное значение: 1
- Можно менять на лету: да
Время неактивности соединения с OSD, после которого клиенты или другие OSD
посылают запрос проверки состояния соединения.
@@ -186,6 +189,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: секунды
- Значение по умолчанию: 5
- Минимальное значение: 1
- Можно менять на лету: да
Максимальное время ожидания ответа на запрос проверки состояния соединения.
Если OSD не отвечает за это время, соединение отключается и производится
@@ -196,6 +200,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: миллисекунды
- Значение по умолчанию: 500
- Минимальное значение: 50
- Можно менять на лету: да
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
@@ -207,6 +212,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: целое число
- Значение по умолчанию: 5
- Можно менять на лету: да
Максимальное число попыток выполнения запросов к etcd для тех запросов,
которые нельзя повторять бесконечно.
@@ -215,6 +221,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: миллисекунды
- Значение по умолчанию: 1000
- Можно менять на лету: да
Максимальное время выполнения запросов к etcd, которые должны завершаться
быстро, таких, как обновление резервации (lease).
@@ -223,6 +230,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: миллисекунды
- Значение по умолчанию: 5000
- Можно менять на лету: да
Максимальное время выполнения запросов к etcd, для которых не обязательно
гарантировать быстрое выполнение.
@@ -231,6 +239,7 @@ OSD в любом случае согласовывают реальное зн
- Тип: секунды
- Значение по умолчанию: max(30, etcd_report_interval*2)
- Можно менять на лету: да
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
etcd_report_interval, чтобы keepalive гарантированно работал.
@@ -239,6 +248,7 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
- Тип: секунды
- Значение по умолчанию: 30
- Можно менять на лету: да
Интервал проверки живости вебсокет-подключений к etcd.
@@ -246,6 +256,7 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет


@@ -7,7 +7,8 @@
# Runtime OSD Parameters
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
initialization and can be changed with an OSD restart.
initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
- [etcd_report_interval](#etcd_report_interval)
- [run_primary](#run_primary)
@@ -38,6 +39,14 @@ initialization and can be changed with an OSD restart.
- [throttle_target_parallelism](#throttle_target_parallelism)
- [throttle_threshold_us](#throttle_threshold_us)
- [osd_memlock](#osd_memlock)
- [auto_scrub](#auto_scrub)
- [no_scrub](#no_scrub)
- [scrub_interval](#scrub_interval)
- [scrub_queue_depth](#scrub_queue_depth)
- [scrub_sleep](#scrub_sleep)
- [scrub_list_limit](#scrub_list_limit)
- [scrub_find_best](#scrub_find_best)
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
## etcd_report_interval
@@ -91,6 +100,7 @@ OSD by hand.
- Type: seconds
- Default: 5
- Can be changed online: yes
Time interval at which automatic fsyncs/flushes are issued by each OSD when
the immediate_commit mode is disabled. fsyncs are required because without
@@ -103,6 +113,7 @@ issue fsyncs at all.
- Type: integer
- Default: 128
- Can be changed online: yes
Same as autosync_interval, but sets the maximum number of uncommitted write
operations before issuing an fsync operation internally.
@@ -111,6 +122,7 @@ operations before issuing an fsync operation internally.
- Type: integer
- Default: 4
- Can be changed online: yes
Maximum recovery operations per one primary OSD at any given moment of time.
Currently it's the only parameter available to tune the speed of recovery
@@ -120,6 +132,7 @@ and rebalancing, but it's planned to implement more.
- Type: integer
- Default: 128
- Can be changed online: yes
Number of recovery operations before switching to recovery of the next PG.
The idea is to mix all PGs during recovery for more even space and load
Degraded PGs are scanned first anyway.
- Type: integer
- Default: 16
- Can be changed online: yes
Maximum number of recovery operations before issuing an additional fsync.
@@ -145,6 +159,7 @@ the underlying device. This may be useful for recovery purposes.
- Type: boolean
- Default: false
- Can be changed online: yes
Disable automatic background recovery of objects. Note that it doesn't
affect implicit recovery of objects happening during writes - a write is
@@ -154,6 +169,7 @@ always made to a full set of at least pg_minsize OSDs.
- Type: boolean
- Default: false
- Can be changed online: yes
Disable background movement of data between different OSDs. Disabling it
means that PGs in the `has_misplaced` state will be left in it indefinitely.
@@ -162,6 +178,7 @@ means that PGs in the `has_misplaced` state will be left in it indefinitely.
- Type: seconds
- Default: 3
- Can be changed online: yes
Time interval at which OSDs print simple human-readable operation
statistics on stdout.
@@ -170,6 +187,7 @@ statistics on stdout.
- Type: seconds
- Default: 10
- Can be changed online: yes
Time interval at which OSDs dump slow or stuck operations on stdout, if
there are any. It's also the time after which an operation is considered
@@ -179,6 +197,7 @@ they're any. Also it's the time after which an operation is considered
- Type: seconds
- Default: 60
- Can be changed online: yes
Number of seconds after which a deleted inode is removed from OSD statistics.
@@ -186,6 +205,7 @@ Number of seconds after which a deleted inode is removed from OSD statistics.
- Type: integer
- Default: 128
- Can be changed online: yes
Parallel client write operation limit per one OSD. Operations that exceed
this limit are pushed to a temporary queue instead of being executed
@@ -195,6 +215,7 @@ immediately.
- Type: integer
- Default: 1
- Can be changed online: yes
A flusher is a micro-thread that moves data from the journal to the data
area of the device. The number of flushers is auto-tuned between a minimum
and a maximum; this parameter sets the minimum.
- Type: integer
- Default: 256
- Can be changed online: yes
Maximum number of journal flushers (see above min_flusher_count).
@@ -260,6 +282,7 @@ Most (99%) other SSDs don't need this option.
- Type: boolean
- Default: false
- Can be changed online: yes
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
with fast journal/metadata devices and slow data devices. The idea is that
@@ -277,6 +300,7 @@ fills up.
- Type: integer
- Default: 100
- Can be changed online: yes
Target maximum number of throttled operations per second under the condition
of full journal. Set it to approximate random write iops of your data devices
@@ -286,6 +310,7 @@ of full journal. Set it to approximate random write iops of your data devices
- Type: integer
- Default: 100
- Can be changed online: yes
Target maximum bandwidth in MB/s of throttled operations under
the condition of full journal. Set it to approximate linear write
@@ -295,6 +320,7 @@ performance of your data devices (HDDs).
- Type: integer
- Default: 1
- Can be changed online: yes
Target maximum parallelism of throttled operations under the condition of
full journal. Set it to approximate internal parallelism of your data
@@ -304,6 +330,7 @@ devices (1 for HDDs, 4-8 for SSDs).
- Type: microseconds
- Default: 50
- Can be changed online: yes
Minimal computed delay to be applied to throttled operations. Usually
doesn't need to be changed.
@@ -313,4 +340,103 @@ doesn't need to be changed.
- Type: boolean
- Default: false
Lock all OSD memory to prevent it from being unloaded into swap with mlockall(). Requires sufficient ulimit -l (max locked memory).
Lock all OSD memory to prevent it from being unloaded into swap with
mlockall(). Requires sufficient ulimit -l (max locked memory).
## auto_scrub
- Type: boolean
- Default: false
- Can be changed online: yes
Data scrubbing is the process of background verification of copies to find
and repair corrupted blocks. It's not run automatically by default since
it's a new feature. Set this parameter to true to enable automatic scrubs.
This parameter makes OSDs automatically schedule data scrubbing of clean PGs
every `scrub_interval` (see below). You can also start/schedule scrubbing
manually by setting `next_scrub` JSON key to the desired UNIX time of the
next scrub in `/pg/history/...` values in etcd.
## no_scrub
- Type: boolean
- Default: false
- Can be changed online: yes
Temporarily disable scrubbing and stop running scrubs.
## scrub_interval
- Type: string
- Default: 30d
- Can be changed online: yes
Default automatic scrubbing interval for all pools. Numbers without suffix
are treated as seconds, possible unit suffixes include 's' (seconds),
'm' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
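Parsing such interval strings is straightforward; a minimal sketch (the 30-day month and 365-day year lengths are assumptions, not taken from Vitastor's source):

```python
def parse_interval(value):
    """Parse a scrub_interval-style duration into seconds."""
    units = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400,
             'M': 30 * 86400, 'y': 365 * 86400}
    if value and value[-1] in units:
        return int(value[:-1]) * units[value[-1]]
    return int(value)  # numbers without a suffix are treated as seconds

parse_interval('30d')  # → 2592000 seconds, the default
```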
## scrub_queue_depth
- Type: integer
- Default: 1
- Can be changed online: yes
Number of parallel scrubbing operations per one OSD.
## scrub_sleep
- Type: milliseconds
- Default: 0
- Can be changed online: yes
Additional interval between two consecutive scrubbing operations on one OSD.
Can be used to slow down scrubbing if it affects user load too much.
## scrub_list_limit
- Type: integer
- Default: 1000
- Can be changed online: yes
Number of objects to list in one listing operation during scrub.
## scrub_find_best
- Type: boolean
- Default: true
- Can be changed online: yes
Find and automatically restore best versions of objects with unmatched
copies. In replicated setups, the best version is the version with most
matching replicas. In EC setups, the best version is the subset of data
and parity chunks without mismatches.
The hypothetical situation where you might want to disable it is when
you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
corrupt an object in the same way (for example, zero it out) and only
1 HDD will remain good. In this case disabling scrub_find_best may help
you to recover the data! See also scrub_ec_max_bruteforce below.
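For replicated pools, the "best version" selection described above amounts to a majority vote; a sketch of that logic (illustrative, not the actual Vitastor implementation):

```python
from collections import Counter

def best_replica(copies):
    """Return the most common replica version, or None when there is no
    unique winner - the case where the object is marked inconsistent."""
    counts = Counter(copies).most_common()
    if len(counts) == 1 or counts[0][1] > counts[1][1]:
        return counts[0][0]
    return None

best_replica(['A', 'A', 'B'])   # → 'A': the single differing copy loses
best_replica(['A', 'B', 'C'])   # → None: no best version exists
```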
## scrub_ec_max_bruteforce
- Type: integer
- Default: 100
- Can be changed online: yes
Vitastor can locate corrupted chunks in EC setups with more than 1 parity
chunk by brute-forcing all possible error locations. This configuration
value limits the maximum number of checked combinations. You can try to
increase it if you have EC N+K setup with N and K large enough for
combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
than the default 100.
If there are too many possible combinations, or if multiple combinations give
correct results, then objects are marked inconsistent and aren't recovered
automatically.
In replicated setups bruteforcing isn't needed, Vitastor just assumes that
the variant with most available equal copies is correct. For example, if
you have 3 replicas and 1 of them differs, this one is considered to be
corrupted. But if there is no "best" version with more copies than all
others have then the object is also marked as inconsistent.
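To check a given scheme against the default limit, the combination count formula above can be evaluated directly; a quick sketch (plain Python, not part of Vitastor):

```python
from math import comb

# C(N+K-1, K-1): number of error-location combinations checked during
# scrub for an EC N+K pool with K parity chunks (formula quoted above).
def scrub_combinations(n, k):
    return comb(n + k - 1, k - 1)

# EC 4+2 and 7+3 fit under the default scrub_ec_max_bruteforce = 100,
# EC 10+4 does not:
for n, k in [(4, 2), (7, 3), (10, 4)]:
    print(f"EC {n}+{k}: {scrub_combinations(n, k)} combinations")
```

So for an EC 10+4 pool the limit would need to be raised to at least 286 to allow a full brute-force search.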


@@ -8,7 +8,8 @@
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
изменения конфигурации в etcd.
- [etcd_report_interval](#etcd_report_interval)
- [run_primary](#run_primary)
@@ -39,6 +40,14 @@
- [throttle_target_parallelism](#throttle_target_parallelism)
- [throttle_threshold_us](#throttle_threshold_us)
- [osd_memlock](#osd_memlock)
- [auto_scrub](#auto_scrub)
- [no_scrub](#no_scrub)
- [scrub_interval](#scrub_interval)
- [scrub_queue_depth](#scrub_queue_depth)
- [scrub_sleep](#scrub_sleep)
- [scrub_list_limit](#scrub_list_limit)
- [scrub_find_best](#scrub_find_best)
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
## etcd_report_interval
@@ -93,6 +102,7 @@ RUNNING), подходящий под заданную маску. Также н
- Тип: секунды
- Значение по умолчанию: 5
- Можно менять на лету: да
Временной интервал отправки автоматических fsync-ов (операций очистки кэша)
каждым OSD для случая, когда режим immediate_commit отключён. fsync-и нужны
@@ -105,6 +115,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
- Тип: целое число
- Значение по умолчанию: 128
- Можно менять на лету: да
Аналогично autosync_interval, но задаёт не временной интервал, а
максимальное количество незафиксированных операций записи перед
@@ -114,6 +125,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
- Тип: целое число
- Значение по умолчанию: 4
- Можно менять на лету: да
Максимальное число операций восстановления на одном первичном OSD в любой
момент времени. На данный момент единственный параметр, который можно менять
@@ -124,6 +136,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
- Тип: целое число
- Значение по умолчанию: 128
- Можно менять на лету: да
Число операций восстановления перед переключением на восстановление другой PG.
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
@@ -135,6 +148,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
- Тип: целое число
- Значение по умолчанию: 16
- Можно менять на лету: да
Максимальное число операций восстановления перед дополнительным fsync.
@@ -150,6 +164,7 @@ OSD, чтобы успевать очищать журнал - без них OSD
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Отключить автоматическое фоновое восстановление объектов. Обратите внимание,
что эта опция не отключает восстановление объектов, происходящее при
@@ -160,6 +175,7 @@ OSD.
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Отключить фоновое перемещение объектов между разными OSD. Отключение
означает, что PG, находящиеся в состоянии `has_misplaced`, будут оставлены
@@ -169,6 +185,7 @@ OSD.
- Тип: секунды
- Значение по умолчанию: 3
- Можно менять на лету: да
Временной интервал, с которым OSD печатают простую человекочитаемую
статистику выполнения операций в стандартный вывод.
@@ -177,6 +194,7 @@ OSD.
- Тип: секунды
- Значение по умолчанию: 10
- Можно менять на лету: да
Временной интервал, с которым OSD выводят в стандартный вывод список
медленных или зависших операций, если таковые имеются. Также время, при
@@ -186,6 +204,7 @@ OSD.
- Тип: секунды
- Значение по умолчанию: 60
- Можно менять на лету: да
Число секунд, через которое удалённый инод удаляется из статистики OSD.
@@ -193,6 +212,7 @@ OSD.
- Тип: целое число
- Значение по умолчанию: 128
- Можно менять на лету: да
Максимальное число одновременных клиентских операций записи на один OSD.
Операции, превышающие этот лимит, не исполняются сразу, а сохраняются во
@@ -202,6 +222,7 @@ OSD.
- Тип: целое число
- Значение по умолчанию: 1
- Можно менять на лету: да
Flusher - это микро-поток (корутина), которая копирует данные из журнала в
основную область устройства данных. Их число настраивается динамически между
@@ -211,6 +232,7 @@ Flusher - это микро-поток (корутина), которая коп
- Тип: целое число
- Значение по умолчанию: 256
- Можно менять на лету: да
Максимальное число микро-потоков очистки журнала (см. выше min_flusher_count).
@@ -270,6 +292,7 @@ Flusher - это микро-поток (корутина), которая коп
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Разрешить мягкое ограничение скорости журналируемой записи. Полезно для
гибридных OSD с быстрыми устройствами метаданных и медленными устройствами
@@ -288,6 +311,7 @@ Flusher - это микро-поток (корутина), которая коп
- Тип: целое число
- Значение по умолчанию: 100
- Можно менять на лету: да
Расчётное максимальное число ограничиваемых операций в секунду при условии
отсутствия свободного места в журнале. Устанавливайте приблизительно равным
@@ -298,6 +322,7 @@ Flusher - это микро-поток (корутина), которая коп
- Тип: целое число
- Значение по умолчанию: 100
- Можно менять на лету: да
Расчётный максимальный размер в МБ/с ограничиваемых операций в секунду при
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
@@ -308,6 +333,7 @@ Flusher - это микро-поток (корутина), которая коп
- Тип: целое число
- Значение по умолчанию: 1
- Можно менять на лету: да
Расчётный максимальный параллелизм ограничиваемых операций в секунду при
условии отсутствия свободного места в журнале. Устанавливайте приблизительно
@@ -318,6 +344,7 @@ Flusher - это микро-поток (корутина), которая коп
- Тип: микросекунды
- Значение по умолчанию: 50
- Можно менять на лету: да
Минимальная применимая к ограничиваемым операциям задержка. Обычно не
требует изменений.
@@ -327,4 +354,113 @@ Flusher - это микро-поток (корутина), которая коп
- Тип: булево (да/нет)
- Значение по умолчанию: false
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
заблокированной памяти).
## auto_scrub
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
запускаются автоматически, так как являются новой функцией. Чтобы включить
автоматическое планирование скрабов, установите данный параметр в true.
Включённый параметр заставляет OSD автоматически планировать фоновую
проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
запустить или запланировать проверку вручную, установив значение ключа JSON
`next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
желаемой проверки.
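Например (набросок; существующие поля JSON-значения ключа истории PG нужно сохранить): чтобы запланировать проверку вручную, в значение соответствующего ключа `/pg/history/...` в etcd добавляется поле с UNIX-временем следующей проверки:

```json
{"next_scrub": 1718000000}
```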
## no_scrub
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Временно отключить и остановить запущенные скрабы.
## scrub_interval
- Тип: строка
- Значение по умолчанию: 30d
- Можно менять на лету: да
Интервал автоматической фоновой проверки по умолчанию для всех пулов.
Значения без указанной единицы измерения считаются в секундах, допустимые
символы единиц измерения в конце: 's' (секунды),
'm' (минуты), 'h' (часы), 'd' (дни), 'M' (месяцы) или 'y' (годы).
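Для иллюстрации семантики единиц измерения, набросок разбора таких значений (предположения: месяц здесь условно принят за 30 дней, год за 365; это не код Vitastor):

```python
# Набросок: число без суффикса - секунды; суффиксы s/m/h/d/M/y.
UNITS = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400,
         'M': 30 * 86400, 'y': 365 * 86400}

def parse_interval(value: str) -> int:
    if value and value[-1] in UNITS:
        return int(value[:-1]) * UNITS[value[-1]]
    return int(value)  # без суффикса: секунды

print(parse_interval('30d'))  # 2592000 секунд
```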
## scrub_queue_depth
- Тип: целое число
- Значение по умолчанию: 1
- Можно менять на лету: да
Число параллельных операций фоновой проверки на один OSD.
## scrub_sleep
- Тип: миллисекунды
- Значение по умолчанию: 0
- Можно менять на лету: да
Дополнительный интервал ожидания после фоновой проверки каждого объекта на
одном OSD. Может использоваться для замедления скраба, если он слишком
сильно влияет на пользовательскую нагрузку.
## scrub_list_limit
- Тип: целое число
- Значение по умолчанию: 1000
- Можно менять на лету: да
Размер загружаемых за одну операцию списков объектов в процессе фоновой
проверки.
## scrub_find_best
- Тип: булево (да/нет)
- Значение по умолчанию: true
- Можно менять на лету: да
Находить и автоматически восстанавливать "лучшие версии" объектов с
несовпадающими копиями/частями. При использовании репликации "лучшая"
версия - версия, доступная в большем числе экземпляров, чем другие. При
использовании кодов коррекции ошибок "лучшая" версия - это подмножество
частей данных и чётности, полностью соответствующих друг другу.
Гипотетическая ситуация, в которой вы можете захотеть отключить этот
поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
незаметно и одинаково повредить данные одного и того же объекта, например,
занулив его, и только 1 диск останется неповреждённым. В этой ситуации
отключение этого параметра поможет вам восстановить данные! Смотрите также
описание следующего параметра - scrub_ec_max_bruteforce.
## scrub_ec_max_bruteforce
- Тип: целое число
- Значение по умолчанию: 100
- Можно менять на лету: да
Vitastor старается определить повреждённые части объектов при использовании
EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
всех возможных комбинаций ошибочных частей. Данное значение конфигурации
ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
его, если используете схему кодирования EC N+K с N и K, достаточно большими
для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
было больше, чем стандартное значение 100.
Если возможных комбинаций слишком много или если корректная комбинация не
определяется однозначно, объекты помечаются неконсистентными (inconsistent)
и не восстанавливаются автоматически.
При использовании репликации перебор не нужен, Vitastor просто предполагает,
что вариант объекта с наибольшим количеством одинаковых копий корректен.
Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
считается некорректной. Однако, если "лучшую" версию с числом доступных
копий большим, чем у всех других версий, найти невозможно, то объект тоже
маркируется неконсистентным.


@@ -40,6 +40,7 @@ Parameters:
- [root_node](#root_node)
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
Examples:
@@ -272,6 +273,13 @@ Specifies OSD tags to prefer putting primary OSDs in this pool to.
Note that for EC/XOR pools Vitastor always prefers to put primary OSD on one
of the OSDs containing a data chunk for a PG.
## scrub_interval
- Type: time interval (number + unit s/m/h/d/M/y)
Automatic scrubbing interval for this pool. Overrides
[global scrub_interval setting](osd.en.md#scrub_interval).
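As an illustrative sketch, a pool definition (stored in the pool configuration in etcd) overriding the global interval could look like this; the pool ID and the surrounding fields are examples, only `scrub_interval` is the point here:

```json
{"2": {
  "name": "ecpool",
  "scheme": "ec",
  "pg_size": 4,
  "parity_chunks": 2,
  "pg_minsize": 2,
  "pg_count": 256,
  "scrub_interval": "7d"
}}
```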
# Examples
## Replicated pool


@@ -39,6 +39,7 @@
- [root_node](#root_node)
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
Примеры:
@@ -276,6 +277,13 @@ PG в Vitastor эферемерны, то есть вы можете менят
для PG этого пула. Имейте в виду, что для EC-пулов Vitastor также всегда
предпочитает помещать первичный OSD на один из OSD с данными, а не с чётностью.
## scrub_interval
- Тип: временной интервал (число + единица измерения s/m/h/d/M/y)
Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
# Примеры
## Реплицированный пул


@@ -11,13 +11,21 @@
- name: etcd_address
type: string or array of strings
type_ru: строка или массив строк
online: true
info: |
etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
Note that https is not supported for etcd connections yet.
etcd connection endpoints can be changed online by updating global
configuration in etcd itself - this allows to switch the cluster to new
etcd addresses without downtime.
info_ru: |
Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
самом etcd - это позволяет переключить кластер на новые etcd без остановки.
- name: etcd_prefix
type: string
default: "/vitastor"
@@ -31,5 +39,6 @@
- name: log_level
type: int
default: 0
online: true
info: Log level. Raise if you want more verbose output.
info_ru: Уровень логгирования. Повысьте, если хотите более подробный вывод.


@@ -14,6 +14,7 @@ const L = {
toc_config: '[Configuration](../config.en.md)',
toc_usage: 'Usage',
toc_performance: 'Performance',
online: 'Can be changed online: yes',
},
ru: {
Documentation: 'Документация',
@@ -28,6 +29,7 @@ const L = {
toc_config: '[Конфигурация](../config.ru.md)',
toc_usage: 'Использование',
toc_performance: 'Производительность',
online: 'Можно менять на лету: да',
},
};
const types = {
@@ -70,6 +72,8 @@ for (const file of params_files)
out += `- ${L[lang]['Default'] || 'Default'}: ${c.default}\n`;
if (c.min !== undefined)
out += `- ${L[lang]['Minimum'] || 'Minimum'}: ${c.min}\n`;
if (c.online)
out += `- ${L[lang]['online'] || 'Can be changed online: yes'}\n`;
out += `\n`+(c["info_"+lang] || c["info"]).replace(/\s+$/, '');
}
const head = fs.readFileSync(__dirname+'/'+file+'.'+lang+'.md', { encoding: 'utf-8' });


@@ -164,18 +164,21 @@
type: sec
min: 1
default: 5
online: true
info: Interval before attempting to reconnect to an unavailable OSD.
info_ru: Время ожидания перед повторной попыткой соединиться с недоступным OSD.
- name: peer_connect_timeout
type: sec
min: 1
default: 5
online: true
info: Timeout for OSD connection attempts.
info_ru: Максимальное время ожидания попытки соединения с OSD.
- name: osd_idle_timeout
type: sec
min: 1
default: 5
online: true
info: |
OSD connection inactivity time after which clients and other OSDs send
keepalive requests to check state of the connection.
@@ -186,6 +189,7 @@
type: sec
min: 1
default: 5
online: true
info: |
Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
within this time, the connection to it is dropped and a reconnection attempt
@@ -198,6 +202,7 @@
type: ms
min: 50
default: 500
online: true
info: |
OSDs respond to clients with a special error code when they receive I/O
requests for a PG that's not synchronized and started. This parameter sets
@@ -211,6 +216,7 @@
- name: max_etcd_attempts
type: int
default: 5
online: true
info: |
Maximum number of attempts for etcd requests which can't be retried
indefinitely.
@@ -220,6 +226,7 @@
- name: etcd_quick_timeout
type: ms
default: 1000
online: true
info: |
Timeout for etcd requests which should complete quickly, like lease refresh.
info_ru: |
@@ -228,6 +235,7 @@
- name: etcd_slow_timeout
type: ms
default: 5000
online: true
info: Timeout for etcd requests which are allowed to wait for some time.
info_ru: |
Максимальное время выполнения запросов к etcd, для которых не обязательно
@@ -235,6 +243,7 @@
- name: etcd_keepalive_timeout
type: sec
default: max(30, etcd_report_interval*2)
online: true
info: |
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
etcd_report_interval to guarantee that keepalive actually works.
@@ -244,6 +253,7 @@
- name: etcd_ws_keepalive_timeout
type: sec
default: 30
online: true
info: |
etcd websocket ping interval required to keep the connection alive and
detect disconnections quickly.
@@ -252,6 +262,7 @@
- name: client_dirty_limit
type: int
default: 33554432
online: true
info: |
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an


@@ -1,4 +1,5 @@
# Runtime OSD Parameters
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
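For example (a sketch, assuming the default `/vitastor` etcd prefix), an online-changeable parameter is applied by merging it into the JSON value of the global configuration key `/vitastor/config/global`, preserving the keys already present there:

```json
{"scrub_sleep": 100}
```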


@@ -2,4 +2,5 @@
Данные параметры используются только OSD, но, в отличие от дисковых параметров,
не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
изменения конфигурации в etcd.


@@ -66,6 +66,7 @@
- name: autosync_interval
type: sec
default: 5
online: true
info: |
Time interval at which automatic fsyncs/flushes are issued by each OSD when
the immediate_commit mode is disabled. fsyncs are required because without
@@ -83,6 +84,7 @@
- name: autosync_writes
type: int
default: 128
online: true
info: |
Same as autosync_interval, but sets the maximum number of uncommitted write
operations before issuing an fsync operation internally.
@@ -93,6 +95,7 @@
- name: recovery_queue_depth
type: int
default: 4
online: true
info: |
Maximum recovery operations per one primary OSD at any given moment of time.
Currently it's the only parameter available to tune the speed of recovery
@@ -105,6 +108,7 @@
- name: recovery_pg_switch
type: int
default: 128
online: true
info: |
Number of recovery operations before switching to recovery of the next PG.
The idea is to mix all PGs during recovery for more even space and load
@@ -119,6 +123,7 @@
- name: recovery_sync_batch
type: int
default: 16
online: true
info: Maximum number of recovery operations before issuing an additional fsync.
info_ru: Максимальное число операций восстановления перед дополнительным fsync.
- name: readonly
@@ -133,6 +138,7 @@
- name: no_recovery
type: bool
default: false
online: true
info: |
Disable automatic background recovery of objects. Note that it doesn't
affect implicit recovery of objects happening during writes - a write is
@@ -145,6 +151,7 @@
- name: no_rebalance
type: bool
default: false
online: true
info: |
Disable background movement of data between different OSDs. Disabling it
means that PGs in the `has_misplaced` state will be left in it indefinitely.
@@ -155,6 +162,7 @@
- name: print_stats_interval
type: sec
default: 3
online: true
info: |
Time interval at which OSDs print simple human-readable operation
statistics on stdout.
@@ -164,6 +172,7 @@
- name: slow_log_interval
type: sec
default: 10
online: true
info: |
Time interval at which OSDs dump slow or stuck operations on stdout, if
there are any. It is also the time after which an operation is considered
@@ -175,6 +184,7 @@
- name: inode_vanish_time
type: sec
default: 60
online: true
info: |
Number of seconds after which a deleted inode is removed from OSD statistics.
info_ru: |
@@ -182,6 +192,7 @@
- name: max_write_iodepth
type: int
default: 128
online: true
info: |
Parallel client write operation limit per one OSD. Operations that exceed
this limit are pushed to a temporary queue instead of being executed
@@ -193,6 +204,7 @@
- name: min_flusher_count
type: int
default: 1
online: true
info: |
Flusher is a micro-thread that moves data from the journal to the data
area of the device. Their number is auto-tuned between minimum and maximum.
@@ -204,6 +216,7 @@
- name: max_flusher_count
type: int
default: 256
online: true
info: |
Maximum number of journal flushers (see above min_flusher_count).
info_ru: |
@@ -284,6 +297,7 @@
- name: throttle_small_writes
type: bool
default: false
online: true
info: |
Enable soft throttling of small journaled writes. Useful for hybrid OSDs
with fast journal/metadata devices and slow data devices. The idea is that
@@ -312,6 +326,7 @@
- name: throttle_target_iops
type: int
default: 100
online: true
info: |
Target maximum number of throttled operations per second under the condition
of full journal. Set it to approximate random write iops of your data devices
@@ -324,6 +339,7 @@
- name: throttle_target_mbs
type: int
default: 100
online: true
info: |
Target maximum bandwidth in MB/s of throttled operations per second under
the condition of full journal. Set it to approximate linear write
@@ -336,6 +352,7 @@
- name: throttle_target_parallelism
type: int
default: 1
online: true
info: |
Target maximum parallelism of throttled operations under the condition of
full journal. Set it to approximate internal parallelism of your data
@@ -348,6 +365,7 @@
- name: throttle_threshold_us
type: us
default: 50
online: true
info: |
Minimal computed delay to be applied to throttled operations. Usually
doesn't need to be changed.
@@ -357,10 +375,151 @@
- name: osd_memlock
type: bool
default: false
info: |
Lock all OSD memory to prevent it from being unloaded into swap with
mlockall(). Requires sufficient ulimit -l (max locked memory).
info_ru: |
Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
в пространство подкачки. Требует достаточного значения ulimit -l (лимита
заблокированной памяти).
- name: auto_scrub
type: bool
default: false
online: true
info: |
Data scrubbing is the process of background verification of copies to find
and repair corrupted blocks. It's not run automatically by default since
it's a new feature. Set this parameter to true to enable automatic scrubs.
This parameter makes OSDs automatically schedule data scrubbing of clean PGs
every `scrub_interval` (see below). You can also start/schedule scrubbing
manually by setting `next_scrub` JSON key to the desired UNIX time of the
next scrub in `/pg/history/...` values in etcd.
info_ru: |
Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
запускаются автоматически, так как являются новой функцией. Чтобы включить
автоматическое планирование скрабов, установите данный параметр в true.
Включённый параметр заставляет OSD автоматически планировать фоновую
проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
запустить или запланировать проверку вручную, установив значение ключа JSON
`next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
желаемой проверки.
- name: no_scrub
type: bool
default: false
online: true
info: |
Temporarily disable scrubbing and stop running scrubs.
info_ru: |
Временно отключить и остановить запущенные скрабы.
- name: scrub_interval
type: string
default: 30d
online: true
info: |
Default automatic scrubbing interval for all pools. Numbers without suffix
are treated as seconds, possible unit suffixes include 's' (seconds),
'm' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
info_ru: |
Интервал автоматической фоновой проверки по умолчанию для всех пулов.
Значения без указанной единицы измерения считаются в секундах, допустимые
символы единиц измерения в конце: 's' (секунды),
'm' (минуты), 'h' (часы), 'd' (дни), 'M' (месяцы) или 'y' (годы).
- name: scrub_queue_depth
type: int
default: 1
online: true
info: |
Number of parallel scrubbing operations per one OSD.
info_ru: |
Число параллельных операций фоновой проверки на один OSD.
- name: scrub_sleep
type: ms
default: 0
online: true
info: |
Additional interval between two consecutive scrubbing operations on one OSD.
Can be used to slow down scrubbing if it affects user load too much.
info_ru: |
Дополнительный интервал ожидания после фоновой проверки каждого объекта на
одном OSD. Может использоваться для замедления скраба, если он слишком
сильно влияет на пользовательскую нагрузку.
- name: scrub_list_limit
type: int
default: 1000
online: true
info: |
Number of objects to list in one listing operation during scrub.
info_ru: |
Размер загружаемых за одну операцию списков объектов в процессе фоновой
проверки.
- name: scrub_find_best
type: bool
default: true
online: true
info: |
Find and automatically restore best versions of objects with unmatched
copies. In replicated setups, the best version is the version with most
matching replicas. In EC setups, the best version is the subset of data
and parity chunks without mismatches.
The hypothetical situation where you might want to disable it is when
you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
corrupt an object in the same way (for example, zero it out) and only
1 HDD will remain good. In this case disabling scrub_find_best may help
you to recover the data! See also scrub_ec_max_bruteforce below.
info_ru: |
Находить и автоматически восстанавливать "лучшие версии" объектов с
несовпадающими копиями/частями. При использовании репликации "лучшая"
версия - версия, доступная в большем числе экземпляров, чем другие. При
использовании кодов коррекции ошибок "лучшая" версия - это подмножество
частей данных и чётности, полностью соответствующих друг другу.
Гипотетическая ситуация, в которой вы можете захотеть отключить этот
поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
незаметно и одинаково повредить данные одного и того же объекта, например,
занулив его, и только 1 диск останется неповреждённым. В этой ситуации
отключение этого параметра поможет вам восстановить данные! Смотрите также
описание следующего параметра - scrub_ec_max_bruteforce.
- name: scrub_ec_max_bruteforce
type: int
default: 100
online: true
info: |
Vitastor can locate corrupted chunks in EC setups with more than 1 parity
chunk by brute-forcing all possible error locations. This configuration
value limits the maximum number of checked combinations. You can try to
increase it if you have an EC N+K setup with N and K large enough for
combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
than the default 100.
If there are too many possible combinations or if multiple combinations give
correct results then objects are marked inconsistent and aren't recovered
automatically.
In replicated setups bruteforcing isn't needed, Vitastor just assumes that
the variant with most available equal copies is correct. For example, if
you have 3 replicas and 1 of them differs, this one is considered to be
corrupted. But if there is no "best" version with more copies than all
others have then the object is also marked as inconsistent.
info_ru: |
Vitastor старается определить повреждённые части объектов при использовании
EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
всех возможных комбинаций ошибочных частей. Данное значение конфигурации
ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
его, если используете схему кодирования EC N+K с N и K, достаточно большими
для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
было больше, чем стандартное значение 100.
Если возможных комбинаций слишком много или если корректная комбинация не
определяется однозначно, объекты помечаются неконсистентными (inconsistent)
и не восстанавливаются автоматически.
При использовании репликации перебор не нужен, Vitastor просто предполагает,
что вариант объекта с наибольшим количеством одинаковых копий корректен.
Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
считается некорректной. Однако, если "лучшую" версию с числом доступных
копий большим, чем у всех других версий, найти невозможно, то объект тоже
маркируется неконсистентным.


@@ -31,8 +31,8 @@
- Enable elrepo-kernel:
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- RHEL 9 clones: `dnf install https://www.elrepo.org/elrepo-release-9.el9.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
## Installation requirements
@@ -45,3 +45,10 @@
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
- node.js 10 or newer
## Version archive
All previous Vitastor and other components (QEMU, etcd...) package builds
can be found here:
https://vitastor.io/archive/


@@ -22,13 +22,16 @@
- Добавьте в систему репозиторий Vitastor:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
- AlmaLinux 9 и другие клоны RHEL 9 (Rocky, Oracle...): `dnf install https://vitastor.io/rpms/centos/9/vitastor-release.rpm`
- Включите EPEL: `yum/dnf install epel-release`
- Включите дополнительные репозитории CentOS:
- CentOS 7: `yum install centos-release-scl`
- CentOS 8: `dnf install centos-release-advanced-virtualization`
- Клоны RHEL 9: не нужно
- Включите elrepo-kernel:
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Клоны RHEL 9: `dnf install https://www.elrepo.org/elrepo-release-9.el9.elrepo.noarch.rpm`
- Установите пакеты: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
## Установочные требования
@@ -41,3 +44,10 @@
- etcd 3.4.15 или новее. Более старые версии не будут работать из-за разных багов,
например, [#12402](https://github.com/etcd-io/etcd/pull/12402).
- node.js 10 или новее
## Архив предыдущих версий
Все предыдущие сборки пакетов Vitastor и других компонентов, таких, как QEMU
и etcd, можно скачать по следующей ссылке:
https://vitastor.io/archive/


@@ -29,6 +29,7 @@
- Snapshots and copy-on-write image clones
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
- [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)
## Plugins and tools
@@ -54,7 +55,6 @@ The following features are planned for the future:
- iSCSI proxy
- Multi-threaded client
- Faster failover
- Scrubbing without checksums (verification of replicas)
- Checksums
- Tiered storage (SSD caching)
- NVDIMM support


@@ -31,6 +31,7 @@
- Снапшоты и copy-on-write клоны
- [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
- [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)
## Драйверы и инструменты
@@ -54,7 +55,6 @@
- iSCSI-прокси
- Многопоточный клиент
- Более быстрое переключение при отказах
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
- Контрольные суммы
- Поддержка SSD-кэширования (tiered storage)
- Поддержка NVDIMM


@@ -20,6 +20,8 @@ It supports the following commands:
- [flatten](#flatten)
- [rm-data](#rm-data)
- [merge-data](#merge-data)
- [describe](#describe)
- [fix](#fix)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
@@ -174,6 +176,51 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
`<to>` must be a child of `<from>` and `<target>` may be one of the layers between
`<from>` and `<to>`, including `<from>` and `<to>`.
## describe
`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
[--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
[--min-offset <offset>] [--max-offset <offset>]`
Describe unclean object locations in the cluster.
```
--osds <osds>
Only list objects from primary OSD(s) <osds>.
--object-state <states>
Only list objects in given state(s). State(s) may include:
degraded, misplaced, incomplete, corrupted, inconsistent.
--pool <pool name or number>
Only list objects in the given pool.
--inode, --min-inode, --max-inode
Restrict listing to specific inode numbers.
--min-offset, --max-offset
Restrict listing to specific offsets inside inodes.
```
## fix
`vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]`
Fix inconsistent objects in the cluster by deleting some copies.
```
--objects <objects>
Objects to fix, either in plain text or JSON format. If not specified,
object list will be read from STDIN in one of the same formats.
Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...
JSON format: [{"inode":"0x...","stripe":"0x..."},...]
--bad-osds <osds>
Remove inconsistent copies/parts of objects from these OSDs, effectively
marking them bad and allowing Vitastor to recover objects from other copies.
--part <number>
Only remove EC part <number> (from 0 to pg_size-1), required for extreme
edge cases where one OSD has multiple parts of an EC object.
--check no
Do not recheck that requested objects are actually inconsistent,
delete requested copies/parts anyway.
```
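The two accepted object-list formats carry the same data and are mechanically convertible. For example, a small Python helper (hypothetical, not part of vitastor-cli) that turns the plain-text form into the JSON form:

```python
import json
import re

def objects_to_json(plain: str) -> str:
    """Convert '0x<inode>:0x<stripe>' pairs (any delimiter) to the JSON form."""
    pairs = re.findall(r'(0x[0-9a-fA-F]+):(0x[0-9a-fA-F]+)', plain)
    return json.dumps([{"inode": ino, "stripe": stripe} for ino, stripe in pairs])
```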
## alloc-osd
`vitastor-cli alloc-osd`

View File

@@ -184,6 +184,59 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
into the target image `<target>`. `<to>` must be a child image of `<from>`, and `<target>`
must be one of the layers between `<from>` and `<to>`, including `<from>` and `<to>` themselves.
## describe
`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
[--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
[--min-offset <offset>] [--max-offset <offset>]`
Describe the state of "dirty" objects in the cluster, that is, objects whose copies
or parts are stored on a set of OSDs different from the target set.
```
--osds <osds>
Only list objects from primary OSD(s) <osds>.
--object-state <states>
Only list objects in the given state(s). Possible object states:
- degraded - degraded redundancy
- misplaced - stored on a non-target set of OSDs
- incomplete - unreadable because more parts are lost than is tolerable
- corrupted - one or more parts are corrupted
- inconsistent - inconsistent, with ambiguous differences between copies/parts
--pool <pool name or ID>
Only list objects from the given pool.
--inode, --min-inode, --max-inode
Only list objects with the given inode (image) numbers.
--min-offset, --max-offset
Only list objects at the given offsets inside images.
```
## fix
`vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <number>] [--check no]`
Fix inconsistent (ambiguous) objects by deleting some of their copies.
```
--objects <objects>
Objects to fix, in plain text or JSON format. If the option is not specified,
the object list is read from standard input in the same formats.
Plain format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...
JSON format: [{"inode":"0x<inode>","stripe":"0x<stripe>"},...]
--bad-osds <osds>
Remove inconsistent copies/parts of objects from the given OSDs, thereby
acknowledging the loss of these copies and allowing Vitastor to recover the
objects from other copies.
--part <number>
Only remove EC parts with the given number (from 0 to pg_size-1). Needed only
in rare edge cases when the same OSD stores multiple parts of the same
EC object.
--check no
Do not recheck that the given objects are actually inconsistent and simply
delete the given copies/parts.
```
## alloc-osd
`vitastor-cli alloc-osd`

View File

@@ -25,6 +25,23 @@ It will output a block device name like /dev/nbd0 which you can then use as a no
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
Additional options for map command:
* `--nbd_timeout 30` \
Timeout for I/O operations in seconds after exceeding which the kernel stops
the device. You can set it to 0 to disable the timeout, but beware that you
won't be able to stop the device at all if vitastor-nbd process dies.
* `--nbd_max_devices 64 --nbd_max_part 3` \
Options for the `nbd` kernel module when modprobing it (`nbds_max` and `max_part`).
Note that the maximum allowed (nbds_max)*(1+max_part) is 256.
* `--logfile /path/to/log/file.txt` \
Write log messages to the specified file instead of dropping them (in background mode)
or printing them to the standard output (in foreground mode).
* `--dev_num N` \
Use the specified device /dev/nbdN instead of automatic selection.
* `--foreground 1` \
Stay in foreground, do not daemonize.
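The `nbds_max`/`max_part` constraint above can be checked before modprobing. A hedged Python sketch (the function name is illustrative, not a real vitastor-nbd helper):

```python
def nbd_device_limit(nbds_max: int, max_part: int) -> int:
    """Total NBD minors the kernel module will create: nbds_max * (1 + max_part).

    The kernel allows at most 256 minors in total.
    """
    total = nbds_max * (1 + max_part)
    if total > 256:
        raise ValueError("nbds_max*(1+max_part) must not exceed 256, got %d" % total)
    return total
```

The default `--nbd_max_devices 64 --nbd_max_part 3` lands exactly on the limit: 64*(1+3) = 256.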
## Unmap image
To unmap the device run:
@@ -32,3 +49,27 @@ To unmap the device run:
```
vitastor-nbd unmap /dev/nbd0
```
## List mapped images
```
vitastor-nbd ls [--json]
```
Example output (normal format):
```
/dev/nbd0
image: bench
pid: 584536
/dev/nbd1
image: bench1
pid: 584546
```
Example output (JSON format):
```
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
```
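The JSON form carries the same information as the plain listing. A small illustrative Python helper (not part of vitastor-nbd) that renders the JSON output in the plain format:

```python
import json

def format_nbd_ls(json_text: str) -> str:
    """Render `vitastor-nbd ls --json` output in the plain list format."""
    devices = json.loads(json_text)
    lines = []
    for dev, info in devices.items():
        lines.append(dev)
        for key, value in info.items():
            lines.append(f"{key}: {value}")
    return "\n".join(lines)
```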

View File

@@ -30,6 +30,27 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
To address images by inode number, as with other commands, you can use the options
`--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image testimg`.
Additional options for the NBD map command:
* `--nbd_timeout 30` \
Maximum time in seconds for any read/write operation, after exceeding which
the kernel stops the NBD device. You can set the option to 0 to disable the
timeout, but beware that in this case you won't be able to unmap the NBD
device at all if the vitastor-nbd process dies abnormally.
* `--nbd_max_devices 64 --nbd_max_part 3` \
Options passed to the nbd kernel module if it is loaded by vitastor-nbd
(`nbds_max` and `max_part`). Note that (nbds_max)*(1+max_part) usually
should not exceed 256.
* `--logfile /path/to/log/file.txt` \
Write log messages to the given file instead of dropping them when running
in background mode, or printing them to the standard output when running
in the console with `--foreground 1`.
* `--dev_num N` \
Use the given device `/dev/nbdN` instead of automatic selection.
* `--foreground 1` \
Stay in foreground, do not daemonize.
## Unmap the device
To unmap the device, run:
@@ -37,3 +58,27 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
```
vitastor-nbd unmap /dev/nbd0
```
## List mapped devices
```
vitastor-nbd ls [--json]
```
Example output (plain format):
```
/dev/nbd0
image: bench
pid: 584536
/dev/nbd1
image: bench1
pid: 584546
```
Example output (JSON format):
```
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
```

View File

@@ -13,7 +13,7 @@ for (let i = 2; i < process.argv.length; i++)
{
console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' [--verbose 1]'+
' [--etcd_address "http://127.0.0.1:2379,..."] [--config_path /etc/vitastor/vitastor.conf]'+
' [--etcd_prefix "/vitastor"] [--etcd_start_timeout 5] [--restart_interval 5]');
' [--etcd_prefix "/vitastor"] [--etcd_start_timeout 5]');
process.exit();
}
else if (process.argv[i].substr(0, 2) == '--')

View File

@@ -104,12 +104,21 @@ const etcd_tree = {
autosync_writes: 128,
client_queue_depth: 128, // unused
recovery_queue_depth: 4,
recovery_pg_switch: 128,
recovery_sync_batch: 16,
no_recovery: false,
no_rebalance: false,
print_stats_interval: 3,
slow_log_interval: 10,
inode_vanish_time: 60,
auto_scrub: false,
no_scrub: false,
scrub_interval: '30d', // 1s/1m/1h/1d
scrub_queue_depth: 1,
scrub_sleep: 0, // milliseconds
scrub_list_limit: 1000, // objects to list on one scrub iteration
scrub_find_best: true,
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterators
// blockstore - fixed in superblock
block_size,
disk_alignment,
@@ -172,6 +181,8 @@ const etcd_tree = {
osd_tags?: 'nvme' | [ 'nvme', ... ],
// prefer to put primary on OSD with these tags
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
// scrub interval
scrub_interval?: '30d',
},
...
}, */
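The `scrub_interval` values above use suffixed durations (1s/1m/1h/1d, as the comment notes). A minimal sketch of parsing such values, assuming a bare number means seconds — illustrative only, not Vitastor's actual parser:

```python
UNITS = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}

def parse_interval(value: str) -> int:
    """Parse '30d'-style intervals (suffixes s/m/h/d) into seconds."""
    if value and value[-1] in UNITS:
        return int(value[:-1]) * UNITS[value[-1]]
    return int(value)  # assume a bare number already means seconds
```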
@@ -267,7 +278,7 @@ const etcd_tree = {
primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[],
"has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
}
}, */
},
@@ -289,6 +300,7 @@ const etcd_tree = {
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint64_t,
next_scrub: uint64_t,
},
}, */
},
@@ -561,7 +573,7 @@ class Mon
}
if (!this.ws)
{
await this.die('Failed to open etcd watch websocket');
this.die('Failed to open etcd watch websocket');
}
const cur_addr = this.selected_etcd_url;
this.ws_alive = true;
@@ -728,7 +740,7 @@ class Mon
const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
if (!res.result.TTL)
{
await this.die('Lease expired');
this.die('Lease expired');
}
}, this.config.etcd_mon_timeout);
if (!this.signals_set)
@@ -741,32 +753,9 @@ class Mon
async on_stop(status)
{
if (this.ws_keepalive_timer)
{
clearInterval(this.ws_keepalive_timer);
this.ws_keepalive_timer = null;
}
if (this.lease_timer)
{
clearInterval(this.lease_timer);
this.lease_timer = null;
}
if (this.etcd_lease_id)
{
const lease_id = this.etcd_lease_id;
this.etcd_lease_id = null;
await this.etcd_call('/lease/revoke', { ID: lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
}
if (!status || !this.initConfig.restart_interval)
{
process.exit(status);
}
else
{
console.log('Restarting after '+this.initConfig.restart_interval+' seconds');
await new Promise(ok => setTimeout(ok, this.initConfig.restart_interval*1000));
await this.start();
}
clearInterval(this.lease_timer);
await this.etcd_call('/lease/revoke', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
process.exit(status);
}
async become_master()
@@ -871,7 +860,7 @@ class Mon
}
for (const node_id in tree)
{
if (node_id === '')
if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
{
continue;
}
@@ -981,7 +970,7 @@ class Mon
save_new_pgs_txn(save_to, request, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
{
const aff_osds = this.get_affinity_osds(this.state.config.pools[pool_id], up_osds, osd_tree);
const aff_osds = this.get_affinity_osds(this.state.config.pools[pool_id] || {}, up_osds, osd_tree);
const pg_items = {};
this.reset_rng();
new_pgs.map((osd_set, i) =>
@@ -1794,13 +1783,14 @@ class Mon
return res.json;
}
}
await this.die();
this.die();
}
async _die(err)
_die(err)
{
// In fact we can just try to rejoin
console.error(new Error(err || 'Cluster connection failed'));
await this.on_stop(1);
process.exit(1);
}
local_ips(all)
@@ -1845,7 +1835,7 @@ function POST(url, body, timeout)
clearTimeout(timer_id);
let res_body = '';
res.setEncoding('utf8');
res.on('error', no);
res.on('error', (error) => ok({ error }));
res.on('data', chunk => { res_body += chunk; });
res.on('end', () =>
{
@@ -1865,8 +1855,8 @@ function POST(url, body, timeout)
}
});
});
req.on('error', no);
req.on('close', () => no(new Error('Connection closed prematurely')));
req.on('error', (error) => ok({ error }));
req.on('close', () => ok({ error: new Error('Connection closed prematurely') }));
req.write(body_text);
req.end();
});

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.8'
VERSION = '0.9.0'
LOG = logging.getLogger(__name__)

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.8/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.8$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.9.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.0$(rpm --eval '%dist').tar.gz *

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.8.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.9.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.8
Version: 0.9.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.8.el7.tar.gz
Source0: vitastor-0.9.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.8.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.9.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.8
Version: 0.9.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.8.el8.tar.gz
Source0: vitastor-0.9.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.8.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.9.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.8
Version: 0.9.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.8.el9.tar.gz
Source0: vitastor-0.9.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -73,7 +73,7 @@ Vitastor library headers for development.
Summary: Vitastor - fio drivers
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.27-7.el9
Requires: fio = 3.27-8.el9
%description -n vitastor-fio

View File

@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.8")
add_definitions(-DVERSION="0.9.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
add_executable(vitastor-osd
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
osd_cluster.cpp osd_rmw.cpp
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_primary_describe.cpp
)
target_link_libraries(vitastor-osd
vitastor_common
@@ -141,6 +141,8 @@ add_library(vitastor_client SHARED
cli_common.cpp
cli_alloc_osd.cpp
cli_status.cpp
cli_describe.cpp
cli_fix.cpp
cli_df.cpp
cli_ls.cpp
cli_create.cpp
@@ -299,7 +301,7 @@ add_executable(test_cluster_client
EXCLUDE_FROM_ALL
test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
)
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)

View File

@@ -73,7 +73,10 @@ Input:
write request is copied into the metadata area bitwise and stored there.
Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
- retval = number of bytes actually read/written or negative error number
-EINVAL = invalid input parameters
-ENOENT = requested object/version does not exist for reads
-ENOSPC = no space left in the store for writes
- version = the version actually read or written
## BS_OP_DELETE
@@ -122,11 +125,14 @@ Output:
Get a list of all objects in this Blockstore.
Input:
- oid.stripe = PG alignment
- len = PG count or 0 to list all objects
- offset = PG number
- oid.inode = min inode number or 0 to list all inodes
- version = max inode number or 0 to list all inodes
- pg_alignment = PG alignment
- pg_count = PG count or 0 to list all objects
- pg_number = PG number
- list_stable_limit = max number of clean objects in the reply
it's guaranteed that dirty objects are returned from the same interval,
i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
- min_oid = min inode/stripe or 0 to list all objects
- max_oid = max inode/stripe or 0 to list all objects
Output:
- retval = total obj_ver_id count
@@ -143,10 +149,27 @@ struct blockstore_op_t
uint64_t opcode;
// finish callback
std::function<void (blockstore_op_t*)> callback;
object_id oid;
uint64_t version;
uint32_t offset;
uint32_t len;
union __attribute__((__packed__))
{
// R/W
struct __attribute__((__packed__))
{
object_id oid;
uint64_t version;
uint32_t offset;
uint32_t len;
};
// List
struct __attribute__((__packed__))
{
object_id min_oid;
object_id max_oid;
uint32_t pg_alignment;
uint32_t pg_count;
uint32_t pg_number;
uint32_t list_stable_limit;
};
};
void *buf;
void *bitmap;
int retval;
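With `__attribute__((__packed__))`, the anonymous union occupies the size of its larger member. Assuming `object_id` packs two uint64 fields (inode and stripe), the member sizes work out as follows — illustrative arithmetic, not a binary-compatibility guarantee:

```python
# Field sizes in bytes under __attribute__((__packed__)).
OBJECT_ID = 8 + 8                   # inode + stripe, both uint64
RW_MEMBER = OBJECT_ID + 8 + 4 + 4   # oid + version + offset + len
LIST_MEMBER = 2 * OBJECT_ID + 4 * 4 # min_oid + max_oid + four uint32 fields
UNION_SIZE = max(RW_MEMBER, LIST_MEMBER)
```

So the list member dominates: the union is 16 bytes larger than the old flat R/W fields alone.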

View File

@@ -536,14 +536,27 @@ resume_1:
return false;
}
// zero out old metadata entry
{
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %lu (%lx:%lx v%lu) as old location of %lx:%lx\n",
old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
old_entry->version, cur.oid.inode, cur.oid.stripe);
exit(1);
}
}
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
);
wait_count++;
if (meta_old.sector != meta_new.sector)
{
await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
);
wait_count++;
}
}
if (has_delete)
{

View File

@@ -462,11 +462,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
uint32_t list_pg = op->offset+1;
uint32_t pg_count = op->len;
uint64_t pg_stripe_size = op->oid.stripe;
uint64_t min_inode = op->oid.inode;
uint64_t max_inode = op->version;
uint32_t list_pg = op->pg_number+1;
uint32_t pg_count = op->pg_count;
uint64_t pg_stripe_size = op->pg_alignment;
uint64_t min_inode = op->min_oid.inode;
uint64_t max_inode = op->max_oid.inode;
// Check PG
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
{
@@ -513,7 +513,13 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
stable_alloc += clean_db.size();
}
}
else
if (op->list_stable_limit > 0)
{
stable_alloc = op->list_stable_limit;
if (stable_alloc > 1024*1024)
stable_alloc = 1024*1024;
}
if (stable_alloc < 32768)
{
stable_alloc = 32768;
}
@@ -524,22 +530,22 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
FINISH_OP(op);
return;
}
auto max_oid = op->max_oid;
bool limited = false;
pool_pg_id_t last_shard_id = 0;
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
clean_it = clean_db.lower_bound({
.inode = min_inode,
.stripe = 0,
});
clean_end = clean_db.upper_bound({
.inode = max_inode,
.stripe = UINT64_MAX,
});
clean_it = clean_db.lower_bound(op->min_oid);
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
clean_end = clean_db.upper_bound(max_oid);
}
for (; clean_it != clean_end; clean_it++)
{
@@ -558,11 +564,29 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
.oid = clean_it->first,
.version = clean_it->second.version,
};
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
if (!limited)
{
limited = true;
max_oid = stable[stable_count-1].oid;
}
break;
}
}
if (op->list_stable_limit > 0)
{
// To maintain the order, we have to include objects in the same range from other shards
if (last_shard_id != 0 && last_shard_id != shard_it->first)
std::sort(stable, stable+stable_count);
if (stable_count > op->list_stable_limit)
stable_count = op->list_stable_limit;
}
last_shard_id = shard_it->first;
}
if (first_shard != last_shard)
if (op->list_stable_limit == 0 && first_shard != last_shard)
{
// If that's not a per-PG listing, sort clean entries
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
std::sort(stable, stable+stable_count);
}
int clean_stable_count = stable_count;
@@ -571,20 +595,17 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
obj_ver_id *unstable = NULL;
{
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
dirty_it = dirty_db.lower_bound({
.oid = {
.inode = min_inode,
.stripe = 0,
},
.oid = op->min_oid,
.version = 0,
});
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
dirty_end = dirty_db.upper_bound({
.oid = {
.inode = max_inode,
.stripe = UINT64_MAX,
},
.oid = max_oid,
.version = UINT64_MAX,
});
}
@@ -628,6 +649,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
stable[stable_count++] = dirty_it->first;
}
}
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
// Stop here
break;
}
}
else
{
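In effect, `list_stable_limit` asks for the first N objects in global order even though clean entries live in per-PG shards. In isolation that is equivalent to merging the sorted shard listings and truncating, as the following simplified Python model shows (the real C++ instead re-sorts after each shard and clamps `max_oid` incrementally):

```python
import heapq

def list_limited(shards, limit):
    """Return the first `limit` OIDs across several already-sorted shard listings."""
    merged = []
    for oid in heapq.merge(*shards):  # lazily merges the sorted inputs
        merged.append(oid)
        if len(merged) >= limit:
            break
    return merged
```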

View File

@@ -790,7 +790,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
unstab = unstab < ov.version ? ov.version : unstab;
if (je->type == JE_SMALL_WRITE_INSTANT)
{
bs->mark_stable(ov, true);
bs->mark_stable(ov);
}
}
}
@@ -890,7 +890,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
unstab = unstab < ov.version ? ov.version : unstab;
if (je->type == JE_BIG_WRITE_INSTANT)
{
bs->mark_stable(ov, true);
bs->mark_stable(ov);
}
}
}
@@ -904,7 +904,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->stable.oid,
.version = je->stable.version,
};
bs->mark_stable(ov, true);
bs->mark_stable(ov);
}
else if (je->type == JE_ROLLBACK)
{
@@ -961,7 +961,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
bs->journal.used_sectors[proc_pos]++;
// Deletions are treated as immediately stable, because
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
bs->mark_stable(ov, true);
bs->mark_stable(ov);
}
// Ignore delete if neither preceding dirty entries nor the clean one are present
}

View File

@@ -124,10 +124,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
if (!clean_found && !dirty_found)
{
// region is not allocated - return zeroes
memset(read_op->buf, 0, read_op->len);
read_op->version = 0;
read_op->retval = read_op->len;
read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
}
@@ -142,12 +140,18 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
bool version_ok = !IS_IN_FLIGHT(dirty.state) && read_op->version >= dirty_it->first.version;
if (IS_SYNCED(dirty.state))
{
if (!version_ok && read_op->version != 0)
read_op->version = dirty_it->first.version;
version_ok = true;
}
if (version_ok)
{
if (IS_DELETE(dirty.state))
{
assert(!result_version);
read_op->version = 0;
read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
}
if (!result_version)
{
result_version = dirty_it->first.version;
@@ -234,12 +238,19 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
}
}
else if (fulfilled < read_op->len)
if (!result_version)
{
// fill remaining parts with zeroes
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
// May happen if there are entries in dirty_db but all of them are !version_ok
read_op->version = 0;
read_op->retval = -ENOENT;
FINISH_OP(read_op);
return 2;
}
if (fulfilled < read_op->len)
{
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
assert(fulfilled == read_op->len);
}
assert(fulfilled == read_op->len);
read_op->version = result_version;
if (!PRIV(read_op)->pending_ops)
{

View File

@@ -179,7 +179,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
{
object_id oid = dirty_it->first.oid;
#ifdef BLOCKSTORE_DEBUG
printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
printf("Unblock writes-after-delete %lx:%lx v%lu\n", oid.inode, oid.stripe, dirty_it->first.version);
#endif
dirty_it = dirty_end;
// Unblock operations blocked by delete flushing

View File

@@ -103,7 +103,7 @@ blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->buf = NULL;
sync_op->callback = [this](blockstore_op_t *sync_op)
sync_op->callback = [](blockstore_op_t *sync_op)
{
delete sync_op;
};
@@ -244,7 +244,7 @@ int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_
// Make a wrapped callback
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
auto cb = [this, op, good_items = good_vers.items,
auto cb = [op, good_items = good_vers.items,
bad_items = bad_vers.items, split_op_counter,
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
{

View File

@@ -6,7 +6,7 @@
bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{
// Check or assign version number
bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
bool found = false, deleted = false, unsynced = false, is_del = (op->opcode == BS_OP_DELETE);
bool wait_big = false, wait_del = false;
void *bmp = NULL;
uint64_t version = 1;
@@ -26,6 +26,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
found = true;
version = dirty_it->first.version + 1;
deleted = IS_DELETE(dirty_it->second.state);
unsynced = !IS_SYNCED(dirty_it->second.state);
wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
? !IS_SYNCED(dirty_it->second.state)
@@ -81,10 +82,28 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
wait_del = true;
PRIV(op)->real_version = op->version;
op->version = version;
flusher->unshift_flush((obj_ver_id){
.oid = op->oid,
.version = version-1,
}, true);
if (unsynced)
{
// Issue an additional sync so the delete reaches the journal
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->callback = [this, op](blockstore_op_t *sync_op)
{
flusher->unshift_flush((obj_ver_id){
.oid = op->oid,
.version = op->version-1,
}, true);
delete sync_op;
};
enqueue_op(sync_op);
}
else
{
flusher->unshift_flush((obj_ver_id){
.oid = op->oid,
.version = version-1,
}, true);
}
}
else
{

View File

@@ -73,6 +73,37 @@ static const char* help_text =
" <to> must be a child of <from> and <target> may be one of the layers between\n"
" <from> and <to>, including <from> and <to>.\n"
"\n"
"vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>] [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>] [--min-offset <offset>] [--max-offset <offset>]\n"
" Describe unclean object locations in the cluster.\n"
" --osds <osds>\n"
" Only list objects from primary OSD(s) <osds>.\n"
" --object-state <states>\n"
" Only list objects in given state(s). State(s) may include:\n"
" degraded, misplaced, incomplete, corrupted, inconsistent.\n"
" --pool <pool name or number>\n"
" Only list objects in the given pool.\n"
" --inode, --min-inode, --max-inode\n"
" Restrict listing to specific inode numbers.\n"
" --min-offset, --max-offset\n"
" Restrict listing to specific offsets inside inodes.\n"
"\n"
"vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]\n"
" Fix inconsistent objects in the cluster by deleting some copies.\n"
" --objects <objects>\n"
" Objects to fix, either in plain text or JSON format. If not specified,\n"
" object list will be read from STDIN in one of the same formats.\n"
" Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...\n"
" JSON format: [{\"inode\":\"0x...\",\"stripe\":\"0x...\"},...]\n"
" --bad-osds <osds>\n"
" Remove inconsistent copies/parts of objects from these OSDs, effectively\n"
" marking them bad and allowing Vitastor to recover objects from other copies.\n"
" --part <number>\n"
" Only remove EC part <number> (from 0 to pg_size-1), required for extreme\n"
" edge cases where one OSD has multiple parts of an EC object.\n"
" --check no\n"
" Do not recheck that requested objects are actually inconsistent,\n"
" delete requested copies/parts anyway.\n"
"\n"
"vitastor-cli alloc-osd\n"
" Allocate a new OSD number and reserve it by creating empty /osd/stats/<n> key.\n"
"\n"
@@ -168,6 +199,7 @@ static json11::Json::object parse_args(int narg, const char *args[])
static int run(cli_tool_t *p, json11::Json::object cfg)
{
cli_result_t result = {};
p->is_command_line = true;
p->parse_config(cfg);
json11::Json::array cmd = cfg["command"].array_items();
cfg.erase("command");
@@ -276,6 +308,16 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
}
action_cb = p->start_rm(cfg);
}
else if (cmd[0] == "describe")
{
// Describe unclean objects
action_cb = p->start_describe(cfg);
}
else if (cmd[0] == "fix")
{
// Fix inconsistent objects (by deleting some copies)
action_cb = p->start_fix(cfg);
}
else if (cmd[0] == "alloc-osd")
{
// Allocate a new OSD number

View File

@@ -34,12 +34,12 @@ public:
bool list_first = false;
bool json_output = false;
int log_level = 0;
bool is_command_line = false;
bool color = false;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
bool no_recovery = false, no_rebalance = false, readonly = false;
int waiting = 0;
cli_result_t etcd_err;
@@ -56,6 +56,8 @@ public:
friend struct snap_remover_t;
std::function<bool(cli_result_t &)> start_status(json11::Json);
std::function<bool(cli_result_t &)> start_describe(json11::Json);
std::function<bool(cli_result_t &)> start_fix(json11::Json);
std::function<bool(cli_result_t &)> start_df(json11::Json);
std::function<bool(cli_result_t &)> start_ls(json11::Json);
std::function<bool(cli_result_t &)> start_create(json11::Json);

256
src/cli_describe.cpp Normal file
View File

@@ -0,0 +1,256 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "cli_fix.h"
#include "cluster_client.h"
#include "pg_states.h"
#include "str_util.h"
std::vector<uint64_t> parse_uint64_list(json11::Json val)
{
std::vector<uint64_t> ret;
if (val.is_number())
ret.push_back(val.uint64_value());
else if (val.is_string())
{
const std::string & s = val.string_value();
for (int i = 0, p = -1; i <= s.size(); i++)
{
if (p < 0 && i < s.size() && (isdigit(s[i]) || s[i] == 'x'))
p = i;
else if (p >= 0 && (i >= s.size() || !isdigit(s[i]) && s[i] != 'x'))
{
ret.push_back(stoull_full(s.substr(p, i-p), 0));
p = -1;
}
}
}
else if (val.is_array())
{
for (auto & pg_num: val.array_items())
ret.push_back(pg_num.uint64_value());
}
return ret;
}
struct cli_describe_t
{
uint64_t object_state = 0;
pool_id_t only_pool = 0;
std::vector<uint64_t> only_osds;
uint64_t min_inode = 0, max_inode = 0;
uint64_t min_offset = 0, max_offset = 0;
cli_tool_t *parent = NULL;
int state = 0;
int count = 0;
json11::Json options;
cli_result_t result;
json11::Json::array describe_items;
bool is_done()
{
return state == 100;
}
void parse_options(json11::Json cfg)
{
only_pool = cfg["pool"].uint64_value();
if (!only_pool && cfg["pool"].is_string())
{
for (auto & pp: parent->cli->st_cli.pool_config)
{
if (pp.second.name == cfg["pool"].string_value())
{
only_pool = pp.first;
break;
}
}
}
min_inode = cfg["inode"].uint64_value();
if (min_inode)
{
if (!INODE_POOL(min_inode))
min_inode |= (uint64_t)only_pool << (64-POOL_ID_BITS);
max_inode = min_inode;
min_offset = max_offset = 0;
}
else
{
min_inode = stoull_full(cfg["min_inode"].string_value(), 0); // to support 0x...
max_inode = stoull_full(cfg["max_inode"].string_value(), 0);
min_offset = stoull_full(cfg["min_offset"].string_value(), 0);
max_offset = stoull_full(cfg["max_offset"].string_value(), 0);
if (!min_inode && !max_inode && only_pool)
{
min_inode = (uint64_t)only_pool << (64-POOL_ID_BITS);
max_inode = ((uint64_t)only_pool << (64-POOL_ID_BITS)) |
(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
}
}
only_osds = parse_uint64_list(cfg["osds"]);
object_state = stoull_full(cfg["object_state"].string_value(), 0);
if (!object_state && cfg["object_state"].is_string())
{
if (cfg["object_state"].string_value().find("inconsistent") != std::string::npos)
object_state |= OBJ_INCONSISTENT;
if (cfg["object_state"].string_value().find("corrupted") != std::string::npos)
object_state |= OBJ_CORRUPTED;
if (cfg["object_state"].string_value().find("incomplete") != std::string::npos)
object_state |= OBJ_INCOMPLETE;
if (cfg["object_state"].string_value().find("degraded") != std::string::npos)
object_state |= OBJ_DEGRADED;
if (cfg["object_state"].string_value().find("misplaced") != std::string::npos)
object_state |= OBJ_MISPLACED;
}
}
void loop()
{
if (state == 1)
goto resume_1;
if (state == 100)
return;
parse_options(options);
if (min_inode && !INODE_POOL(min_inode))
{
result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
state = 100;
return;
}
if (!only_osds.size())
{
uint64_t min_pool = min_inode >> (64-POOL_ID_BITS);
uint64_t max_pool = max_inode >> (64-POOL_ID_BITS);
for (auto & pp: parent->cli->st_cli.pool_config)
{
if (pp.first >= min_pool && (!max_pool || pp.first <= max_pool))
{
for (auto & pgp: pp.second.pg_config)
only_osds.push_back(pgp.second.cur_primary);
}
}
}
remove_duplicates(only_osds);
parent->cli->init_msgr();
if (parent->json_output && parent->is_command_line)
{
printf("[\n");
}
for (int i = 0; i < only_osds.size(); i++)
{
osd_op_t *op = new osd_op_t;
op->req = (osd_any_op_t){
.describe = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_DESCRIBE,
},
.object_state = object_state,
.min_inode = min_inode,
.min_offset = min_offset,
.max_inode = max_inode,
.max_offset = max_offset,
},
};
op->callback = [this, osd_num = only_osds[i]](osd_op_t *op)
{
if (op->reply.hdr.retval < 0)
{
fprintf(
stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n",
osd_num, op->reply.hdr.retval
);
}
else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
{
fprintf(
stderr, "Invalid response size from OSD %lu (expected %lu bytes, got %lu bytes)\n",
osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
);
}
else
{
osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
for (int i = 0; i < op->reply.hdr.retval; i++)
{
if (!parent->json_output || parent->is_command_line)
{
#define FMT "{\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"part\":%u,\"osd_num\":%lu%s%s%s}"
printf(
(parent->json_output
? (count > 0 ? ",\n " FMT : " " FMT)
: "%lx:%lx part %u on OSD %lu%s%s%s\n"),
#undef FMT
items[i].inode, items[i].stripe,
items[i].role, items[i].osd_num,
(items[i].loc_bad & LOC_CORRUPTED ? (parent->json_output ? ",\"corrupted\":true" : " corrupted") : ""),
(items[i].loc_bad & LOC_INCONSISTENT ? (parent->json_output ? ",\"inconsistent\":true" : " inconsistent") : ""),
(items[i].loc_bad & LOC_OUTDATED ? (parent->json_output ? ",\"outdated\":true" : " outdated") : "")
);
}
else
{
auto json_item = json11::Json::object {
{ "inode", (uint64_t)items[i].inode },
{ "stripe", (uint64_t)items[i].stripe },
{ "part", (uint64_t)items[i].role },
{ "osd_num", (uint64_t)items[i].osd_num },
};
if (items[i].loc_bad & LOC_CORRUPTED)
json_item["corrupted"] = true;
if (items[i].loc_bad & LOC_INCONSISTENT)
json_item["inconsistent"] = true;
if (items[i].loc_bad & LOC_OUTDATED)
json_item["outdated"] = true;
describe_items.push_back(json_item);
}
count++;
}
}
delete op;
parent->waiting--;
if (!parent->waiting)
loop();
};
parent->waiting++;
parent->cli->execute_raw(only_osds[i], op);
}
resume_1:
state = 1;
if (parent->waiting > 0)
{
return;
}
if (parent->json_output && parent->is_command_line)
{
printf(count > 0 ? "\n]\n" : "]\n");
}
else
{
result.data = describe_items;
}
state = 100;
describe_items.clear();
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_describe(json11::Json cfg)
{
auto describer = new cli_describe_t();
describer->parent = this;
describer->options = cfg;
return [describer](cli_result_t & result)
{
describer->loop();
if (describer->is_done())
{
result = describer->result;
delete describer;
return true;
}
return false;
};
}
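The token scan in `parse_uint64_list()` above can be sketched standalone — here json11 input is replaced by a plain `std::string`, which is a simplification rather than the real CLI entry point. A token is a run of digits or `x` characters (so `0x...` hex values survive); anything else is a separator:

```cpp
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <string>
#include <vector>

// Sketch of the digit/'x' scanning loop from parse_uint64_list():
// collect runs of [0-9x], parse each with base autodetection.
std::vector<uint64_t> scan_uint64_list(const std::string & s)
{
    std::vector<uint64_t> ret;
    int p = -1;
    for (size_t i = 0; i <= s.size(); i++)
    {
        bool tok = i < s.size() && (isdigit((unsigned char)s[i]) || s[i] == 'x');
        if (p < 0 && tok)
            p = (int)i; // token starts here
        else if (p >= 0 && !tok)
        {
            // token ended: strtoull with base 0 handles both "123" and "0x7b"
            ret.push_back(strtoull(s.substr(p, i-p).c_str(), NULL, 0));
            p = -1;
        }
    }
    return ret;
}
```

So `"1, 2; 0x10"` yields `{1, 2, 16}` regardless of which separators appear between the numbers.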

src/cli_fix.cpp (new file, 313 lines)

@@ -0,0 +1,313 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "cli_fix.h"
#include "cluster_client.h"
#include "pg_states.h"
#include "str_util.h"
struct cli_fix_t
{
std::vector<object_id> objects;
int part = -1;
int processed_count = 0;
std::set<osd_num_t> bad_osds;
bool no_check = false;
cli_tool_t *parent = NULL;
int state = 0;
json11::Json options;
cli_result_t result;
json11::Json::array fix_result;
bool is_done()
{
return state == 100;
}
void parse_objects_str(std::string str)
{
str = trim(str);
if (str[0] == '[')
{
std::string json_err;
json11::Json list = json11::Json::parse(str, json_err);
if (json_err != "")
fprintf(stderr, "Invalid JSON object list input: %s\n", json_err.c_str());
else
parse_object_list(list);
}
else
{
const char *s = str.c_str();
char *e = NULL;
int len = str.size();
object_id oid;
for (int p = 0; p < len; p++)
{
if (isdigit(s[p]))
{
int p0 = p;
oid.inode = strtoull(s+p, &e, 0);
p = e-s;
while (p < len && !isdigit(s[p]) && s[p] != ':')
p++;
if (s[p] != ':')
{
fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
continue;
}
p++;
while (p < len && !isdigit(s[p]))
p++;
oid.stripe = strtoull(s+p, &e, 0) & ~STRIPE_MASK;
p = e-s;
if (oid.inode)
objects.push_back(oid);
else
fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
}
}
}
}
void parse_object_list(json11::Json list)
{
for (auto & obj: list.array_items())
{
object_id oid = (object_id){
.inode = stoull_full(obj["inode"].string_value(), 0),
.stripe = stoull_full(obj["stripe"].string_value(), 0) & ~STRIPE_MASK,
};
if (oid.inode)
objects.push_back(oid);
else
fprintf(stderr, "Invalid JSON object ID in input: %s, bad or missing \"inode\" field\n", obj.dump().c_str());
}
}
void parse_options(json11::Json cfg)
{
json11::Json object_list;
if (cfg["objects"].is_null())
parse_objects_str(read_all_fd(0));
else if (cfg["objects"].is_string())
parse_objects_str(cfg["objects"].string_value());
else
parse_object_list(cfg["objects"].array_items());
for (auto osd_num: parse_uint64_list(cfg["bad_osds"]))
bad_osds.insert(osd_num);
no_check = json_is_false(cfg["check"]);
if (cfg["part"].is_number() || cfg["part"].is_string())
part = cfg["part"].uint64_value();
}
void loop()
{
if (state == 1)
goto resume_1;
if (state == 100)
return;
parse_options(options);
if (!objects.size())
{
result = (cli_result_t){ .err = EINVAL, .text = "Object list is not specified" };
state = 100;
return;
}
if (!bad_osds.size())
{
result = (cli_result_t){ .err = EINVAL, .text = "OSDs are not specified" };
state = 100;
return;
}
remove_duplicates(objects);
parent->cli->init_msgr();
resume_1:
state = 1;
while (processed_count < objects.size())
{
if (parent->waiting >= parent->iodepth*parent->parallel_osds)
{
return;
}
auto & obj = objects[processed_count++];
auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
{
fprintf(stderr, "Object %lx:%lx is from unknown pool\n", obj.inode, obj.stripe);
continue;
}
auto & pool_cfg = pool_cfg_it->second;
pg_num_t pg_num = (obj.stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
auto pg_it = pool_cfg.pg_config.find(pg_num);
if (pg_it == pool_cfg.pg_config.end() ||
!pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
{
fprintf(
stderr, "Object %lx:%lx is from PG %u/%u which is not currently active\n",
obj.inode, obj.stripe, pool_cfg_it->first, pg_num
);
continue;
}
osd_num_t primary_osd = pg_it->second.cur_primary;
// Describe -> Remove some copies -> Scrub again
osd_op_t *op = new osd_op_t;
op->req = (osd_any_op_t){
.describe = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_DESCRIBE,
},
.min_inode = obj.inode,
.min_offset = obj.stripe,
.max_inode = obj.inode,
.max_offset = obj.stripe,
},
};
op->callback = [this, primary_osd, &obj](osd_op_t *op)
{
if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
{
fprintf(stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n", primary_osd, op->reply.hdr.retval);
parent->waiting--;
loop();
}
else
{
osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
int *rm_count = (int*)malloc_or_die(sizeof(int));
*rm_count = 1; // just in case anything gets called instantly
for (int i = 0; i < op->reply.hdr.retval; i++)
{
if (((items[i].loc_bad & LOC_INCONSISTENT) || no_check) &&
bad_osds.find(items[i].osd_num) != bad_osds.end() &&
(part == -1 || items[i].role == part))
{
// Remove
uint64_t rm_osd_num = items[i].osd_num;
osd_op_t *rm_op = new osd_op_t;
rm_op->req = (osd_any_op_t){
.sec_del = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_SEC_DELETE,
},
.oid = {
.inode = op->req.describe.min_inode,
.stripe = op->req.describe.min_offset | items[i].role,
},
.version = 0,
},
};
rm_op->callback = [this, primary_osd, rm_osd_num, rm_count, &obj](osd_op_t *rm_op)
{
(*rm_count)--;
if (rm_op->reply.hdr.retval < 0)
{
fprintf(
stderr, "Failed to remove object %lx:%lx from OSD %lu (retval=%ld)\n",
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
rm_osd_num, rm_op->reply.hdr.retval
);
}
else if (parent->json_output)
{
fix_result.push_back(json11::Json::object {
{ "inode", (uint64_t)rm_op->req.sec_del.oid.inode },
{ "stripe", (uint64_t)rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK },
{ "part", (uint64_t)rm_op->req.sec_del.oid.stripe & STRIPE_MASK },
{ "osd_num", (uint64_t)rm_osd_num },
});
}
else
{
printf(
"Removed %lx:%lx (part %lu) from OSD %lu\n",
rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
);
}
delete rm_op;
if (!(*rm_count))
{
// Scrub
free(rm_count);
osd_op_t *scrub_op = new osd_op_t;
scrub_op->req = (osd_any_op_t){
.rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_SCRUB,
},
.inode = obj.inode,
.offset = obj.stripe,
.len = 0,
},
};
scrub_op->callback = [this, primary_osd, &obj](osd_op_t *scrub_op)
{
if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
{
fprintf(
stderr, "Failed to scrub %lx:%lx on OSD %lu (retval=%ld)\n",
obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
);
}
delete scrub_op;
parent->waiting--;
loop();
};
parent->cli->execute_raw(primary_osd, scrub_op);
}
};
(*rm_count)++;
parent->cli->execute_raw(rm_osd_num, rm_op);
}
}
(*rm_count)--;
if (!*rm_count)
{
free(rm_count);
parent->waiting--;
loop();
}
}
delete op;
};
parent->waiting++;
parent->cli->execute_raw(primary_osd, op);
}
if (parent->waiting > 0)
{
return;
}
if (parent->json_output)
{
result.data = fix_result;
}
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_fix(json11::Json cfg)
{
auto fixer = new cli_fix_t();
fixer->parent = this;
fixer->options = cfg;
return [fixer](cli_result_t & result)
{
fixer->loop();
if (fixer->is_done())
{
result = fixer->result;
delete fixer;
return true;
}
return false;
};
}
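The object-to-PG mapping in `cli_fix.cpp` above (`(obj.stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1`, "like map_to_pg()") is simple round-robin over 1-based PG numbers. A standalone sketch, with parameter names mirroring the `pool_config_t` fields:

```cpp
#include <cassert>
#include <cstdint>

// Stripes are spread round-robin across PGs; PG numbers start at 1.
uint64_t stripe_to_pg(uint64_t stripe, uint64_t pg_stripe_size, uint64_t real_pg_count)
{
    return (stripe / pg_stripe_size) % real_pg_count + 1;
}
```

With a 4 KB stripe size and 16 PGs, stripe 0 maps to PG 1, stripe 4096 to PG 2, and stripe 4096*16 wraps back to PG 1.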

src/cli_fix.h (new file, 26 lines)

@@ -0,0 +1,26 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include "cli.h"
#include <algorithm>
std::vector<uint64_t> parse_uint64_list(json11::Json val);
template<class T> void remove_duplicates(std::vector<T> & ret)
{
if (!ret.size())
return;
std::sort(ret.begin(), ret.end());
int j = 0;
for (int i = 1; i < ret.size(); i++)
{
if (ret[i] != ret[j])
ret[++j] = ret[i];
}
ret.resize(j+1);
}
// from http_client.cpp...
bool json_is_false(const json11::Json & val);
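The `remove_duplicates()` template above is the classic sort-then-compact dedup: sort, shift the first element of each equal run forward in place, then shrink. A self-contained copy with a usage check:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Same algorithm as remove_duplicates() in cli_fix.h:
// sort, compact equal runs in place, then resize to the compacted length.
template<class T> void remove_duplicates(std::vector<T> & ret)
{
    if (!ret.size())
        return;
    std::sort(ret.begin(), ret.end());
    size_t j = 0;
    for (size_t i = 1; i < ret.size(); i++)
    {
        if (ret[i] != ret[j])
            ret[++j] = ret[i];
    }
    ret.resize(j+1);
}
```

For example, `{3, 1, 3, 2, 1}` becomes `{1, 2, 3}` — the output is sorted as a side effect, which is what makes the single-pass compaction valid.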


@@ -410,14 +410,17 @@ struct rm_osd_t
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
);
auto hist = json11::Json::object {
{ "epoch", pg_cfg.epoch },
{ "all_peers", pg_cfg.all_peers },
{ "osd_sets", pg_cfg.target_history },
};
if (pg_cfg.next_scrub)
hist["next_scrub"] = pg_cfg.next_scrub;
history_updates.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", history_key },
{ "value", base64_encode(json11::Json(json11::Json::object {
{ "epoch", pg_cfg.epoch },
{ "all_peers", pg_cfg.all_peers },
{ "osd_sets", pg_cfg.target_history },
}).dump()) },
{ "value", base64_encode(json11::Json(hist).dump()) },
} },
});
history_checks.push_back(json11::Json::object {


@@ -201,6 +201,7 @@ resume_2:
bool readonly = json_is_true(parent->cli->config["readonly"]);
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
bool no_scrub = json_is_true(parent->cli->config["no_scrub"]);
if (parent->json_output)
{
// JSON output
@@ -219,6 +220,7 @@ resume_2:
{ "readonly", readonly },
{ "no_recovery", no_recovery },
{ "no_rebalance", no_rebalance },
{ "no_scrub", no_scrub },
{ "pool_count", pool_count },
{ "active_pool_count", pools_active },
{ "pg_states", pgs_by_state },


@@ -35,6 +35,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
// peer_osd just connected
continue_ops();
continue_lists();
continue_raw_ops(peer_osd);
}
else if (dirty_buffers.size())
{
@@ -104,6 +105,19 @@ cluster_op_t::~cluster_op_t()
}
}
void cluster_client_t::continue_raw_ops(osd_num_t peer_osd)
{
auto it = raw_ops.find(peer_osd);
while (it != raw_ops.end() && it->first == peer_osd)
{
auto op = it->second;
op->op_type = OSD_OP_OUT;
op->peer_fd = msgr.osd_peer_fds.at(peer_osd);
msgr.outbox_push(op);
raw_ops.erase(it++);
}
}
void cluster_client_t::init_msgr()
{
if (msgr_initialized)
@@ -512,6 +526,23 @@ void cluster_client_t::execute(cluster_op_t *op)
}
}
void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
{
auto fd_it = msgr.osd_peer_fds.find(osd_num);
if (fd_it != msgr.osd_peer_fds.end())
{
op->op_type = OSD_OP_OUT;
op->peer_fd = fd_it->second;
msgr.outbox_push(op);
}
else
{
if (msgr.wanted_peers.find(osd_num) == msgr.wanted_peers.end())
msgr.connect_peer(osd_num, st_cli.peer_states[osd_num]);
raw_ops.emplace(osd_num, op);
}
}
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
{
// Save operation for replay when one of PGs goes out of sync
@@ -743,15 +774,16 @@ resume_3:
erase_op(op);
return 1;
}
else if (op->retval != 0 && op->retval != -EPIPE)
else if (op->retval != 0 && op->retval != -EPIPE && op->retval != -EIO && op->retval != -ENOSPC)
{
// Fatal error (not -EPIPE)
// Fatal error (neither -EPIPE, -EIO nor -ENOSPC)
// FIXME: Add a parameter to allow to not wait for EIOs (incomplete or corrupted objects) to heal
erase_op(op);
return 1;
}
else
{
// -EPIPE - clear the error and retry
// Non-fatal error - clear the error and retry
op->retval = 0;
if (op->needs_reslice)
{
@@ -1048,7 +1080,7 @@ resume_1:
uw_it->second.state = CACHE_DIRTY;
}
}
if (op->retval == -EPIPE)
if (op->retval == -EPIPE || op->retval == -EIO || op->retval == -ENOSPC)
{
// Retry later
op->parts.clear();
@@ -1119,13 +1151,13 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
// Operation failed, retry
part->flags |= PART_ERROR;
if (!op->retval || op->retval == -EPIPE)
if (!op->retval || op->retval == -EPIPE || part->op.reply.hdr.retval == -EIO)
{
// Don't overwrite other errors with -EPIPE
// Error priority: EIO > ENOSPC > EPIPE
op->retval = part->op.reply.hdr.retval;
}
int stop_fd = -1;
if (op->retval != -EINTR && op->retval != -EIO)
if (op->retval != -EINTR && op->retval != -EIO && op->retval != -ENOSPC)
{
stop_fd = part->op.peer_fd;
fprintf(
@@ -1133,21 +1165,25 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
else
{
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
if (part->op.reply.hdr.retval == -EPIPE)
// Mark op->up_wait = true to retry operation after a short pause (not immediately)
op->up_wait = true;
if (!retry_timeout_id)
{
// Mark op->up_wait = true before stopping the client
op->up_wait = true;
if (!retry_timeout_id)
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = 0;
continue_ops(true);
});
}
retry_timeout_id = 0;
continue_ops(true);
});
}
if (op->inflight_count == 0)
{


@@ -103,6 +103,7 @@ class cluster_client_t
ring_consumer_t consumer;
std::vector<std::function<void(void)>> on_ready_hooks;
std::vector<inode_list_t*> lists;
std::multimap<osd_num_t, osd_op_t*> raw_ops;
int continuing_ops = 0;
bool msgr_initialized = false;
@@ -118,6 +119,7 @@ public:
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t();
void execute(cluster_op_t *op);
void execute_raw(osd_num_t osd_num, osd_op_t *op);
bool is_ready();
void on_ready(std::function<void(void)> fn);
@@ -153,4 +155,5 @@ protected:
void continue_lists();
void continue_listing(inode_list_t *lst);
void send_list(inode_list_osd_t *cur_list);
void continue_raw_ops(osd_num_t peer_osd);
};
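The `raw_ops` multimap declared above backs a send-or-queue pattern: `execute_raw()` pushes an op straight to the outbox when the peer OSD is already connected, otherwise requests a connection and parks the op; `continue_raw_ops()` flushes parked ops when the peer comes up. A minimal sketch of that pattern — all types here are hypothetical stand-ins, not the real messenger API:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

struct fake_op { int id; };

struct raw_op_queue
{
    std::map<uint64_t, int> peer_fds;            // connected OSD -> peer fd
    std::multimap<uint64_t, fake_op*> raw_ops;   // ops parked per OSD
    std::vector<fake_op*> sent;                  // stand-in for the outbox

    void execute_raw(uint64_t osd_num, fake_op *op)
    {
        if (peer_fds.find(osd_num) != peer_fds.end())
            sent.push_back(op);           // peer connected: send immediately
        else
            raw_ops.emplace(osd_num, op); // otherwise queue until connect
    }

    void on_connect(uint64_t osd_num, int fd) // like continue_raw_ops()
    {
        peer_fds[osd_num] = fd;
        auto it = raw_ops.find(osd_num);
        while (it != raw_ops.end() && it->first == osd_num)
        {
            sent.push_back(it->second);
            raw_ops.erase(it++); // erase current entry, advance first
        }
    }
};
```

The `erase(it++)` idiom mirrors the loop in `continue_raw_ops()`: the iterator is advanced before the erased node is invalidated, so all entries for one key can be drained in a single pass.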


@@ -621,6 +621,11 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, hybrid && dev.is_hdd ? 1 : 0);
if (hybrid)
{
options.erase("journal_device");
options.erase("meta_device");
}
}
}
}


@@ -55,23 +55,6 @@ std::string realpath_str(std::string path, bool nofail)
return rp;
}
std::string read_all_fd(int fd)
{
int res_size = 0;
std::string res;
while (1)
{
res.resize(res_size+1024);
int r = read(fd, (char*)res.data()+res_size, res.size()-res_size);
if (r > 0)
res_size += r;
else if (!r || errno != EAGAIN && errno != EINTR)
break;
}
res.resize(res_size);
return res;
}
std::string read_file(std::string file, bool allow_enoent)
{
std::string res;


@@ -7,8 +7,8 @@
#ifndef __MOCK__
#include "addr_util.h"
#include "http_client.h"
#include "str_util.h"
#endif
#include "str_util.h"
etcd_state_client_t::~etcd_state_client_t()
{
@@ -777,6 +777,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
continue;
}
// Scrub Interval
pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
if (!pc.scrub_interval)
pc.scrub_interval = 0;
// Immediate Commit Mode
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
? (pool_item.second["immediate_commit"].string_value() == "all"
@@ -919,6 +923,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
// Read epoch
pg_cfg.epoch = value["epoch"].uint64_value();
// Next scrub timestamp (0 or empty = scrub is not needed)
pg_cfg.next_scrub = value["next_scrub"].uint64_value();
if (on_change_pg_history_hook != NULL)
{
on_change_pg_history_hook(pool_id, pg_num);


@@ -39,6 +39,7 @@ struct pg_config_t
osd_num_t cur_primary;
int cur_state;
uint64_t epoch;
uint64_t next_scrub;
};
struct pool_config_t
@@ -55,6 +56,7 @@ struct pool_config_t
uint64_t max_osd_combinations;
uint64_t pg_stripe_size;
std::map<pg_num_t, pg_config_t> pg_config;
uint64_t scrub_interval;
};
struct inode_config_t


@@ -251,6 +251,10 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
return;
}
clients[peer_fd] = new osd_client_t();
if (log_level > 0)
{
fprintf(stderr, "Connecting to OSD %lu at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
}
clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = peer_port;
clients[peer_fd]->peer_fd = peer_fd;
@@ -313,7 +317,10 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
if (epoll_events & EPOLLRDHUP)
{
// Stop client
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
if (log_level > 0)
{
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
}
stop_client(peer_fd, true);
}
else if (epoll_events & EPOLLIN)


@@ -50,7 +50,7 @@ struct osd_client_t
sockaddr_storage peer_addr;
int peer_port;
int peer_fd;
int peer_fd = -1;
int peer_state;
int connect_timeout_id = -1;
int ping_time_remaining = 0;
@@ -87,11 +87,7 @@ struct osd_client_t
std::vector<iovec> send_list, next_send_list;
std::vector<msgr_sendp_t> outbox, next_outbox;
~osd_client_t()
{
free(in_buf);
in_buf = NULL;
}
~osd_client_t();
};
struct osd_wanted_peer_t


@@ -103,7 +103,10 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
cl->recv_list.eat(result);
if (cl->recv_list.done >= cl->recv_list.count)
{
handle_finished_read(cl);
if (!handle_finished_read(cl))
{
goto fin;
}
}
}
if (result >= cl->read_iov.iov_len)
@@ -248,10 +251,6 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
}
cl->read_remaining = cur_op->req.sec_read_bmp.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
{
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
if (cur_op->req.rw.len > 0)
@@ -271,6 +270,12 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
}
cl->read_remaining = cur_op->req.show_conf.json_len;
}
/*else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_SCRUB ||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
{
cl->read_remaining = 0;
}*/
if (cl->read_remaining > 0)
{
// Read data
@@ -364,6 +369,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
op->buf = malloc_or_die(op->reply.hdr.retval);
cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
}
else if (op->reply.hdr.opcode == OSD_OP_DESCRIBE && op->reply.hdr.retval > 0)
{
delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.describe.result_bytes;
free(op->buf);
op->buf = malloc_or_die(op->reply.describe.result_bytes);
cl->recv_list.push_back(op->buf, op->reply.describe.result_bytes);
}
else
{
reuse:


@@ -73,7 +73,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
? (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG ||
cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
: (cur_op->req.hdr.opcode == OSD_OP_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||


@@ -122,17 +122,6 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
// Cancel outbound operations
cancel_osd_ops(cl);
}
#ifndef __MOCK__
// And close the FD only when everything is done
// ...because peer_fd number can get reused after close()
close(peer_fd);
#ifdef WITH_RDMA
if (cl->rdma_conn)
{
delete cl->rdma_conn;
}
#endif
#endif
// Find the item again because it can be invalidated at this point
it = clients.find(peer_fd);
if (it != clients.end())
@@ -145,3 +134,25 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
delete cl;
}
}
osd_client_t::~osd_client_t()
{
free(in_buf);
in_buf = NULL;
if (peer_fd >= 0)
{
// Close the FD only when the client is actually destroyed
// Which only happens when all references are cleared
close(peer_fd);
peer_fd = -1;
}
#ifndef __MOCK__
#ifdef WITH_RDMA
if (rdma_conn)
{
delete rdma_conn;
rdma_conn = NULL;
}
#endif
#endif
}


@@ -137,12 +137,19 @@ public:
"OPTIONS:\n"
" All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
" --nbd_timeout 30\n"
" timeout in seconds after which the kernel will stop the device\n"
" you can set it to 0, but beware that you won't be able to stop the device at all\n"
" if vitastor-nbd process dies\n"
" Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
" the device. You can set it to 0 to disable the timeout, but beware that you\n"
" won't be able to stop the device at all if vitastor-nbd process dies.\n"
" --nbd_max_devices 64 --nbd_max_part 3\n"
" options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n",
" Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
" note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
" --logfile /path/to/log/file.txt\n"
" Wite log messages to the specified file instead of dropping them (in background mode)\n"
" or printing them to the standard output (in foreground mode).\n"
" --dev_num N\n"
" Use the specified device /dev/nbdN instead of automatic selection.\n"
" --foreground 1\n"
" Stay in foreground, do not daemonize.n",
exe_name, exe_name, exe_name
);
exit(0);


@@ -13,6 +13,7 @@
#include "osd_primary.h"
#include "osd.h"
#include "http_client.h"
#include "str_util.h"
static blockstore_config_t json_to_bs(const json11::Json::object & config)
{
@@ -168,6 +169,8 @@ void osd_t::parse_config(bool init)
no_rebalance = json_is_true(config["no_rebalance"]);
auto old_no_recovery = no_recovery;
no_recovery = json_is_true(config["no_recovery"]);
auto old_no_scrub = no_scrub;
no_scrub = json_is_true(config["no_scrub"]);
auto old_autosync_interval = autosync_interval;
if (!config["autosync_interval"].is_null())
{
@@ -207,6 +210,38 @@ void osd_t::parse_config(bool init)
inode_vanish_time = config["inode_vanish_time"].uint64_value();
if (!inode_vanish_time)
inode_vanish_time = 60;
auto old_auto_scrub = auto_scrub;
auto_scrub = json_is_true(config["auto_scrub"]);
global_scrub_interval = parse_time(config["scrub_interval"].string_value());
if (!global_scrub_interval)
global_scrub_interval = 30*86400;
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
scrub_queue_depth = 1;
scrub_find_best = !json_is_false(config["scrub_find_best"]);
scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value();
if (scrub_ec_max_bruteforce < 1)
scrub_ec_max_bruteforce = 100;
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
scrub_list_limit = config["scrub_list_limit"].uint64_value();
if (!scrub_list_limit)
scrub_list_limit = 1000;
if (!old_auto_scrub && auto_scrub)
{
// Schedule scrubbing
for (auto & pgp: pgs)
{
plan_scrub(pgp.second);
}
}
if (old_no_scrub && !no_scrub)
{
// Wakeup scrubbing
for (auto & pgp: pgs)
{
schedule_scrub(pgp.second);
}
}
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
{
@@ -337,6 +372,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
cur_op->req.hdr.opcode != OSD_OP_READ &&
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
cur_op->req.hdr.opcode != OSD_OP_DESCRIBE &&
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
{
// Readonly mode
@@ -367,6 +404,14 @@ void osd_t::exec_op(osd_op_t *cur_op)
{
continue_primary_del(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
{
continue_primary_scrub(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
{
continue_primary_describe(cur_op);
}
else
{
exec_secondary(cur_op);
@@ -431,6 +476,10 @@ void osd_t::print_stats()
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
}
}
if (corrupted_objects > 0)
{
printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
}
if (incomplete_objects > 0)
{
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
@@ -498,10 +547,11 @@ void osd_t::print_slow()
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
{
bufprintf(
" inode=%lx-%lx pg=%u/%u, stripe=%lu",
op->req.sec_list.min_inode, op->req.sec_list.max_inode,
" oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
op->req.sec_list.pg_stripe_size
op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
);
}
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||


@@ -28,6 +28,7 @@
#define OSD_PEERING_PGS 0x04
#define OSD_FLUSHING_PGS 0x08
#define OSD_RECOVERING 0x10
#define OSD_SCRUBBING 0x20
#define MAX_AUTOSYNC_INTERVAL 3600
#define DEFAULT_AUTOSYNC_INTERVAL 5
@@ -98,6 +99,7 @@ class osd_t
bool run_primary = false;
bool no_rebalance = false;
bool no_recovery = false;
bool no_scrub = false;
std::string bind_address;
int bind_port, listen_backlog = 128;
// FIXME: Implement client queue depth limit
@@ -113,6 +115,13 @@ class osd_t
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int inode_vanish_time = 60;
int log_level = 0;
bool auto_scrub = false;
uint64_t global_scrub_interval = 30*86400;
uint64_t scrub_queue_depth = 1;
uint64_t scrub_sleep_ms = 0;
uint32_t scrub_list_limit = 1000;
bool scrub_find_best = true;
uint64_t scrub_ec_max_bruteforce = 100;
// cluster state
@@ -135,15 +144,24 @@ class osd_t
std::set<pool_pg_num_t> dirty_pgs;
std::set<osd_num_t> dirty_osds;
int copies_to_delete_after_sync_count = 0;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0;
int peering_state = 0;
std::map<object_id, osd_recovery_op_t> recovery_ops;
std::map<object_id, osd_op_t*> scrub_ops;
bool recovery_last_degraded = true;
pool_pg_num_t recovery_last_pg;
object_id recovery_last_oid;
int recovery_pg_done = 0, recovery_done = 0;
osd_op_t *autosync_op = NULL;
// Scrubbing
uint64_t scrub_nearest_ts = 0;
int scrub_timer_id = -1;
pool_pg_num_t scrub_last_pg = {};
osd_op_t *scrub_list_op = NULL;
pg_list_result_t scrub_cur_list = {};
uint64_t scrub_list_pos = 0;
// Unstable writes
uint64_t unstable_write_count = 0;
std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -221,6 +239,14 @@ class osd_t
bool continue_recovery();
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
// scrub
void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
int pick_next_scrub(object_id & next_oid);
void submit_scrub_op(object_id oid);
bool continue_scrub();
void plan_scrub(pg_t & pg, bool report_state = true);
void schedule_scrub(pg_t & pg);
// op execution
void exec_op(osd_op_t *cur_op);
void finish_op(osd_op_t *cur_op, int retval);
@@ -235,13 +261,19 @@ class osd_t
void autosync();
bool prepare_primary_rw(osd_op_t *cur_op);
void continue_primary_read(osd_op_t *cur_op);
void continue_primary_scrub(osd_op_t *cur_op);
void continue_primary_describe(osd_op_t *cur_op);
void continue_primary_write(osd_op_t *cur_op);
void cancel_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op);
void continue_primary_del(osd_op_t *cur_op);
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level);
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
void handle_primary_bs_subop(osd_op_t *subop);
@@ -256,10 +288,11 @@ class osd_t
int submit_primary_sync_subops(osd_op_t *cur_op);
void submit_primary_stab_subops(osd_op_t *cur_op);
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
void continue_chained_read(osd_op_t *cur_op);
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);

@@ -337,6 +337,8 @@ void osd_t::report_statistics()
pg_stats["misplaced_count"] = pg.misplaced_objects.size();
pg_stats["degraded_count"] = pg.degraded_objects.size();
pg_stats["incomplete_count"] = pg.incomplete_objects.size();
if (pg.corrupted_count)
pg_stats["corrupted_count"] = pg.corrupted_count;
pg_stats["write_osd_set"] = pg.cur_set;
txn.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
@@ -692,6 +694,11 @@ void osd_t::apply_pg_config()
pg_it->second.all_peers == vec_all_peers)
{
// No change in osd_set and history
if (pg_it->second.next_scrub != pg_cfg.next_scrub)
{
pg_it->second.next_scrub = pg_cfg.next_scrub;
schedule_scrub(pg_it->second);
}
continue;
}
else
@@ -743,6 +750,7 @@ void osd_t::apply_pg_config()
.reported_epoch = pg_cfg.epoch,
.target_history = pg_cfg.target_history,
.all_peers = vec_all_peers,
.next_scrub = pg_cfg.next_scrub,
.target_set = pg_cfg.target_set,
};
if (pg.scheme == POOL_SCHEME_EC)
@@ -812,11 +820,21 @@ void osd_t::report_pg_states()
pg_it->second.cur_state != 0)
{
pg_state_exists = true;
if (pg.state == PG_OFFLINE && pg_it->second.cur_primary != this->osd_num)
{
// Nothing to check or report, PG is already taken over by another OSD
continue;
}
}
}
}
if (!pg_state_exists)
{
if (pg.state == PG_OFFLINE)
{
// Nothing to check or report, PG is already stopped
continue;
}
// Check that the PG key does not exist
// Failed check indicates an unsuccessful PG lock attempt in this case
checks.push_back(json11::Json::object {
@@ -873,6 +891,8 @@ void osd_t::report_pg_states()
{ "all_peers", pg.all_peers },
{ "osd_sets", pg.target_history },
};
if (pg.next_scrub)
history_value["next_scrub"] = pg.next_scrub;
checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", history_key },
@@ -901,6 +921,15 @@ void osd_t::report_pg_states()
{
etcd_reporting_pg_state = false;
if (!data["succeeded"].bool_value())
{
std::string rpgnames = "";
for (auto pp: reporting_pgs)
{
rpgnames += (rpgnames.size() ? ", " : "")+std::to_string(pp.pool_pg_num.pool_id)+"/"+std::to_string(pp.pool_pg_num.pg_num);
}
printf("Error reporting PG %s states, will repeat the attempt: %s\n", rpgnames.c_str(), err.c_str());
}
if (!data["succeeded"].bool_value())
{
// One of PG state updates failed, put dirty flags back
for (auto pp: reporting_pgs)

@@ -192,7 +192,9 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
op->bs_op = NULL;
delete op;
},
.len = (uint32_t)count,
{
.len = (uint32_t)count,
},
.buf = op->buf,
});
bs->enqueue_op(op->bs_op);
@@ -303,27 +305,25 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
};
if (log_level > 2)
{
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
printf("Submitting recovery operation for %lx:%lx (%s)\n", op->oid.inode, op->oid.stripe, op->degraded ? "degraded" : "misplaced");
}
op->osd_op->peer_fd = -1;
op->osd_op->callback = [this, op](osd_op_t *osd_op)
{
if (osd_op->reply.hdr.retval < 0)
{
// Error recovering object
if (osd_op->reply.hdr.retval == -EPIPE)
{
// PG is stopped or one of the OSDs is gone, error is harmless
printf(
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
op->oid.inode, op->oid.stripe
);
}
else
{
throw std::runtime_error("Failed to recover an object");
}
// EPIPE is totally harmless (peer is gone), but others like EIO/EDOM may not be
printf(
"[PG %u/%u] Recovery operation failed with object %lx:%lx: error %ld\n",
INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
op->oid.inode, op->oid.stripe, osd_op->reply.hdr.retval
);
}
else if (log_level > 2)
{
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
op->osd_op = NULL;

@@ -21,4 +21,6 @@ const char* osd_op_names[] = {
"primary_delete",
"ping",
"sec_read_bmp",
"scrub",
"describe",
};

@@ -29,7 +29,9 @@
#define OSD_OP_DELETE 14
#define OSD_OP_PING 15
#define OSD_OP_SEC_READ_BMP 16
#define OSD_OP_MAX 16
#define OSD_OP_SCRUB 17
#define OSD_OP_DESCRIBE 18
#define OSD_OP_MAX 18
#define OSD_RW_MAX 64*1024*1024
#define OSD_PROTOCOL_VERSION 1
@@ -43,6 +45,11 @@
#define MEM_ALIGNMENT 4096
#endif
// Constants for osd_reply_describe_item_t.loc_bad
#define LOC_OUTDATED 1
#define LOC_CORRUPTED 2
#define LOC_INCONSISTENT 4
// common request and reply headers
struct __attribute__((__packed__)) osd_op_header_t
{
@@ -173,6 +180,11 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
uint64_t pg_stripe_size;
// inode range (used to select pools)
uint64_t min_inode, max_inode;
// min/max oid stripe, added after inodes for backwards compatibility
// also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o
uint64_t min_stripe, max_stripe;
// max stable object count
uint32_t stable_limit;
};
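The backwards-compatibility note on `min_stripe`/`max_stripe` above is easy to misread. A minimal sketch of one plausible normalization, with `effective_max_stripe` being a hypothetical helper name (not part of the protocol header), assuming the comment's mapping that a wire value of `UINT64_MAX` means 0 and 0 means `UINT64_MAX` because old clients leave the new field zeroed:

```cpp
#include <cstdint>
#include <cassert>

// Hypothetical helper illustrating the quirk noted in osd_op_sec_list_t:
// on the wire, max_stripe == 0 means "no upper bound" (old clients zero the
// field), while an explicit UINT64_MAX means 0.
static inline uint64_t effective_max_stripe(uint64_t wire_max_stripe)
{
    if (wire_max_stripe == 0)
        return UINT64_MAX; // old client: field absent/zeroed -> no limit
    if (wire_max_stripe == UINT64_MAX)
        return 0;          // explicit empty range
    return wire_max_stripe;
}
```

Any other value passes through unchanged, so new clients can set a real upper bound.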
struct __attribute__((__packed__)) osd_reply_sec_list_t
@@ -223,6 +235,36 @@ struct __attribute__((__packed__)) osd_reply_sync_t
osd_reply_header_t header;
};
// describe unclean object states in detail
struct __attribute__((__packed__)) osd_op_describe_t
{
osd_op_header_t header;
// state mask to filter objects by state (0 or 0xfff..ff = all objects)
uint64_t object_state;
// minimum inode and offset
uint64_t min_inode, min_offset;
// maximum inode and offset
uint64_t max_inode, max_offset;
// limit
uint64_t limit;
};
struct __attribute__((__packed__)) osd_reply_describe_t
{
osd_reply_header_t header;
// size of the resulting <osd_reply_describe_item_t> array in bytes
uint64_t result_bytes;
};
struct __attribute__((__packed__)) osd_reply_describe_item_t
{
uint64_t inode;
uint64_t stripe;
uint32_t role; // part number: 0 for replicas, 0..pg_size-1 for EC
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
osd_num_t osd_num; // OSD number
};
// FIXME it would be interesting to try to unify blockstore_op and osd_op formats
union osd_any_op_t
{
@@ -236,6 +278,7 @@ union osd_any_op_t
osd_op_show_config_t show_conf;
osd_op_rw_t rw;
osd_op_sync_t sync;
osd_op_describe_t describe;
uint8_t buf[OSD_PACKET_SIZE];
};
@@ -251,6 +294,7 @@ union osd_any_reply_t
osd_reply_show_config_t show_conf;
osd_reply_rw_t rw;
osd_reply_sync_t sync;
osd_reply_describe_t describe;
uint8_t buf[OSD_PACKET_SIZE];
};
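The describe reply is a header followed by `result_bytes` bytes of packed items. A self-contained decoding sketch, with the item struct re-declared locally to match the layout of `osd_reply_describe_item_t` above (the `describe_item`/`parse_describe` names are illustrative, not from the source):

```cpp
#include <cstdint>
#include <cstring>
#include <cassert>
#include <vector>

#define LOC_OUTDATED 1
#define LOC_CORRUPTED 2
#define LOC_INCONSISTENT 4

typedef uint64_t osd_num_t;

// Re-declared locally for a self-contained sketch; layout assumed to match
// osd_reply_describe_item_t in the protocol header.
struct __attribute__((__packed__)) describe_item
{
    uint64_t inode;
    uint64_t stripe;
    uint32_t role;    // part number: 0 for replicas, 0..pg_size-1 for EC
    uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
    osd_num_t osd_num;
};

// Decode a describe reply payload: <result_bytes> bytes of packed items
static std::vector<describe_item> parse_describe(const uint8_t *buf, uint64_t result_bytes)
{
    std::vector<describe_item> items;
    for (uint64_t pos = 0; pos + sizeof(describe_item) <= result_bytes; pos += sizeof(describe_item))
    {
        describe_item it;
        memcpy(&it, buf+pos, sizeof(it));
        items.push_back(it);
    }
    return items;
}
```

A caller would then test `loc_bad` bits per item to see why a particular copy is considered bad.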

@@ -25,6 +25,7 @@ void osd_t::handle_peers()
{
p.second.calc_object_states(log_level);
report_pg_state(p.second);
schedule_scrub(p.second);
incomplete_objects += p.second.incomplete_objects.size();
misplaced_objects += p.second.misplaced_objects.size();
// FIXME: degraded objects may currently include misplaced, too! Report them separately?
@@ -83,6 +84,13 @@ void osd_t::handle_peers()
peering_state = peering_state & ~OSD_RECOVERING;
}
}
if (peering_state & OSD_SCRUBBING)
{
if (!continue_scrub())
{
peering_state = peering_state & ~OSD_SCRUBBING;
}
}
}
void osd_t::repeer_pgs(osd_num_t peer_osd)
@@ -128,9 +136,11 @@ void osd_t::reset_pg(pg_t & pg)
pg.state_dict.clear();
copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
pg.copies_to_delete_after_sync.clear();
corrupted_objects -= pg.corrupted_count;
incomplete_objects -= pg.incomplete_objects.size();
misplaced_objects -= pg.misplaced_objects.size();
degraded_objects -= pg.degraded_objects.size();
pg.corrupted_count = 0;
pg.incomplete_objects.clear();
pg.misplaced_objects.clear();
pg.degraded_objects.clear();
@@ -206,7 +216,7 @@ void osd_t::start_pg_peering(pg_t & pg)
pg.cur_loc_set.push_back({
.role = (uint64_t)role,
.osd_num = pg.cur_set[role],
.outdated = false,
.loc_bad = 0,
});
}
}
@@ -319,11 +329,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST;
op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
op->bs_op->len = pg_counts[ps->pool_id];
op->bs_op->offset = ps->pg_num-1;
op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size;
op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
op->bs_op->max_oid.stripe = UINT64_MAX;
op->bs_op->pg_count = pg_counts[ps->pool_id];
op->bs_op->pg_number = ps->pg_num-1;
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
{
if (op->bs_op->retval < 0)
@@ -483,6 +494,7 @@ void osd_t::report_pg_state(pg_t & pg)
pg.all_peers = pg.target_set;
std::sort(pg.all_peers.begin(), pg.all_peers.end());
pg.cur_peers = pg.target_set;
plan_scrub(pg, false);
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
pg_cfg.target_history = pg.target_history;
@@ -526,6 +538,7 @@ void osd_t::report_pg_state(pg_t & pg)
pg.cur_peers.push_back(pg_osd);
}
}
plan_scrub(pg, false);
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
pg_cfg.target_history = pg.target_history;
pg_cfg.all_peers = pg.all_peers;
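The `min_oid`/`max_oid` assignments in `submit_list_subop` above derive a pool's inode range from the pool ID stored in the top bits of the 64-bit inode number. A standalone sketch of that computation, assuming `POOL_ID_BITS` is 16 (an assumption here; the real constant lives in the headers):

```cpp
#include <cstdint>
#include <cassert>

#define POOL_ID_BITS 16 // assumed value; matches the shift width used above

typedef uint32_t pool_id_t;

// Inode range covered by one pool, as in submit_list_subop():
// the pool ID occupies the top POOL_ID_BITS of the inode number.
static inline void pool_inode_range(pool_id_t pool_id, uint64_t *min_inode, uint64_t *max_inode)
{
    *min_inode = (uint64_t)pool_id << (64 - POOL_ID_BITS);
    *max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
}
```

Consecutive pools thus get adjacent, non-overlapping inode ranges.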

@@ -255,7 +255,7 @@ void pg_obj_state_check_t::finish_object()
}
else if (n_mismatched > 0)
{
if (log_level > 2 && (replicated || n_roles >= pg->pg_cursize))
if (log_level > 2)
{
printf("Object is misplaced: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
@@ -280,7 +280,7 @@ void pg_obj_state_check_t::finish_object()
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
.osd_num = list[i].osd_num,
.outdated = false,
.loc_bad = 0,
});
}
}
@@ -302,7 +302,7 @@ void pg_obj_state_check_t::finish_object()
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
.osd_num = list[i].osd_num,
.outdated = true,
.loc_bad = LOC_OUTDATED,
});
if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
{
@@ -322,67 +322,77 @@ void pg_obj_state_check_t::finish_object()
}
else
{
auto it = pg->state_dict.find(osd_set);
if (it == pg->state_dict.end())
{
std::vector<uint64_t> read_target;
if (replicated)
{
for (auto & o: osd_set)
{
if (!o.outdated)
{
read_target.push_back(o.osd_num);
}
}
while (read_target.size() < pg->pg_size)
{
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
read_target.push_back(0);
}
}
else
{
read_target.resize(pg->pg_size);
for (int i = 0; i < pg->pg_size; i++)
{
read_target[i] = 0;
}
for (auto & o: osd_set)
{
if (!o.outdated)
{
read_target[o.role] = o.osd_num;
}
}
}
pg->state_dict[osd_set] = {
.read_target = read_target,
.osd_set = osd_set,
.state = state,
.object_count = 1,
};
it = pg->state_dict.find(osd_set);
}
else
{
it->second.object_count++;
}
if (state & OBJ_INCOMPLETE)
{
pg->incomplete_objects[oid] = &it->second;
}
else if (state & OBJ_DEGRADED)
{
pg->degraded_objects[oid] = &it->second;
}
else
{
pg->misplaced_objects[oid] = &it->second;
}
pg->add_object_to_state(oid, state, osd_set);
}
}
pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set)
{
auto it = state_dict.find(osd_set);
if (it == state_dict.end())
{
std::vector<osd_num_t> read_target;
if (scheme == POOL_SCHEME_REPLICATED)
{
for (auto & o: osd_set)
{
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
{
read_target.push_back(o.osd_num);
}
}
while (read_target.size() < pg_size)
{
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
read_target.push_back(0);
}
}
else
{
read_target.resize(pg_size);
for (int i = 0; i < pg_size; i++)
{
read_target[i] = 0;
}
for (auto & o: osd_set)
{
if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
{
read_target[o.role] = o.osd_num;
}
}
}
state_dict[osd_set] = {
.read_target = read_target,
.osd_set = osd_set,
.state = state,
.object_count = 1,
};
it = state_dict.find(osd_set);
}
else
{
it->second.object_count++;
}
if (state & OBJ_INCONSISTENT)
{
inconsistent_objects[oid] = &it->second;
}
else if (state & OBJ_INCOMPLETE)
{
incomplete_objects[oid] = &it->second;
}
else if (state & OBJ_DEGRADED)
{
degraded_objects[oid] = &it->second;
}
else
{
misplaced_objects[oid] = &it->second;
}
return &it->second;
}
// FIXME: Write at least some tests for this function
void pg_t::calc_object_states(int log_level)
{
@@ -446,7 +456,9 @@ void pg_t::calc_object_states(int log_level)
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
std::to_string(loc.osd_num)+
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
(loc.outdated ? "(old)" : "");
(loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
(loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
}
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
}
@@ -456,7 +468,7 @@ void pg_t::calc_object_states(int log_level)
void pg_t::print_state()
{
printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "",
@@ -465,12 +477,15 @@ void pg_t::print_state()
(state & PG_REPEERING) ? "repeering" : "",
(state & PG_STOPPING) ? "stopping" : "",
(state & PG_DEGRADED) ? " + degraded" : "",
(state & PG_HAS_INCONSISTENT) ? " + has_inconsistent" : "",
(state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
(state & PG_HAS_INVALID) ? " + has_invalid" : "",
(state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
(state & PG_SCRUBBING) ? " + scrubbing" : "",
total_count
);
}

@@ -17,7 +17,7 @@ struct pg_obj_loc_t
{
uint64_t role;
osd_num_t osd_num;
bool outdated;
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
};
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -30,6 +30,7 @@ struct pg_osd_set_state_t
pg_osd_set_t osd_set;
uint64_t state = 0;
uint64_t object_count = 0;
uint64_t ref_count = 0;
};
struct pg_list_result_t
@@ -91,6 +92,8 @@ struct pg_t
// target history and all potential peers
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
// next scrub time
uint64_t next_scrub = 0;
bool history_changed = false;
// peer list from the last peering event
std::vector<osd_num_t> cur_peers;
@@ -106,7 +109,8 @@ struct pg_t
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
// which is up to ~192 MB per 1 TB in the worst case scenario
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
uint64_t corrupted_count;
btree::btree_map<object_id, pg_osd_set_state_t*> inconsistent_objects, incomplete_objects, misplaced_objects, degraded_objects;
std::map<obj_piece_id_t, flush_action_t> flush_actions;
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
btree::btree_map<object_id, uint64_t> ver_override;
@@ -116,15 +120,16 @@ struct pg_t
int inflight = 0; // including write_queue
std::multimap<object_id, osd_op_t*> write_queue;
pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
void calc_object_states(int log_level);
void print_state();
};
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
{
return a.outdated < b.outdated ||
a.outdated == b.outdated && a.role < b.role ||
a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
return a.loc_bad < b.loc_bad ||
a.loc_bad == b.loc_bad && a.role < b.role ||
a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num;
}
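The updated comparator can be exercised in isolation. A standalone copy (struct fields reproduced from `pg_obj_loc_t` above, parentheses added for clarity only; the `&&`/`||` precedence is unchanged) showing that healthy chunks (`loc_bad == 0`) sort before outdated or corrupted ones, then by role, then by OSD number:

```cpp
#include <cstdint>
#include <cassert>
#include <vector>
#include <algorithm>

#define LOC_OUTDATED 1

typedef uint64_t osd_num_t;

struct pg_obj_loc_t
{
    uint64_t role;
    osd_num_t osd_num;
    uint32_t loc_bad;
};

// Same ordering as the operator< above: loc_bad first, then role, then OSD
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
{
    return a.loc_bad < b.loc_bad ||
        (a.loc_bad == b.loc_bad && a.role < b.role) ||
        (a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num);
}
```

Because `std::map<pg_osd_set_t, ...>` keys use this ordering element-wise, changing `outdated` to the wider `loc_bad` field keeps `state_dict` lookups consistent.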
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)

@@ -52,7 +52,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
finish_op(cur_op, -EINVAL);
return false;
}
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
// Scrub is similar to r/w, so it's also handled here
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
&& cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
int chain_size = 0;
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
{
@@ -90,6 +92,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
chain_size * (
// - copy of the chain
sizeof(inode_t) +
// - object states for every chain item
sizeof(void*) +
// - bitmap buffers for chained read
stripe_count * clean_entry_bitmap_size +
// - 'missing' flags for chained reads
@@ -117,6 +121,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
{
op_data->read_chain = (inode_t*)data_buf;
data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
op_data->chain_states = (pg_osd_set_state_t**)data_buf;
data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
op_data->snapshot_bitmaps = data_buf;
data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
op_data->missing_flags = (uint8_t*)data_buf;
@@ -131,6 +137,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
inode_it->second.parent_id != cur_op->req.rw.inode)
{
op_data->read_chain[chain_num] = inode_it->second.parent_id;
op_data->chain_states[chain_num] = NULL;
chain_num++;
inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
}
}
@@ -138,12 +145,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
return true;
}
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
{
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
{
*object_state = NULL;
return def;
return pg.cur_set.data();
}
auto st_it = pg.incomplete_objects.find(oid);
if (st_it != pg.incomplete_objects.end())
@@ -164,7 +171,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_
return st_it->second->read_target.data();
}
*object_state = NULL;
return def;
return pg.cur_set.data();
}
void osd_t::continue_primary_read(osd_op_t *cur_op)
@@ -183,6 +190,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
goto resume_1;
else if (op_data->st == 2)
goto resume_2;
resume_0:
cur_op->reply.rw.bitmap_len = 0;
{
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
@@ -206,15 +214,17 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
op_data->prev_set = pg.cur_set.data();
if (pg.state != PG_ACTIVE)
{
// PG may be degraded or have misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
}
// PG may have degraded or misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Fast happy-path
if (op_data->scheme == POOL_SCHEME_REPLICATED &&
op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
{
finish_op(cur_op, -EIO);
return;
}
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
op_data->st = 1;
@@ -240,6 +250,14 @@ resume_1:
resume_2:
if (op_data->errors > 0)
{
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// I/O or checksum error
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
goto resume_0;
}
finish_op(cur_op, op_data->errcode);
return;
}
@@ -278,10 +296,284 @@ resume_2:
finish_op(cur_op, cur_op->req.rw.len);
}
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
{
if (object_state->state & OBJ_INCOMPLETE)
pg_osd_set_state_t *object_state = NULL;
get_object_osd_set(pg, oid, &object_state);
if (prev_object_state != object_state)
{
// Object state changed in between by a parallel I/O operation, skip marking as failed
if (ref)
{
deref_object_state(pg, &prev_object_state, ref);
if (object_state)
object_state->ref_count++;
}
return object_state;
}
pg_osd_set_t corrupted_set;
if (object_state)
{
corrupted_set = object_state->osd_set;
}
else
{
for (int i = 0; i < pg.cur_set.size(); i++)
{
corrupted_set.push_back((pg_obj_loc_t){
.role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
.osd_num = pg.cur_set[i],
});
}
}
// Mark object chunk(s) as corrupted
int changes = 0;
for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
{
auto & chunk = *chunk_it;
if (stripes[chunk.role].osd_num == chunk.osd_num)
{
if (stripes[chunk.role].not_exists)
{
changes++;
chunk_it = corrupted_set.erase(chunk_it);
continue;
}
if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
{
changes++;
chunk.loc_bad = LOC_CORRUPTED;
}
else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
(chunk.loc_bad & LOC_CORRUPTED))
{
changes++;
chunk.loc_bad &= ~LOC_CORRUPTED;
}
}
if (inconsistent && !chunk.loc_bad)
{
changes++;
chunk.loc_bad |= LOC_INCONSISTENT;
}
else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
{
changes++;
chunk.loc_bad &= ~LOC_INCONSISTENT;
}
chunk_it++;
}
if (!changes)
{
// No chunks newly marked as corrupted - object is already marked or moved
return object_state;
}
int old_pg_state = pg.state;
if (object_state)
{
remove_object_from_state(oid, &object_state, pg, false);
deref_object_state(pg, &object_state, ref);
}
// Insert object into the new state and retry
object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
if (ref)
{
object_state->ref_count++;
}
return object_state;
}
pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
uint64_t old_pg_state, int log_at_level)
{
// Object state will be calculated from <osd_set>
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0,
n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0;
for (auto & chunk: osd_set)
{
if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size))
{
n_invalid++;
}
else if (chunk.loc_bad & LOC_OUTDATED)
{
n_outdated++;
}
else
{
if (chunk.loc_bad & LOC_INCONSISTENT)
{
n_inconsistent++;
}
if (chunk.loc_bad & LOC_CORRUPTED)
{
n_corrupted++;
}
else if (pg.scheme == POOL_SCHEME_REPLICATED)
{
n_roles = 1;
int i;
for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {}
if (i == pg.cur_set.size())
{
n_misplaced++;
}
}
else
{
if (!(has_roles & (1 << chunk.role)))
{
n_roles++;
has_roles |= (1 << chunk.role);
}
if (pg.cur_set[chunk.role] != chunk.osd_num)
{
n_misplaced++;
}
}
n_copies++;
}
}
uint64_t obj_state = 0;
int pg_state_bits = 0;
if (n_corrupted > 0)
{
this->corrupted_objects++;
pg.corrupted_count++;
obj_state |= OBJ_CORRUPTED;
pg_state_bits |= PG_HAS_CORRUPTED;
}
if (n_invalid > 0 || n_inconsistent > 0)
{
this->inconsistent_objects++;
obj_state |= OBJ_INCONSISTENT;
pg_state_bits |= PG_HAS_INCONSISTENT;
}
else if (n_roles < pg.pg_data_size)
{
this->incomplete_objects++;
obj_state |= OBJ_INCOMPLETE;
pg_state_bits |= PG_HAS_INCOMPLETE;
}
else if (n_roles < pg.pg_cursize)
{
this->degraded_objects++;
obj_state |= OBJ_DEGRADED;
pg_state_bits |= PG_HAS_DEGRADED;
}
else if (n_misplaced > 0 || n_outdated > 0)
{
this->misplaced_objects++;
obj_state |= OBJ_MISPLACED;
pg_state_bits |= PG_HAS_MISPLACED;
}
if (this->log_level >= log_at_level)
{
printf("Marking object %lx:%lx ", oid.inode, oid.stripe);
for (int i = 0, j = 0; i < object_state_bit_count; i++)
{
if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
{
printf((j++) ? "+%s" : "%s", object_state_names[i]);
}
}
if (pg.scheme == POOL_SCHEME_REPLICATED)
{
printf(": %lu copies available", n_copies);
}
else
{
printf(": %lu parts / %lu copies available", n_roles, n_copies);
}
if (n_invalid > 0)
{
printf(", %lu invalid", n_invalid);
}
if (n_outdated > 0)
{
printf(", %lu outdated", n_outdated);
}
if (n_misplaced > 0)
{
printf(", %lu misplaced", n_misplaced);
}
if (n_corrupted > 0)
{
printf(", %lu corrupted", n_corrupted);
}
if (n_inconsistent > 0)
{
printf(", %lu inconsistent", n_inconsistent);
}
printf("\n");
}
pg.state |= pg_state_bits;
if (pg.state != old_pg_state)
{
report_pg_state(pg);
if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
(old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
{
peering_state = peering_state | OSD_RECOVERING;
if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
{
// Restart recovery from degraded objects
recovery_last_degraded = true;
recovery_last_pg = {};
recovery_last_oid = {};
}
ringloop->wakeup();
}
}
if (!obj_state)
{
// Object is clean
return NULL;
}
// Insert object into the new state and retry
return pg.add_object_to_state(oid, obj_state, osd_set);
}
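The decision chain in `add_object_to_set()` above can be condensed into a pure function for illustration: corruption is an independent flag, while inconsistent/incomplete/degraded/misplaced are mutually exclusive and checked in priority order. The `OBJ_*` bit values below are assumed for this sketch only (the real constants live in the OSD headers and may differ):

```cpp
#include <cstdint>
#include <cassert>

// Bit values assumed for illustration only
#define OBJ_DEGRADED     0x02
#define OBJ_INCOMPLETE   0x04
#define OBJ_MISPLACED    0x08
#define OBJ_CORRUPTED    0x10
#define OBJ_INCONSISTENT 0x20

// Condensed restatement of the state decision chain in add_object_to_set()
static uint64_t classify_object(uint64_t n_roles, uint64_t n_misplaced, uint64_t n_outdated,
    uint64_t n_invalid, uint64_t n_inconsistent, uint64_t n_corrupted,
    uint64_t pg_data_size, uint64_t pg_cursize)
{
    uint64_t st = 0;
    if (n_corrupted > 0)
        st |= OBJ_CORRUPTED;
    if (n_invalid > 0 || n_inconsistent > 0)
        st |= OBJ_INCONSISTENT;
    else if (n_roles < pg_data_size)
        st |= OBJ_INCOMPLETE;
    else if (n_roles < pg_cursize)
        st |= OBJ_DEGRADED;
    else if (n_misplaced > 0 || n_outdated > 0)
        st |= OBJ_MISPLACED;
    return st;
}
```

For example, in an EC 2+1 PG a single readable data part makes the object incomplete, while a corrupted copy of an otherwise degraded object carries both flags at once.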
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
{
if (!*object_state)
{
return;
}
pg_osd_set_state_t *recheck_state = NULL;
get_object_osd_set(pg, oid, &recheck_state);
if (recheck_state != *object_state)
{
if (recheck_state)
recheck_state->ref_count++;
(*object_state)->ref_count--;
*object_state = recheck_state;
return;
}
bool changed = false;
(*object_state)->object_count--;
if ((*object_state)->state & OBJ_CORRUPTED)
{
this->corrupted_objects--;
pg.corrupted_count--;
if (!pg.corrupted_count)
{
pg.state = pg.state & ~PG_HAS_CORRUPTED;
changed = true;
}
}
if ((*object_state)->state & OBJ_INCONSISTENT)
{
this->inconsistent_objects--;
pg.inconsistent_objects.erase(oid);
if (!pg.inconsistent_objects.size())
{
pg.state = pg.state & ~PG_HAS_INCONSISTENT;
changed = true;
}
}
else if ((*object_state)->state & OBJ_INCOMPLETE)
{
// Successful write means that object is not incomplete anymore
this->incomplete_objects--;
@@ -289,41 +581,52 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
if (!pg.incomplete_objects.size())
{
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
report_pg_state(pg);
changed = true;
}
}
else if (object_state->state & OBJ_DEGRADED)
else if ((*object_state)->state & OBJ_DEGRADED)
{
this->degraded_objects--;
pg.degraded_objects.erase(oid);
if (!pg.degraded_objects.size())
{
pg.state = pg.state & ~PG_HAS_DEGRADED;
report_pg_state(pg);
changed = true;
}
}
else if (object_state->state & OBJ_MISPLACED)
else if ((*object_state)->state & OBJ_MISPLACED)
{
this->misplaced_objects--;
pg.misplaced_objects.erase(oid);
if (!pg.misplaced_objects.size())
{
pg.state = pg.state & ~PG_HAS_MISPLACED;
report_pg_state(pg);
changed = true;
}
}
else
{
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
}
if (changed && report)
{
report_pg_state(pg);
}
}
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
{
if (*object_state && !(--(*object_state)->object_count))
if (*object_state)
{
pg.state_dict.erase((*object_state)->osd_set);
*object_state = NULL;
if (deref)
{
(*object_state)->ref_count--;
}
if (!(*object_state)->object_count && !(*object_state)->ref_count)
{
pg.state_dict.erase((*object_state)->osd_set);
*object_state = NULL;
}
}
}
@@ -353,21 +656,28 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
}
resume_1:
// Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
if (op_data->object_state)
{
op_data->object_state->ref_count++;
}
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
op_data->prev_set = NULL;
resume_2:
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
// Check CAS version
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
{
deref_object_state(pg, &op_data->object_state, true);
cur_op->reply.hdr.retval = -EINTR;
cur_op->reply.rw.version = op_data->fact_ver;
goto continue_others;
@@ -383,6 +693,7 @@ resume_4:
resume_5:
if (op_data->errors > 0)
{
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
@@ -395,8 +706,8 @@ resume_5:
}
else
{
-remove_object_from_state(op_data->oid, op_data->object_state, pg);
-free_object_state(pg, &op_data->object_state);
+remove_object_from_state(op_data->oid, &op_data->object_state, pg);
+deref_object_state(pg, &op_data->object_state, true);
}
pg.total_count--;
cur_op->reply.hdr.retval = 0;


@@ -9,6 +9,7 @@
#define SUBMIT_READ 0
#define SUBMIT_RMW_READ 1
#define SUBMIT_WRITE 2
#define SUBMIT_SCRUB_READ 3
struct unstable_osd_num_t
{
@@ -50,6 +51,7 @@ struct osd_primary_op_data_t
// for read_bitmaps
void *snapshot_bitmaps;
inode_t *read_chain;
pg_osd_set_state_t **chain_states;
uint8_t *missing_flags;
int chain_size;
osd_chain_read_t *chain_reads;


@@ -40,10 +40,24 @@ resume_3:
resume_4:
if (op_data->errors > 0)
{
-free(op_data->chain_reads);
-op_data->chain_reads = NULL;
-finish_op(cur_op, op_data->errcode);
-return;
+if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
+{
+// Handle corrupted reads and retry...
+check_corrupted_chained(pg, cur_op);
+free(cur_op->buf);
+cur_op->buf = NULL;
+free(op_data->chain_reads);
+op_data->chain_reads = NULL;
+// FIXME: We can in theory retry only specific parts instead of the whole operation
+goto resume_1;
+}
+else
+{
+free(op_data->chain_reads);
+op_data->chain_reads = NULL;
+finish_op(cur_op, op_data->errcode);
+return;
+}
}
send_chained_read_results(pg, cur_op);
finish_op(cur_op, cur_op->req.rw.len);
@@ -131,8 +145,7 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
auto vo_it = pg.ver_override.find(cur_oid);
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-pg_osd_set_state_t *object_state;
-uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
+uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
if (pg.scheme == POOL_SCHEME_REPLICATED)
{
osd_num_t read_target = 0;
@@ -247,6 +260,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
osd_op_t *subop = op_data->subops+subop_idx;
subop->op_type = OSD_OP_OUT;
// FIXME: Use the pre-allocated buffer
assert(!subop->buf);
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
subop->req = (osd_any_op_t){
.sec_read_bmp = {
@@ -375,6 +389,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
op_data->chain_read_count = chain_reads.size();
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
1, sizeof(osd_chain_read_t) * chain_reads.size()
// FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
// (but it's slightly harder to handle in send_chained_read_results())
+ sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
);
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
@@ -403,8 +419,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
uint64_t *cur_set = pg.cur_set.data();
if (pg.state != PG_ACTIVE)
{
-pg_osd_set_state_t *object_state;
-cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
+cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
if (op_data->scheme != POOL_SCHEME_REPLICATED)
{
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
@@ -416,6 +431,17 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
}
op_data->degraded = 1;
}
else
{
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
{
free(op_data->chain_reads);
op_data->chain_reads = NULL;
finish_op(cur_op, -EIO);
return -1;
}
}
}
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
@@ -433,6 +459,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
}
}
}
assert(!cur_op->buf);
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
void *cur_buf = cur_op->buf;
for (int cri = 0; cri < chain_reads.size(); cri++)
@@ -468,12 +495,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
auto vo_it = pg.ver_override.find(cur_oid);
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-uint64_t *cur_set = pg.cur_set.data();
-if (pg.state != PG_ACTIVE)
-{
-pg_osd_set_state_t *object_state;
-cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
-}
+auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
+uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
int zero_read = -1;
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
@@ -487,6 +510,33 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
return 0;
}
void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
(uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
);
for (int cri = 0; cri < op_data->chain_read_count; cri++)
{
object_id cur_oid = { .inode = op_data->chain_reads[cri].inode, .stripe = op_data->oid.stripe };
osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count;
bool corrupted = false;
for (int i = 0; i < stripe_count; i++)
{
if (stripes[i].read_error)
{
corrupted = true;
break;
}
}
if (corrupted)
{
mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false, false);
}
}
}
void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
{
osd_primary_op_data_t *op_data = cur_op->op_data;


@@ -0,0 +1,128 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <queue>
#include "osd_primary.h"
struct unclean_list_t
{
btree::btree_map<object_id, pg_osd_set_state_t*>::iterator it, end;
uint64_t state_mask, state;
};
struct desc_item_list_t
{
int alloc, size;
osd_reply_describe_item_t *items;
};
static void include_list(std::vector<unclean_list_t> & lists,
btree::btree_map<object_id, pg_osd_set_state_t*> & from,
osd_op_describe_t & desc, uint64_t state_mask, uint64_t state)
{
auto it = desc.min_inode || desc.min_offset ? from.lower_bound((object_id){
.inode = desc.min_inode,
.stripe = desc.min_offset,
}) : from.begin();
auto end_it = desc.max_inode || desc.max_offset ? from.upper_bound((object_id){
.inode = desc.max_inode,
.stripe = desc.max_offset,
}) : from.end();
lists.push_back((unclean_list_t){
.it = it,
.end = end_it,
.state_mask = state_mask,
.state = state,
});
}
struct obj_list_t
{
object_id oid;
int list_id;
};
static inline bool operator < (const obj_list_t & a, const obj_list_t & b)
{
// Inverted comparison: makes std::priority_queue (a max-heap) pop the smallest oid first
return b.oid < a.oid;
}
static void scan_lists(std::vector<unclean_list_t> & lists, uint64_t limit, desc_item_list_t & res)
{
if (limit > 1048576)
{
limit = 1048576;
}
std::priority_queue<obj_list_t> min;
for (int i = 0; i < lists.size(); i++)
{
if (lists[i].it != lists[i].end)
{
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
}
}
while (min.size() && (!limit || res.size < limit))
{
auto i = min.top().list_id;
min.pop();
for (auto & chunk: lists[i].it->second->osd_set)
{
if (res.size >= res.alloc)
{
res.alloc = !res.alloc ? 128 : (res.alloc*2);
res.items = (osd_reply_describe_item_t*)realloc_or_die(res.items, res.alloc * sizeof(osd_reply_describe_item_t));
}
res.items[res.size++] = (osd_reply_describe_item_t){
.inode = lists[i].it->first.inode,
.stripe = lists[i].it->first.stripe,
.role = (uint32_t)chunk.role,
.loc_bad = chunk.loc_bad,
.osd_num = chunk.osd_num,
};
}
lists[i].it++;
if (lists[i].it != lists[i].end)
{
min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
}
}
}
// Describe unclean objects
void osd_t::continue_primary_describe(osd_op_t *cur_op)
{
auto & desc = cur_op->req.describe;
if (!desc.object_state)
desc.object_state = ~desc.object_state;
std::vector<unclean_list_t> lists;
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
{
auto & pg = pg_it->second;
if (desc.object_state & OBJ_INCONSISTENT)
include_list(lists, pg.inconsistent_objects, desc, 0, 0);
if (desc.object_state & OBJ_CORRUPTED)
{
if (!(desc.object_state & OBJ_INCOMPLETE))
include_list(lists, pg.incomplete_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
if (!(desc.object_state & OBJ_DEGRADED))
include_list(lists, pg.degraded_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
if (!(desc.object_state & OBJ_MISPLACED))
include_list(lists, pg.misplaced_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
}
uint64_t skip_corrupted = !(desc.object_state & OBJ_CORRUPTED) ? OBJ_CORRUPTED : 0;
if (desc.object_state & OBJ_INCOMPLETE)
include_list(lists, pg.incomplete_objects, desc, skip_corrupted, 0);
if (desc.object_state & OBJ_DEGRADED)
include_list(lists, pg.degraded_objects, desc, skip_corrupted, 0);
if (desc.object_state & OBJ_MISPLACED)
include_list(lists, pg.misplaced_objects, desc, skip_corrupted, 0);
}
desc_item_list_t res = {};
scan_lists(lists, desc.limit, res);
assert(!cur_op->buf);
cur_op->buf = res.items;
cur_op->reply.describe.result_bytes = res.size * sizeof(osd_reply_describe_item_t);
if (res.items)
cur_op->iov.push_back(res.items, res.size * sizeof(osd_reply_describe_item_t));
finish_op(cur_op, res.size);
}


@@ -9,6 +9,7 @@ void osd_t::autosync()
{
autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN;
autosync_op->peer_fd = -1;
autosync_op->req = (osd_any_op_t){
.sync = {
.header = {
@@ -80,7 +81,11 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
free(cur_op->op_data);
cur_op->op_data = NULL;
}
-if (!cur_op->peer_fd)
+cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+cur_op->reply.hdr.id = cur_op->req.hdr.id;
+cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
+cur_op->reply.hdr.retval = retval;
+if (cur_op->peer_fd == -1)
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
@@ -91,10 +96,6 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
auto cl_it = msgr.clients.find(cur_op->peer_fd);
if (cl_it != msgr.clients.end())
{
-cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-cur_op->reply.hdr.id = cur_op->req.hdr.id;
-cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-cur_op->reply.hdr.retval = retval;
msgr.outbox_push(cur_op);
}
else
@@ -142,43 +143,50 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
for (int role = 0; role < op_data->pg_size; role++)
{
// We always submit zero-length writes to all replicas, even if the stripe is not modified
-if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
+if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
{
continue;
}
osd_num_t role_osd_num = osd_set[role];
+int stripe_num = rep ? 0 : role;
+osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
if (role_osd_num != 0)
{
-int stripe_num = rep ? 0 : role;
osd_op_t *subop = op_data->subops + i;
uint32_t subop_len = wr
-? stripes[stripe_num].write_end - stripes[stripe_num].write_start
-: stripes[stripe_num].read_end - stripes[stripe_num].read_start;
-if (!wr && stripes[stripe_num].read_end == UINT32_MAX)
+? si->write_end - si->write_start
+: si->read_end - si->read_start;
+if (!wr && si->read_end == UINT32_MAX)
{
subop_len = 0;
}
+si->osd_num = role_osd_num;
+si->read_error = false;
+subop->bitmap = si->bmp_buf;
+subop->bitmap_len = clean_entry_bitmap_size;
+// Using rmw_buf to pass pointer to stripes. Dirty but should work
+subop->rmw_buf = si;
if (role_osd_num == this->osd_num)
{
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
subop->op_type = (uint64_t)cur_op;
-subop->bitmap = stripes[stripe_num].bmp_buf;
-subop->bitmap_len = clean_entry_bitmap_size;
-subop->bs_op = new blockstore_op_t({
+subop->bs_op = new blockstore_op_t((blockstore_op_t){
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
.callback = [subop, this](blockstore_op_t *bs_subop)
{
handle_primary_bs_subop(subop);
},
-.oid = {
-.inode = inode,
-.stripe = op_data->oid.stripe | stripe_num,
-},
-.version = op_version,
-.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
-.len = subop_len,
-.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
-.bitmap = stripes[stripe_num].bmp_buf,
+{ {
+.oid = (object_id){
+.inode = inode,
+.stripe = op_data->oid.stripe | stripe_num,
+},
+.version = op_version,
+.offset = wr ? si->write_start : si->read_start,
+.len = subop_len,
+} },
+.buf = wr ? si->write_buf : si->read_buf,
+.bitmap = si->bmp_buf,
});
#ifdef OSD_DEBUG
printf(
@@ -192,8 +200,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
else
{
subop->op_type = OSD_OP_OUT;
-subop->bitmap = stripes[stripe_num].bmp_buf;
-subop->bitmap_len = clean_entry_bitmap_size;
subop->req.sec_rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
@@ -205,7 +211,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
.stripe = op_data->oid.stripe | stripe_num,
},
.version = op_version,
-.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
+.offset = wr ? si->write_start : si->read_start,
.len = subop_len,
.attr_len = wr ? clean_entry_bitmap_size : 0,
};
@@ -218,16 +224,16 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
#endif
if (wr)
{
-if (stripes[stripe_num].write_end > stripes[stripe_num].write_start)
+if (si->write_end > si->write_start)
{
-subop->iov.push_back(stripes[stripe_num].write_buf, stripes[stripe_num].write_end - stripes[stripe_num].write_start);
+subop->iov.push_back(si->write_buf, si->write_end - si->write_start);
}
}
else
{
if (subop_len > 0)
{
-subop->iov.push_back(stripes[stripe_num].read_buf, subop_len);
+subop->iov.push_back(si->read_buf, subop_len);
}
}
subop->callback = [cur_op, this](osd_op_t *subop)
@@ -250,6 +256,10 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
}
i++;
}
else
{
si->osd_num = 0;
}
}
return i-subop_idx;
}
@@ -334,14 +344,45 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
else
expected = 0;
osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval == -ENOENT && opcode == OSD_OP_SEC_READ)
{
// ENOENT is not an error for almost all reads, except scrub
retval = expected;
memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
}
if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
{
uint64_t version = subop->reply.sec_rw.version;
#ifdef OSD_DEBUG
uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
? msgr.clients[subop->peer_fd]->osd_num : osd_num;
printf("subop %s %lx:%lx from osd %lu: version = %lu\n", osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, peer_osd, version);
#endif
if (op_data->fact_ver != UINT64_MAX)
{
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
{
fprintf(
stderr, "different fact_versions returned from %s subops: %lu vs %lu\n",
osd_op_names[opcode], version, op_data->fact_ver
);
retval = -ERANGE;
}
else
op_data->fact_ver = version;
}
}
if (retval != expected)
{
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
{
printf(
-"%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
+subop->peer_fd >= 0
+? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
+: "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
-subop->peer_fd, retval, expected
+retval, expected, subop->peer_fd
);
}
else
@@ -351,43 +392,33 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
osd_op_names[opcode], subop->peer_fd, retval, expected
);
}
-// Error priority: EIO > ENOSPC > EPIPE
-if (op_data->errcode == 0 || retval == -EIO ||
-retval == -ENOSPC && op_data->errcode == -EPIPE)
+if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
+{
+// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
+((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
+}
+subop->rmw_buf = NULL;
+// Error priority: ENOSPC and others > EIO > EDOM > EPIPE
+if (op_data->errcode == 0 ||
+retval == -EIO && (op_data->errcode == -EDOM || op_data->errcode == -EPIPE) ||
+retval == -EDOM && (op_data->errcode == -EPIPE) ||
+retval != -EIO && retval != -EDOM && retval != -EPIPE)
{
op_data->errcode = retval;
}
op_data->errors++;
-if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
-retval != -ENOSPC))
+if (subop->peer_fd >= 0 && retval != -EDOM && retval != -ERANGE &&
+(retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
+(retval != -EIO || opcode != OSD_OP_SEC_READ))
{
-// Drop connection on any error expect ENOSPC
+// Drop connection on unexpected errors
msgr.stop_client(subop->peer_fd);
}
}
else
{
+subop->rmw_buf = NULL;
op_data->done++;
-if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
-{
-uint64_t version = subop->reply.sec_rw.version;
-#ifdef OSD_DEBUG
-uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
-? msgr.clients[subop->peer_fd]->osd_num : osd_num;
-printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
-#endif
-if (op_data->fact_ver != UINT64_MAX)
-{
-if (op_data->fact_ver != 0 && op_data->fact_ver != version)
-{
-throw std::runtime_error(
-"different fact_versions returned from "+std::string(osd_op_names[opcode])+
-" subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
-);
-}
-op_data->fact_ver = version;
-}
-}
}
if ((op_data->errors + op_data->done) >= op_data->n_subops)
{
@@ -410,6 +441,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
{
continue_primary_del(cur_op);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
{
continue_primary_scrub(cur_op);
}
else
{
throw std::runtime_error("BUG: unknown opcode");
@@ -498,8 +533,10 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
{
handle_primary_bs_subop(subop);
},
.oid = chunk.oid,
.version = chunk.version,
{ {
.oid = chunk.oid,
.version = chunk.version,
} },
});
bs->enqueue_op(subops[i].bs_op);
}
@@ -613,7 +650,9 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
{
handle_primary_bs_subop(subop);
},
.len = (uint32_t)stab_osd.len,
{
.len = (uint32_t)stab_osd.len,
},
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
});
bs->enqueue_op(subops[i].bs_op);


@@ -58,7 +58,13 @@ resume_1:
// Determine blocks to read and write
// Missing chunks are allowed to be overwritten even in incomplete objects
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
-op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
if (op_data->object_state)
{
// Protect object_state from being freed by a parallel read operation changing it
op_data->object_state->ref_count++;
}
retry_1:
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Simplified algorithm
@@ -68,6 +74,12 @@ resume_1:
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size))
{
if (op_data->object_state->state & OBJ_INCOMPLETE)
{
// Refuse partial overwrite of an incomplete (corrupted) object
cur_op->reply.hdr.retval = -EIO;
goto continue_others;
}
// Object is degraded/misplaced and will be moved to <write_osd_set>
op_data->stripes[0].read_start = 0;
op_data->stripes[0].read_end = bs_block_size;
@@ -81,24 +93,66 @@ resume_1:
if (!cur_op->rmw_buf)
{
// Refuse partial overwrite of an incomplete object
-cur_op->reply.hdr.retval = -EINVAL;
+cur_op->reply.hdr.retval = -EIO;
goto continue_others;
}
}
// Read required blocks
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
{
if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
{
// Allow to read version number (just version number!) from corrupted chunks
// to allow full overwrite of a corrupted object
bool found = false;
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
{
found = true;
break;
}
}
if (!found)
{
osd_num_t corrupted_target[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
{
corrupted_target[role] = 0;
}
for (auto & loc: op_data->object_state->osd_set)
{
if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
{
corrupted_target[loc.role] = loc.osd_num;
}
}
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
goto resume_2;
}
}
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
}
resume_2:
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// Mark object corrupted and retry
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
goto retry_1;
}
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
// Check CAS version
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
{
deref_object_state(pg, &op_data->object_state, true);
cur_op->reply.hdr.retval = -EINTR;
cur_op->reply.rw.version = op_data->fact_ver;
goto continue_others;
@@ -182,6 +236,7 @@ resume_10:
// Recheck PG state after reporting history - maybe it's already stopping/restarting
if (pg.state & (PG_STOPPING|PG_REPEERING))
{
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
return;
}
@@ -197,6 +252,12 @@ resume_5:
}
if (op_data->errors > 0)
{
// FIXME: Handle ENOSPC. If one of the subops fail with ENOSPC here,
// next writes to the same object will also fail because they'll try
// to overwrite the same version number which will result in EEXIST.
// To fix it, we should mark the object as degraded for replicas,
// and rollback successful part updates in case of EC.
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
@@ -205,7 +266,7 @@ resume_5:
// We must forget the unclean state of the object before deleting it
// so the next reads don't accidentally read a deleted version
// And it should be done at the same time as the removal of the version override
-remove_object_from_state(op_data->oid, op_data->object_state, pg);
+remove_object_from_state(op_data->oid, &op_data->object_state, pg);
pg.clean_count++;
}
resume_6:
@@ -260,12 +321,12 @@ resume_7:
copies_to_delete_after_sync_count++;
}
}
-free_object_state(pg, &op_data->object_state);
+deref_object_state(pg, &op_data->object_state, true);
}
else
{
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
-free_object_state(pg, &op_data->object_state);
+deref_object_state(pg, &op_data->object_state, true);
if (op_data->n_subops > 0)
{
resume_8:


@@ -1084,3 +1084,180 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
}
calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
}
// Generate subsets of k items each in {0..n-1}
static bool first_combination(int *subset, int k, int n)
{
if (k > n)
return false;
for (int i = 0; i < k; i++)
subset[i] = i;
return true;
}
static bool next_combination(int *subset, int k, int n)
{
int pos = k-1;
while (true)
{
subset[pos]++;
if (subset[pos] >= n-(k-1-pos))
{
if (pos == 0)
return false;
pos--;
}
else
break;
}
for (pos++; pos < k; pos++)
{
subset[pos] = subset[pos-1]+1;
}
return true;
}
static int c_n_k(int n, int k)
{
int c = 1;
for (int i = n; i > k; i--)
c *= i;
for (int i = 2; i <= (n-k); i++)
c /= i;
return c;
}
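first_combination()/next_combination() above enumerate all C(n, k) k-element subsets of {0..n-1} in lexicographic order, and c_n_k() computes the count that ec_find_good() compares against max_bruteforce. A self-contained sketch of the same enumeration scheme (helper names are illustrative), verifying that the walk visits exactly C(n, k) subsets:

```cpp
#include <cassert>
#include <vector>

// Lexicographic k-subset enumeration over {0..n-1}, same scheme as
// first_combination()/next_combination() in osd_rmw.cpp
static bool first_comb(std::vector<int> & s, int k, int n)
{
    if (k > n)
        return false;
    s.resize(k);
    for (int i = 0; i < k; i++)
        s[i] = i; // start with the smallest subset {0, 1, ..., k-1}
    return true;
}

static bool next_comb(std::vector<int> & s, int k, int n)
{
    int pos = k-1;
    while (true)
    {
        s[pos]++;
        // Position pos may hold at most n-(k-1-pos)-1, leaving room
        // for the k-1-pos positions after it
        if (s[pos] >= n-(k-1-pos))
        {
            if (pos == 0)
                return false; // wrapped past the last subset
            pos--;
        }
        else
            break;
    }
    // Reset the tail to the smallest values above s[pos]
    for (pos++; pos < k; pos++)
        s[pos] = s[pos-1]+1;
    return true;
}

// Count subsets by walking the full enumeration; should equal C(n, k)
static int count_combinations(int k, int n)
{
    std::vector<int> s;
    if (!first_comb(s, k, n))
        return 0;
    int cnt = 1;
    while (next_comb(s, k, n))
        cnt++;
    return cnt;
}
```

This is why the brute force is gated on c_n_k(live_count-1, out_count-1) <= max_bruteforce: the loop body runs once per enumerated subset.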
std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce)
{
std::vector<int> found_valid;
int cur_live[pg_size], live_count = 0, exists_count = 0;
osd_num_t fake_osd_set[pg_size];
for (int role = 0; role < pg_size; role++)
{
if (!stripes[role].missing)
{
if (!stripes[role].not_exists)
exists_count++;
cur_live[live_count++] = role;
fake_osd_set[role] = role+1;
}
}
if (live_count <= pg_minsize)
{
return std::vector<int>();
}
if (exists_count <= pg_minsize)
{
// Special case: user manually deleted some chunks
for (int role = 0; role < pg_size; role++)
if (!stripes[role].missing && !stripes[role].not_exists)
found_valid.push_back(role);
return found_valid;
}
// Try to locate errors using brute force if there aren't too many combinations
osd_rmw_stripe_t brute_stripes[pg_size];
int out_count = live_count-pg_minsize;
bool brute_force = out_count > 1 && c_n_k(live_count-1, out_count-1) <= max_bruteforce;
int subset[pg_minsize], outset[out_count];
// Select all combinations with items except the last one (== anything to compare)
first_combination(subset, pg_minsize, live_count-1);
uint8_t *tmp_buf = (uint8_t*)malloc_or_die(pg_size*chunk_size);
do
{
memcpy(brute_stripes, stripes, sizeof(osd_rmw_stripe_t)*pg_size);
int i = 0, j = 0, k = 0;
for (; i < pg_minsize; i++, j++)
while (j < subset[i])
outset[k++] = j++;
while (j < pg_size)
outset[k++] = j++;
for (int i = 0; i < out_count; i++)
{
brute_stripes[cur_live[outset[i]]].missing = true;
brute_stripes[cur_live[outset[i]]].read_buf = tmp_buf+cur_live[outset[i]]*chunk_size;
}
for (int i = 0; i < pg_minsize; i++)
{
brute_stripes[i].write_buf = brute_stripes[i].read_buf;
brute_stripes[i].req_start = 0;
brute_stripes[i].req_end = chunk_size;
}
for (int i = pg_minsize; i < pg_size; i++)
{
brute_stripes[i].write_buf = tmp_buf+i*chunk_size;
}
if (is_xor)
{
assert(pg_size == pg_minsize+1);
reconstruct_stripes_xor(brute_stripes, pg_size, bitmap_size);
}
else
{
reconstruct_stripes_ec(brute_stripes, pg_size, pg_minsize, bitmap_size);
calc_rmw_parity_ec(brute_stripes, pg_size, pg_minsize, fake_osd_set, fake_osd_set, chunk_size, bitmap_size);
}
for (int i = pg_minsize; i < pg_size; i++)
{
brute_stripes[i].read_buf = brute_stripes[i].write_buf;
}
int valid_count = 0;
for (int i = 0; i < out_count; i++)
{
if (memcmp(brute_stripes[cur_live[outset[i]]].read_buf,
stripes[cur_live[outset[i]]].read_buf, chunk_size) == 0)
{
brute_stripes[cur_live[outset[i]]].missing = false;
valid_count++;
}
}
if (valid_count > 0)
{
if (found_valid.size())
{
// Check if we found the same set from the different point of view,
// like 1 2 3 -> valid 4 5 and 1 3 4 -> valid 2 5
for (int i = 0, j = 0; i < pg_size; i++)
{
if (!brute_stripes[i].missing)
{
if (j >= found_valid.size() || found_valid[j] != i)
{
// Ambiguity: we found multiple valid sets and don't know which one is correct
found_valid.clear();
break;
}
j++;
}
}
if (!found_valid.size())
{
break;
}
}
else
{
for (int i = 0; i < pg_size; i++)
{
if (!brute_stripes[i].missing)
{
found_valid.push_back(i);
}
}
}
if (valid_count == out_count)
{
// All chunks are good
break;
}
}
if (!brute_force)
{
// Do not attempt brute force if there are too many combinations because even
// if we find it we won't be able to check that it's the only good one
break;
}
} while (out_count > 1 && next_combination(subset, pg_minsize, live_count-1));
free(tmp_buf);
return found_valid;
}


@@ -4,6 +4,7 @@
#pragma once
#include <stdint.h>
#include <vector>
#include "object_id.h"
#include "osd_id.h"
@@ -26,7 +27,10 @@ struct osd_rmw_stripe_t
// read_end=UINT32_MAX means to only read bitmap, but not data
uint32_t read_start, read_end;
uint32_t write_start, write_end;
-bool missing;
+osd_num_t osd_num;
+bool missing: 1;
+bool read_error: 1;
+bool not_exists: 1;
};
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
@@ -52,3 +56,6 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi
void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size);
std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce);


@@ -28,6 +28,7 @@ void test14();
void test15(bool second);
void test16();
void test_recover_22_d2();
void test_ec43_error_bruteforce();
int main(int narg, char *args[])
{
@@ -64,6 +65,8 @@ int main(int narg, char *args[])
test16();
// Test 17
test_recover_22_d2();
// Error bruteforce
test_ec43_error_bruteforce();
// End
printf("all ok\n");
return 0;
@@ -1106,3 +1109,72 @@ void test_recover_22_d2()
// Done
use_ec(4, 2, false);
}
/***
EC 4+2 error location bruteforce
***/
static void assert_eq_vec(const std::vector<int> & b, const std::vector<int> & a)
{
printf("Expect [");
for (int i = 0; i < a.size(); i++)
printf(" %d", a[i]);
printf(" ] have [");
for (int i = 0; i < b.size(); i++)
printf(" %d", b[i]);
printf(" ]\n");
assert(a == b);
}
void test_ec43_error_bruteforce()
{
use_ec(7, 4, true);
osd_num_t osd_set[7] = { 1, 2, 3, 4, 5, 6, 7 };
osd_rmw_stripe_t stripes[7] = {};
split_stripes(4, 4096, 0, 4096 * 4, stripes);
uint8_t *write_buf = (uint8_t*)malloc_or_die(4096 * 7);
set_pattern(write_buf+0*4096, 4096, PATTERN0);
set_pattern(write_buf+1*4096, 4096, PATTERN1);
set_pattern(write_buf+2*4096, 4096, PATTERN2);
set_pattern(write_buf+3*4096, 4096, PATTERN3);
uint8_t *rmw_buf = (uint8_t*)calc_rmw(write_buf, stripes, osd_set, 7, 4, 7, osd_set, 4096, 0);
calc_rmw_parity_ec(stripes, 7, 4, osd_set, osd_set, 4096, 0);
check_pattern(stripes[4].write_buf, 4096, PATTERN0^PATTERN1^PATTERN2^PATTERN3);
check_pattern(stripes[5].write_buf, 4096, 0xfcee568ba36371ac); // 2nd EC chunk
check_pattern(stripes[6].write_buf, 4096, 0x139274739ae6f387); // 3rd EC chunk
memcpy(write_buf+4*4096, stripes[4].write_buf, 4096);
memcpy(write_buf+5*4096, stripes[5].write_buf, 4096);
memcpy(write_buf+6*4096, stripes[6].write_buf, 4096);
// Try to locate errors
for (int i = 0; i < 7; i++)
{
stripes[i].read_start = 0;
stripes[i].read_end = 4096;
stripes[i].read_buf = write_buf+i*4096;
stripes[i].write_buf = NULL;
}
// All good chunks
auto res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
assert_eq_vec(res, std::vector<int>({0, 1, 2, 3, 4, 5, 6}));
// 1 missing chunk
set_pattern(write_buf+1*4096, 4096, 0);
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 5, 6}));
// 2 missing chunks
set_pattern(write_buf+1*4096, 4096, 0);
set_pattern(write_buf+5*4096, 4096, 0);
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 6}));
// 3 missing chunks
set_pattern(write_buf+1*4096, 4096, 0);
set_pattern(write_buf+5*4096, 4096, 0);
set_pattern(write_buf+6*4096, 4096, 0);
res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
assert_eq_vec(res, std::vector<int>());
// Done
free(rmw_buf);
free(write_buf);
use_ec(7, 4, false);
}
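The test above relies on the defining property of the first EC chunk: it is the XOR of all data chunks (check_pattern(stripes[4].write_buf, ..., PATTERN0^PATTERN1^PATTERN2^PATTERN3)), so any single lost chunk can be rebuilt by XOR-ing the parity with the surviving chunks. A minimal demonstration of that identity (function name is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// XOR parity over n data words: p = d0 ^ d1 ^ ... ^ d(n-1).
// Since x ^ x == 0, XOR-ing p with all chunks except one yields the
// missing one, which is exactly how a single lost chunk is recovered.
static uint64_t xor_parity(const uint64_t *chunks, int n)
{
    uint64_t p = 0;
    for (int i = 0; i < n; i++)
        p ^= chunks[i];
    return p;
}
```

The higher EC chunks (the 0xfcee... and 0x1392... patterns in the test) come from Reed-Solomon coefficients instead, which is what allows recovering more than one lost chunk.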

src/osd_scrub.cpp (new file)

@@ -0,0 +1,623 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "osd_primary.h"
#define SELF_FD -1
void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
{
pool_id_t pool_id = pg_id.pool_id;
pg_num_t pg_num = pg_id.pg_num;
assert(!scrub_list_op);
if (role_osd == this->osd_num)
{
// Self
osd_op_t *op = new osd_op_t();
op->op_type = 0;
op->peer_fd = SELF_FD;
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST;
op->bs_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
if (min_oid.inode != 0 || min_oid.stripe != 0)
op->bs_op->min_oid = min_oid;
else
{
op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
op->bs_op->min_oid.stripe = 0;
}
op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
op->bs_op->max_oid.stripe = UINT64_MAX;
op->bs_op->list_stable_limit = scrub_list_limit;
op->bs_op->pg_count = pg_counts[pool_id];
op->bs_op->pg_number = pg_num-1;
op->bs_op->callback = [this, op](blockstore_op_t *bs_op)
{
scrub_list_op = NULL;
if (op->bs_op->retval < 0)
{
printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
force_stop(1);
return;
}
add_bs_subop_stats(op);
scrub_cur_list = {
.buf = (obj_ver_id*)op->bs_op->buf,
.total_count = (uint64_t)op->bs_op->retval,
.stable_count = op->bs_op->version,
};
delete op->bs_op;
op->bs_op = NULL;
delete op;
continue_scrub();
};
scrub_list_op = op;
bs->enqueue_op(op->bs_op);
}
else
{
// Peer
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = msgr.osd_peer_fds.at(role_osd);
op->req = (osd_any_op_t){
.sec_list = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = msgr.next_subop_id++,
.opcode = OSD_OP_SEC_LIST,
},
.list_pg = pg_num,
.pg_count = pg_counts[pool_id],
.pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
.min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
.max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
.min_stripe = min_oid.stripe,
.stable_limit = scrub_list_limit,
},
};
op->callback = [this, role_osd](osd_op_t *op)
{
scrub_list_op = NULL;
if (op->reply.hdr.retval < 0)
{
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
int fail_fd = op->peer_fd;
delete op;
msgr.stop_client(fail_fd);
return;
}
scrub_cur_list = {
.buf = (obj_ver_id*)op->buf,
.total_count = (uint64_t)op->reply.hdr.retval,
.stable_count = op->reply.sec_list.stable_count,
};
// set op->buf to NULL so it doesn't get freed
op->buf = NULL;
delete op;
continue_scrub();
};
scrub_list_op = op;
msgr.outbox_push(op);
}
}
int osd_t::pick_next_scrub(object_id & next_oid)
{
if (!pgs.size())
{
if (scrub_cur_list.buf)
{
free(scrub_cur_list.buf);
scrub_cur_list = {};
scrub_last_pg = {};
}
return 0;
}
timespec tv_now;
clock_gettime(CLOCK_REALTIME, &tv_now);
bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
// Restart scanning from the same PG as the last time
auto pg_it = pgs.lower_bound(scrub_last_pg);
if (pg_it == pgs.end() && rescan)
{
pg_it = pgs.begin();
rescan = false;
}
while (pg_it != pgs.end())
{
if ((pg_it->second.state & PG_ACTIVE) && pg_it->second.next_scrub && pg_it->second.next_scrub < tv_now.tv_sec)
{
// Continue scrubbing from the next object
if (scrub_last_pg == pg_it->first)
{
while (scrub_list_pos < scrub_cur_list.total_count)
{
auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
oid.stripe &= ~STRIPE_MASK;
scrub_list_pos++;
if (recovery_ops.find(oid) == recovery_ops.end() &&
scrub_ops.find(oid) == scrub_ops.end() &&
pg_it->second.write_queue.find(oid) == pg_it->second.write_queue.end())
{
next_oid = oid;
if (!(pg_it->second.state & PG_SCRUBBING))
{
// Mark the PG as being scrubbed
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
report_pg_state(pg_it->second);
}
return 2;
}
}
}
if (scrub_last_pg == pg_it->first &&
scrub_list_pos >= scrub_cur_list.total_count &&
scrub_cur_list.stable_count < scrub_list_limit)
{
// End of the list, mark this PG as scrubbed and go to the next PG
}
else
{
// Continue listing
object_id scrub_last_oid = {};
if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
{
scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
scrub_last_oid.stripe++;
}
osd_num_t scrub_osd = 0;
for (osd_num_t pg_osd: pg_it->second.cur_set)
{
if (pg_osd == this->osd_num || scrub_osd == 0)
scrub_osd = pg_osd;
}
if (!(pg_it->second.state & PG_SCRUBBING))
{
// Mark the PG as being scrubbed
pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
report_pg_state(pg_it->second);
}
if (scrub_cur_list.buf)
{
free(scrub_cur_list.buf);
scrub_cur_list = {};
scrub_list_pos = 0;
}
scrub_last_pg = pg_it->first;
scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
return 1;
}
if (pg_it->second.state & PG_SCRUBBING)
{
scrub_last_pg = {};
pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
pg_it->second.next_scrub = 0;
pg_it->second.history_changed = true;
report_pg_state(pg_it->second);
}
// The list is definitely not needed anymore
if (scrub_cur_list.buf)
{
free(scrub_cur_list.buf);
scrub_cur_list = {};
}
}
pg_it++;
if (pg_it == pgs.end() && rescan)
{
// Scan one more time to guarantee that there are no PGs to scrub
pg_it = pgs.begin();
rescan = false;
}
}
// Scanned all PGs - no more scrubs to do
return 0;
}
void osd_t::submit_scrub_op(object_id oid)
{
auto osd_op = new osd_op_t();
osd_op->op_type = OSD_OP_OUT;
osd_op->peer_fd = -1;
osd_op->req = (osd_any_op_t){
.rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = 1,
.opcode = OSD_OP_SCRUB,
},
.inode = oid.inode,
.offset = oid.stripe,
.len = 0,
},
};
if (log_level > 2)
{
printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
}
osd_op->callback = [this](osd_op_t *osd_op)
{
object_id oid = { .inode = osd_op->req.rw.inode, .stripe = osd_op->req.rw.offset };
if (osd_op->reply.hdr.retval < 0 && osd_op->reply.hdr.retval != -ENOENT)
{
// Scrub error
printf(
"Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
oid.inode, oid.stripe, INODE_POOL(oid.inode),
map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
osd_op->reply.hdr.retval
);
}
else if (log_level > 2)
{
printf("Scrubbed %lx:%lx\n", oid.inode, oid.stripe);
}
delete osd_op;
if (scrub_sleep_ms)
{
this->tfd->set_timer(scrub_sleep_ms, false, [this, oid](int timer_id)
{
scrub_ops.erase(oid);
continue_scrub();
});
}
else
{
scrub_ops.erase(oid);
continue_scrub();
}
};
scrub_ops[oid] = osd_op;
exec_op(osd_op);
}
// Triggers scrub requests
// Scrub reads data from all replicas and compares it
// To scrub first we need to read objects listings
bool osd_t::continue_scrub()
{
if (scrub_list_op)
{
return true;
}
if (no_scrub)
{
// Return false = no more scrub work to do
scrub_cur_list = {};
scrub_last_pg = {};
scrub_nearest_ts = 0;
if (scrub_timer_id >= 0)
{
tfd->clear_timer(scrub_timer_id);
scrub_timer_id = -1;
}
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
{
if (pg_it->second.state & PG_SCRUBBING)
{
pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
report_pg_state(pg_it->second);
}
}
return false;
}
while (scrub_ops.size() < scrub_queue_depth)
{
object_id oid;
int r = pick_next_scrub(oid);
if (r == 2)
submit_scrub_op(oid);
else
return r;
}
return true;
}
void osd_t::plan_scrub(pg_t & pg, bool report_state)
{
if ((pg.state & PG_ACTIVE) && !pg.next_scrub && auto_scrub)
{
timespec tv_now;
clock_gettime(CLOCK_REALTIME, &tv_now);
auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
if (pg.next_scrub != tv_now.tv_sec + interval)
{
pool_cfg.pg_config[pg.pg_num].next_scrub = pg.next_scrub = tv_now.tv_sec + interval;
pg.history_changed = true;
if (report_state)
report_pg_state(pg);
}
schedule_scrub(pg);
}
}
void osd_t::schedule_scrub(pg_t & pg)
{
if (!no_scrub && pg.next_scrub && (!scrub_nearest_ts || scrub_nearest_ts > pg.next_scrub))
{
scrub_nearest_ts = pg.next_scrub;
timespec tv_now;
clock_gettime(CLOCK_REALTIME, &tv_now);
if (scrub_timer_id >= 0)
{
tfd->clear_timer(scrub_timer_id);
scrub_timer_id = -1;
}
if (tv_now.tv_sec > scrub_nearest_ts)
{
scrub_nearest_ts = 0;
peering_state = peering_state | OSD_SCRUBBING;
ringloop->wakeup();
}
else
{
scrub_timer_id = tfd->set_timer((scrub_nearest_ts-tv_now.tv_sec)*1000, false, [this](int timer_id)
{
scrub_timer_id = -1;
scrub_nearest_ts = 0;
peering_state = peering_state | OSD_SCRUBBING;
ringloop->wakeup();
});
}
}
}
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
if (!cur_op->op_data && !prepare_primary_rw(cur_op))
return;
osd_primary_op_data_t *op_data = cur_op->op_data;
if (op_data->st == 1)
goto resume_1;
else if (op_data->st == 2)
goto resume_2;
{
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// Determine version
auto vo_it = pg.ver_override.find(op_data->oid);
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
// PG may have degraded or misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
// Read all available chunks
int n_copies = 0;
op_data->degraded = false;
for (int role = 0; role < op_data->pg_size; role++)
{
op_data->stripes[role].write_buf = NULL;
op_data->stripes[role].read_start = 0;
op_data->stripes[role].read_end = bs_block_size;
if (op_data->prev_set[role] != 0)
{
n_copies++;
}
else
{
op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
}
if (n_copies <= op_data->pg_data_size)
{
// Nothing to compare, even if we'd like to
finish_op(cur_op, 0);
return;
}
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
// Submit reads
osd_op_t *subops = new osd_op_t[n_copies];
op_data->fact_ver = 0;
op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_copies;
op_data->subops = subops;
int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
op_data->stripes, op_data->prev_set, cur_op, 0, -1);
assert(sent == n_copies);
op_data->st = 1;
}
resume_1:
return;
resume_2:
if (op_data->errors > 0)
{
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
{
// I/O or checksum error
int n_copies = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].read_error)
{
op_data->stripes[role].missing = true;
if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
{
op_data->degraded = true;
}
}
else if (!op_data->stripes[role].missing)
{
n_copies++;
}
}
if (n_copies <= op_data->pg_data_size)
{
// Nothing to compare, just mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
// Operation is treated as unsuccessful only if the object becomes unreadable
finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
return;
}
// Proceed, we can still compare chunks that were successfully read
}
else
{
finish_op(cur_op, op_data->errcode);
return;
}
}
bool inconsistent = false;
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Check that all chunks have returned the same data
int total = 0;
int eq_to[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
{
eq_to[role] = -1;
if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
!op_data->stripes[role].not_exists)
{
total++;
eq_to[role] = role;
for (int other = 0; other < role; other++)
{
// Only compare with unique chunks (eq_to[other] == other)
if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
{
eq_to[role] = eq_to[other];
break;
}
}
}
}
int votes[op_data->pg_size];
for (int role = 0; role < op_data->pg_size; role++)
votes[role] = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (eq_to[role] != -1)
votes[eq_to[role]]++;
}
int best = -1;
for (int role = 0; role < op_data->pg_size; role++)
{
if (votes[role] > (best >= 0 ? votes[best] : 0))
best = role;
}
if (best >= 0 && votes[best] < total)
{
bool unknown = false;
for (int role = 0; role < op_data->pg_size; role++)
{
if (role != best && votes[role] == votes[best])
{
unknown = true;
}
if (votes[role] > 0 && votes[role] < votes[best])
{
printf(
"[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies%s\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
op_data->stripes[role].osd_num, votes[best],
scrub_find_best ? ", marking it as corrupted" : ""
);
if (scrub_find_best)
{
op_data->stripes[role].read_error = true;
}
}
}
if (!scrub_find_best)
{
unknown = true;
}
if (unknown)
{
// It's unknown which replica is good. There are multiple versions with no majority
// Mark all good replicas as ambiguous
best = -1;
inconsistent = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
}
}
}
else
{
assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
auto good_subset = ec_find_good(
op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
);
if (!good_subset.size())
{
inconsistent = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
}
else
{
int total = 0;
for (int role = 0; role < op_data->pg_size; role++)
{
if (!op_data->stripes[role].missing)
{
total++;
op_data->stripes[role].read_error = true;
}
}
for (int role: good_subset)
{
op_data->stripes[role].read_error = false;
}
for (int role = 0; role < op_data->pg_size; role++)
{
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
{
printf(
"[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match other chunks%s\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
role, op_data->stripes[role].osd_num,
scrub_find_best ? ", marking it as corrupted" : ""
);
}
}
if (!scrub_find_best && good_subset.size() < total)
{
inconsistent = true;
printf(
"[PG %u/%u] Object %lx:%lx v%lu is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
INODE_POOL(op_data->oid.inode), op_data->pg_num,
op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
);
for (int role = 0; role < op_data->pg_size; role++)
{
if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
{
// Undo error locator marking chunk as bad
op_data->stripes[role].read_error = false;
}
}
}
}
}
for (int role = 0; role < op_data->pg_size; role++)
{
if (op_data->stripes[role].osd_num != 0 &&
(op_data->stripes[role].read_error || op_data->stripes[role].not_exists) ||
inconsistent)
{
// Got at least 1 read error or mismatch, mark the object as corrupted
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
break;
}
}
finish_op(cur_op, 0);
}


@@ -125,11 +125,18 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
secondary_op_callback(cur_op);
return;
}
cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size;
cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count;
cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1;
cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode;
cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe;
cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode;
if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX)
{
cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe
? cur_op->req.sec_list.max_stripe : UINT64_MAX;
}
cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit;
#ifdef OSD_STUB
cur_op->bs_op->retval = 0;
cur_op->bs_op->buf = NULL;


@@ -3,9 +3,9 @@
#include "pg_states.h"
const int pg_state_bit_count = 14;
const int pg_state_bit_count = 17;
const int pg_state_bits[14] = {
const int pg_state_bits[17] = {
PG_STARTING,
PG_PEERING,
PG_INCOMPLETE,
@@ -14,15 +14,18 @@ const int pg_state_bits[14] = {
PG_STOPPING,
PG_OFFLINE,
PG_DEGRADED,
PG_HAS_INCONSISTENT,
PG_HAS_CORRUPTED,
PG_HAS_INCOMPLETE,
PG_HAS_DEGRADED,
PG_HAS_MISPLACED,
PG_HAS_UNCLEAN,
PG_HAS_INVALID,
PG_LEFT_ON_DEAD,
PG_SCRUBBING,
};
const char *pg_state_names[14] = {
const char *pg_state_names[17] = {
"starting",
"peering",
"incomplete",
@@ -31,10 +34,37 @@ const char *pg_state_names[14] = {
"stopping",
"offline",
"degraded",
"has_inconsistent",
"has_corrupted",
"has_incomplete",
"has_degraded",
"has_misplaced",
"has_unclean",
"has_invalid",
"left_on_dead",
"scrubbing",
};
const int object_state_bit_count = 8;
const int object_state_bits[8] = {
OBJ_DEGRADED,
OBJ_INCOMPLETE,
OBJ_MISPLACED,
OBJ_CORRUPTED,
OBJ_INCONSISTENT,
OBJ_NEEDS_STABLE,
OBJ_NEEDS_ROLLBACK,
0,
};
const char *object_state_names[8] = {
"degraded",
"incomplete",
"misplaced",
"corrupted",
"inconsistent",
"needs_stable",
"needs_rollback",
"clean",
};


@@ -22,7 +22,10 @@
#define PG_HAS_MISPLACED (1<<10)
#define PG_HAS_UNCLEAN (1<<11)
#define PG_HAS_INVALID (1<<12)
#define PG_LEFT_ON_DEAD (1<<13)
#define PG_HAS_CORRUPTED (1<<13)
#define PG_HAS_INCONSISTENT (1<<14)
#define PG_LEFT_ON_DEAD (1<<15)
#define PG_SCRUBBING (1<<16)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
@@ -32,9 +35,18 @@
#define OBJ_DEGRADED 0x02
#define OBJ_INCOMPLETE 0x04
#define OBJ_MISPLACED 0x08
// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
#define OBJ_CORRUPTED 0x10
// OBJ_INCONSISTENT is when its replicas don't match, but it's unclear which one is correct
// OBJ_INCONSISTENT may be set with CORRUPTED, but never with other states
#define OBJ_INCONSISTENT 0x20
#define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000
extern const int pg_state_bits[];
extern const char *pg_state_names[];
extern const int pg_state_bit_count;
extern const int object_state_bits[];
extern const char *object_state_names[];
extern const int object_state_bit_count;


@@ -218,6 +218,7 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
}
}
// FIXME: Fix thread safety of the driver - now it segfaults when iothread is enabled in QEMU
static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
{
aio_set_fd_handler(ctx, fd,


@@ -3,6 +3,7 @@
#include <assert.h>
#include <string.h>
#include <unistd.h>
#include "str_util.h"
std::string base64_encode(const std::string &in)
@@ -249,3 +250,53 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
exit(0);
}
uint64_t parse_time(std::string time_str, bool *ok)
{
if (!time_str.length())
{
if (ok)
*ok = false;
return 0;
}
uint64_t mul = 1;
char type_char = tolower(time_str[time_str.length()-1]);
if (type_char == 's' || type_char == 'm' || type_char == 'h' || type_char == 'd' || type_char == 'y')
{
if (type_char == 's')
mul = 1;
else if (time_str[time_str.length()-1] == 'M') // uppercase 'M' = months (tolower() above folds it into the guard)
mul = 30*86400;
else if (type_char == 'm') // lowercase 'm' = minutes
mul = 60;
else if (type_char == 'h')
mul = 3600;
else if (type_char == 'd')
mul = 86400;
else /*if (type_char == 'y')*/
mul = 86400*365;
time_str = time_str.substr(0, time_str.length()-1);
}
uint64_t ts = stoull_full(time_str, 0) * mul;
if (ok)
*ok = !(ts == 0 && time_str != "0" && (time_str != "" || mul != 1));
return ts;
}
std::string read_all_fd(int fd)
{
int res_size = 0, res_alloc = 0;
std::string res;
while (1)
{
if (res_size >= res_alloc)
res.resize((res_alloc = (res_alloc ? res_alloc*2 : 1024)));
int r = read(fd, (char*)res.data()+res_size, res_alloc-res_size);
if (r > 0)
res_size += r;
else if (!r || errno != EAGAIN && errno != EINTR)
break;
}
res.resize(res_size);
return res;
}


@@ -15,3 +15,5 @@ std::string str_replace(const std::string & in, const std::string & needle, cons
uint64_t stoull_full(const std::string & str, int base = 0);
std::string format_size(uint64_t size, bool nobytes = false);
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
uint64_t parse_time(std::string time_str, bool *ok = NULL);
std::string read_all_fd(int fd);


@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.8
Version: 0.9.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

Some files were not shown because too many files have changed in this diff.