WIP Use random_hier_combinations

Replace flatten_tree with extract_tree_levels
Implement multi-level tree extractor for hierarchical failure domains
2023-05-18 17:44:00 +03:00 · 2023-05-18 17:44:00 +03:00 · 2023-05-18 17:44:00 +03:00 · 2023-05-18 17:44:00 +03:00
100 changed files with 639 additions and 3862 deletions
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@@ -550,111 +550,3 @@ jobs:
          echo ""
        done

-  test_scrub:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_zero_osd_2:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: ZERO_OSD=2 /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_xor:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=xor /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_pg_size_3:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: PG_SIZE=3 /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_scrub_ec:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=ec /root/vitastor/tests/test_scrub.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "0.9.1")
+set(VERSION "0.8.9")

 add_subdirectory(src)
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v0.9.1
+VERSION ?= v0.8.9

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v0.9.1
+          image: vitalif/vitastor-csi:v0.8.9
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -116,7 +116,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v0.9.1
+          image: vitalif/vitastor-csi:v0.8.9
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.9.1"
+    vitastorCSIDriverVersion = "0.8.9"
 )

 // Config struct fills the parameters of request or user input
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,10 @@
-vitastor (0.9.1-1) unstable; urgency=medium
+vitastor (0.8.9-1) unstable; urgency=medium

  * Bugfixes

 -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300

-vitastor (0.9.1-1) unstable; urgency=medium
+vitastor (0.8.9-1) unstable; urgency=medium

  * Implement NFS proxy
  * Add documentation
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -34,8 +34,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.9.1; \
-    cd vitastor-0.9.1; \
+    cp -r /root/vitastor vitastor-0.8.9; \
+    cd vitastor-0.8.9; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.1.orig.tar.xz vitastor-0.9.1; \
-    cd vitastor-0.9.1; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.9.orig.tar.xz vitastor-0.8.9; \
+    cd vitastor-0.8.9; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config/common.en.md
+++ b/docs/config/common.en.md
@@ -25,16 +25,11 @@ running if required parameters are specified.
 ## etcd_address

 - Type: string or array of strings
- Can be changed online: yes

 etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
 specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
 Note that https is not supported for etcd connections yet.

-etcd connection endpoints can be changed online by updating global
-configuration in etcd itself - this allows to switch the cluster to new
-etcd addresses without downtime.
-
 ## etcd_prefix

 - Type: string
@@ -47,6 +42,5 @@ example, use a single etcd cluster for multiple Vitastor clusters.

 - Type: integer
 - Default: 0
- Can be changed online: yes

 Log level. Raise if you want more verbose output.
--- a/docs/config/common.ru.md
+++ b/docs/config/common.ru.md
@@ -24,14 +24,10 @@
 ## etcd_address

 - Тип: строка или массив строк
- Можно менять на лету: да

 Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
 или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.

-Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
-самом etcd - это позволяет переключить кластер на новые etcd без остановки.
-
 ## etcd_prefix

 - Тип: строка
@@ -45,6 +41,5 @@

 - Тип: целое число
 - Значение по умолчанию: 0
- Можно менять на лету: да

 Уровень логирования. Повысьте, если хотите более подробный вывод.
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@@ -153,7 +153,6 @@ operations.
 - Type: seconds
 - Default: 5
 - Minimum: 1
- Can be changed online: yes

 Interval before attempting to reconnect to an unavailable OSD.

@@ -162,7 +161,6 @@ Interval before attempting to reconnect to an unavailable OSD.
 - Type: seconds
 - Default: 5
 - Minimum: 1
- Can be changed online: yes

 Timeout for OSD connection attempts.

@@ -171,7 +169,6 @@ Timeout for OSD connection attempts.
 - Type: seconds
 - Default: 5
 - Minimum: 1
- Can be changed online: yes

 OSD connection inactivity time after which clients and other OSDs send
 keepalive requests to check state of the connection.
@@ -181,7 +178,6 @@ keepalive requests to check state of the connection.
 - Type: seconds
 - Default: 5
 - Minimum: 1
- Can be changed online: yes

 Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
 within this time, the connection to it is dropped and a reconnection attempt
@@ -192,7 +188,6 @@ is scheduled.
 - Type: milliseconds
 - Default: 500
 - Minimum: 50
- Can be changed online: yes

 OSDs respond to clients with a special error code when they receive I/O
 requests for a PG that's not synchronized and started. This parameter sets
@@ -202,7 +197,6 @@ the time for the clients to wait before re-attempting such I/O requests.

 - Type: integer
 - Default: 5
- Can be changed online: yes

 Maximum number of attempts for etcd requests which can't be retried
 indefinitely.
@@ -211,7 +205,6 @@ indefinitely.

 - Type: milliseconds
 - Default: 1000
- Can be changed online: yes

 Timeout for etcd requests which should complete quickly, like lease refresh.

@@ -219,7 +212,6 @@ Timeout for etcd requests which should complete quickly, like lease refresh.

 - Type: milliseconds
 - Default: 5000
- Can be changed online: yes

 Timeout for etcd requests which are allowed to wait for some time.

@@ -227,7 +219,6 @@ Timeout for etcd requests which are allowed to wait for some time.

 - Type: seconds
 - Default: max(30, etcd_report_interval*2)
- Can be changed online: yes

 Timeout for etcd connection HTTP Keep-Alive. Should be higher than
 etcd_report_interval to guarantee that keepalive actually works.
@@ -236,7 +227,6 @@ etcd_report_interval to guarantee that keepalive actually works.

 - Type: seconds
 - Default: 30
- Can be changed online: yes

 etcd websocket ping interval required to keep the connection alive and
 detect disconnections quickly.
@@ -245,7 +235,6 @@ detect disconnections quickly.

 - Type: integer
 - Default: 33554432
- Can be changed online: yes

 Without immediate_commit=all this parameter sets the limit of "dirty"
 (not committed by fsync) data allowed by the client before forcing an
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@@ -161,7 +161,6 @@ OSD в любом случае согласовывают реальное зн
 - Тип: секунды
 - Значение по умолчанию: 5
 - Минимальное значение: 1
- Можно менять на лету: да

 Время ожидания перед повторной попыткой соединиться с недоступным OSD.

@@ -170,7 +169,6 @@ OSD в любом случае согласовывают реальное зн
 - Тип: секунды
 - Значение по умолчанию: 5
 - Минимальное значение: 1
- Можно менять на лету: да

 Максимальное время ожидания попытки соединения с OSD.

@@ -179,7 +177,6 @@ OSD в любом случае согласовывают реальное зн
 - Тип: секунды
 - Значение по умолчанию: 5
 - Минимальное значение: 1
- Можно менять на лету: да

 Время неактивности соединения с OSD, после которого клиенты или другие OSD
 посылают запрос проверки состояния соединения.
@@ -189,7 +186,6 @@ OSD в любом случае согласовывают реальное зн
 - Тип: секунды
 - Значение по умолчанию: 5
 - Минимальное значение: 1
- Можно менять на лету: да

 Максимальное время ожидания ответа на запрос проверки состояния соединения.
 Если OSD не отвечает за это время, соединение отключается и производится
@@ -200,7 +196,6 @@ OSD в любом случае согласовывают реальное зн
 - Тип: миллисекунды
 - Значение по умолчанию: 500
 - Минимальное значение: 50
- Можно менять на лету: да

 Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
 поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
@@ -212,7 +207,6 @@ OSD в любом случае согласовывают реальное зн

 - Тип: целое число
 - Значение по умолчанию: 5
- Можно менять на лету: да

 Максимальное число попыток выполнения запросов к etcd для тех запросов,
 которые нельзя повторять бесконечно.
@@ -221,7 +215,6 @@ OSD в любом случае согласовывают реальное зн

 - Тип: миллисекунды
 - Значение по умолчанию: 1000
- Можно менять на лету: да

 Максимальное время выполнения запросов к etcd, которые должны завершаться
 быстро, таких, как обновление резервации (lease).
@@ -230,7 +223,6 @@ OSD в любом случае согласовывают реальное зн

 - Тип: миллисекунды
 - Значение по умолчанию: 5000
- Можно менять на лету: да

 Максимальное время выполнения запросов к etcd, для которых не обязательно
 гарантировать быстрое выполнение.
@@ -239,7 +231,6 @@ OSD в любом случае согласовывают реальное зн

 - Тип: секунды
 - Значение по умолчанию: max(30, etcd_report_interval*2)
- Можно менять на лету: да

 Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
 etcd_report_interval, чтобы keepalive гарантированно работал.
@@ -248,7 +239,6 @@ etcd_report_interval, чтобы keepalive гарантированно рабо

 - Тип: секунды
 - Значение по умолчанию: 30
- Можно менять на лету: да

 Интервал проверки живости вебсокет-подключений к etcd.

@@ -256,7 +246,6 @@ etcd_report_interval, чтобы keepalive гарантированно рабо

 - Тип: целое число
 - Значение по умолчанию: 33554432
- Можно менять на лету: да

 При работе без immediate_commit=all - это лимит объёма "грязных" (не
 зафиксированных fsync-ом) данных, при достижении которого клиент будет
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@@ -7,8 +7,7 @@
 # Runtime OSD Parameters

 These parameters only apply to OSDs, are not fixed at the moment of OSD drive
-initialization and can be changed - either with an OSD restart or, for some of
-them, even without restarting by updating configuration in etcd.
+initialization and can be changed with an OSD restart.

 - [etcd_report_interval](#etcd_report_interval)
 - [run_primary](#run_primary)
@@ -39,14 +38,6 @@ them, even without restarting by updating configuration in etcd.
 - [throttle_target_parallelism](#throttle_target_parallelism)
 - [throttle_threshold_us](#throttle_threshold_us)
 - [osd_memlock](#osd_memlock)
- [auto_scrub](#auto_scrub)
- [no_scrub](#no_scrub)
- [scrub_interval](#scrub_interval)
- [scrub_queue_depth](#scrub_queue_depth)
- [scrub_sleep](#scrub_sleep)
- [scrub_list_limit](#scrub_list_limit)
- [scrub_find_best](#scrub_find_best)
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)

 ## etcd_report_interval

@@ -100,7 +91,6 @@ OSD by hand.

 - Type: seconds
 - Default: 5
- Can be changed online: yes

 Time interval at which automatic fsyncs/flushes are issued by each OSD when
 the immediate_commit mode is disabled. fsyncs are required because without
@@ -113,7 +103,6 @@ issue fsyncs at all.

 - Type: integer
 - Default: 128
- Can be changed online: yes

 Same as autosync_interval, but sets the maximum number of uncommitted write
 operations before issuing an fsync operation internally.
@@ -122,7 +111,6 @@ operations before issuing an fsync operation internally.

 - Type: integer
 - Default: 4
- Can be changed online: yes

 Maximum recovery operations per one primary OSD at any given moment of time.
 Currently it's the only parameter available to tune the speed of recovery
@@ -132,7 +120,6 @@ and rebalancing, but it's planned to implement more.

 - Type: integer
 - Default: 128
- Can be changed online: yes

 Number of recovery operations before switching to recovery of the next PG.
 The idea is to mix all PGs during recovery for more even space and load
@@ -143,7 +130,6 @@ Degraded PGs are anyway scanned first.

 - Type: integer
 - Default: 16
- Can be changed online: yes

 Maximum number of recovery operations before issuing an additional fsync.

@@ -159,7 +145,6 @@ the underlying device. This may be useful for recovery purposes.

 - Type: boolean
 - Default: false
- Can be changed online: yes

 Disable automatic background recovery of objects. Note that it doesn't
 affect implicit recovery of objects happening during writes - a write is
@@ -169,7 +154,6 @@ always made to a full set of at least pg_minsize OSDs.

 - Type: boolean
 - Default: false
- Can be changed online: yes

 Disable background movement of data between different OSDs. Disabling it
 means that PGs in the `has_misplaced` state will be left in it indefinitely.
@@ -178,7 +162,6 @@ means that PGs in the `has_misplaced` state will be left in it indefinitely.

 - Type: seconds
 - Default: 3
- Can be changed online: yes

 Time interval at which OSDs print simple human-readable operation
 statistics on stdout.
@@ -187,7 +170,6 @@ statistics on stdout.

 - Type: seconds
 - Default: 10
- Can be changed online: yes

 Time interval at which OSDs dump slow or stuck operations on stdout, if
 there are any. Also it's the time after which an operation is considered
@@ -197,7 +179,6 @@ they're any. Also it's the time after which an operation is considered

 - Type: seconds
 - Default: 60
- Can be changed online: yes

 Number of seconds after which a deleted inode is removed from OSD statistics.

@@ -205,7 +186,6 @@ Number of seconds after which a deleted inode is removed from OSD statistics.

 - Type: integer
 - Default: 128
- Can be changed online: yes

 Parallel client write operation limit per one OSD. Operations that exceed
 this limit are pushed to a temporary queue instead of being executed
@@ -215,7 +195,6 @@ immediately.

 - Type: integer
 - Default: 1
- Can be changed online: yes

 Flusher is a micro-thread that moves data from the journal to the data
 area of the device. Their number is auto-tuned between minimum and maximum.
@@ -225,7 +204,6 @@ Minimum number is set by this parameter.

 - Type: integer
 - Default: 256
- Can be changed online: yes

 Maximum number of journal flushers (see above min_flusher_count).

@@ -282,7 +260,6 @@ Most (99%) other SSDs don't need this option.

 - Type: boolean
 - Default: false
- Can be changed online: yes

 Enable soft throttling of small journaled writes. Useful for hybrid OSDs
 with fast journal/metadata devices and slow data devices. The idea is that
@@ -300,7 +277,6 @@ fills up.

 - Type: integer
 - Default: 100
- Can be changed online: yes

 Target maximum number of throttled operations per second under the condition
 of full journal. Set it to approximate random write iops of your data devices
@@ -310,7 +286,6 @@ of full journal. Set it to approximate random write iops of your data devices

 - Type: integer
 - Default: 100
- Can be changed online: yes

 Target maximum bandwidth in MB/s of throttled operations per second under
 the condition of full journal. Set it to approximate linear write
@@ -320,7 +295,6 @@ performance of your data devices (HDDs).

 - Type: integer
 - Default: 1
- Can be changed online: yes

 Target maximum parallelism of throttled operations under the condition of
 full journal. Set it to approximate internal parallelism of your data
@@ -330,7 +304,6 @@ devices (1 for HDDs, 4-8 for SSDs).

 - Type: microseconds
 - Default: 50
- Can be changed online: yes

 Minimal computed delay to be applied to throttled operations. Usually
 doesn't need to be changed.
@@ -340,103 +313,4 @@ doesn't need to be changed.
 - Type: boolean
 - Default: false

-Lock all OSD memory to prevent it from being unloaded into swap with
-mlockall(). Requires sufficient ulimit -l (max locked memory).
-
-## auto_scrub
-
- Type: boolean
- Default: false
- Can be changed online: yes
-
-Data scrubbing is the process of background verification of copies to find
-and repair corrupted blocks. It's not run automatically by default since
-it's a new feature. Set this parameter to true to enable automatic scrubs.
-
-This parameter makes OSDs automatically schedule data scrubbing of clean PGs
-every `scrub_interval` (see below). You can also start/schedule scrubbing
-manually by setting `next_scrub` JSON key to the desired UNIX time of the
-next scrub in `/pg/history/...` values in etcd.
-
-## no_scrub
-
- Type: boolean
- Default: false
- Can be changed online: yes
-
-Temporarily disable scrubbing and stop running scrubs.
-
-## scrub_interval
-
- Type: string
- Default: 30d
- Can be changed online: yes
-
-Default automatic scrubbing interval for all pools. Numbers without suffix
-are treated as seconds, possible unit suffixes include 's' (seconds),
-'m' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
-
-## scrub_queue_depth
-
- Type: integer
- Default: 1
- Can be changed online: yes
-
-Number of parallel scrubbing operations per one OSD.
-
-## scrub_sleep
-
- Type: milliseconds
- Default: 0
- Can be changed online: yes
-
-Additional interval between two consecutive scrubbing operations on one OSD.
-Can be used to slow down scrubbing if it affects user load too much.
-
-## scrub_list_limit
-
- Type: integer
- Default: 1000
- Can be changed online: yes
-
-Number of objects to list in one listing operation during scrub.
-
-## scrub_find_best
-
- Type: boolean
- Default: true
- Can be changed online: yes
-
-Find and automatically restore best versions of objects with unmatched
-copies. In replicated setups, the best version is the version with most
-matching replicas. In EC setups, the best version is the subset of data
-and parity chunks without mismatches.
-
-The hypothetical situation where you might want to disable it is when
-you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
-corrupt an object in the same way (for example, zero it out) and only
-1 HDD will remain good. In this case disabling scrub_find_best may help
-you to recover the data! See also scrub_ec_max_bruteforce below.
-
-## scrub_ec_max_bruteforce
-
- Type: integer
- Default: 100
- Can be changed online: yes
-
-Vitastor can locate corrupted chunks in EC setups with more than 1 parity
-chunk by brute-forcing all possible error locations. This configuration
-value limits the maximum number of checked combinations. You can try to
-increase it if you have EC N+K setup with N and K large enough for
-combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
-than the default 100.
-
-If there are too many possible combinations or if multiple combinations give
-correct results then objects are marked inconsistent and aren't recovered
-automatically.
-
-In replicated setups bruteforcing isn't needed, Vitastor just assumes that
-the variant with most available equal copies is correct. For example, if
-you have 3 replicas and 1 of them differs, this one is considered to be
-corrupted. But if there is no "best" version with more copies than all
-others have then the object is also marked as inconsistent.
+Lock all OSD memory to prevent it from being unloaded into swap with mlockall(). Requires sufficient ulimit -l (max locked memory).
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@@ -8,8 +8,7 @@

 Данные параметры используются только OSD, но, в отличие от дисковых параметров,
 не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
-момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
-изменения конфигурации в etcd.
+момент с перезапуском OSD.

 - [etcd_report_interval](#etcd_report_interval)
 - [run_primary](#run_primary)
@@ -40,14 +39,6 @@
 - [throttle_target_parallelism](#throttle_target_parallelism)
 - [throttle_threshold_us](#throttle_threshold_us)
 - [osd_memlock](#osd_memlock)
- [auto_scrub](#auto_scrub)
- [no_scrub](#no_scrub)
- [scrub_interval](#scrub_interval)
- [scrub_queue_depth](#scrub_queue_depth)
- [scrub_sleep](#scrub_sleep)
- [scrub_list_limit](#scrub_list_limit)
- [scrub_find_best](#scrub_find_best)
- [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)

 ## etcd_report_interval

@@ -102,7 +93,6 @@ RUNNING), подходящий под заданную маску. Также н

 - Тип: секунды
 - Значение по умолчанию: 5
- Можно менять на лету: да

 Временной интервал отправки автоматических fsync-ов (операций очистки кэша)
 каждым OSD для случая, когда режим immediate_commit отключён. fsync-и нужны
@@ -115,7 +105,6 @@ OSD, чтобы успевать очищать журнал - без них OSD

 - Тип: целое число
 - Значение по умолчанию: 128
- Можно менять на лету: да

 Аналогично autosync_interval, но задаёт не временной интервал, а
 максимальное количество незафиксированных операций записи перед
@@ -125,7 +114,6 @@ OSD, чтобы успевать очищать журнал - без них OSD

 - Тип: целое число
 - Значение по умолчанию: 4
- Можно менять на лету: да

 Максимальное число операций восстановления на одном первичном OSD в любой
 момент времени. На данный момент единственный параметр, который можно менять
@@ -136,7 +124,6 @@ OSD, чтобы успевать очищать журнал - без них OSD

 - Тип: целое число
 - Значение по умолчанию: 128
- Можно менять на лету: да

 Число операций восстановления перед переключением на восстановление другой PG.
 Идея заключается в том, чтобы восстанавливать все PG одновременно для более
@@ -148,7 +135,6 @@ OSD, чтобы успевать очищать журнал - без них OSD

 - Тип: целое число
 - Значение по умолчанию: 16
- Можно менять на лету: да

 Максимальное число операций восстановления перед дополнительным fsync.

@@ -164,7 +150,6 @@ OSD, чтобы успевать очищать журнал - без них OSD

 - Тип: булево (да/нет)
 - Значение по умолчанию: false
- Можно менять на лету: да

 Отключить автоматическое фоновое восстановление объектов. Обратите внимание,
 что эта опция не отключает восстановление объектов, происходящее при
@@ -175,7 +160,6 @@ OSD.

 - Тип: булево (да/нет)
 - Значение по умолчанию: false
- Можно менять на лету: да

 Отключить фоновое перемещение объектов между разными OSD. Отключение
 означает, что PG, находящиеся в состоянии `has_misplaced`, будут оставлены
@@ -185,7 +169,6 @@ OSD.

 - Тип: секунды
 - Значение по умолчанию: 3
- Можно менять на лету: да

 Временной интервал, с которым OSD печатают простую человекочитаемую
 статистику выполнения операций в стандартный вывод.
@@ -194,7 +177,6 @@ OSD.

 - Тип: секунды
 - Значение по умолчанию: 10
- Можно менять на лету: да

 Временной интервал, с которым OSD выводят в стандартный вывод список
 медленных или зависших операций, если таковые имеются. Также время, при
@@ -204,7 +186,6 @@ OSD.

 - Тип: секунды
 - Значение по умолчанию: 60
- Можно менять на лету: да

 Число секунд, через которое удалённый инод удаляется и из статистики OSD.

@@ -212,7 +193,6 @@ OSD.

 - Тип: целое число
 - Значение по умолчанию: 128
- Можно менять на лету: да

 Максимальное число одновременных клиентских операций записи на один OSD.
 Операции, превышающие этот лимит, не исполняются сразу, а сохраняются во
@@ -222,7 +202,6 @@ OSD.

 - Тип: целое число
 - Значение по умолчанию: 1
- Можно менять на лету: да

 Flusher - это микро-поток (корутина), которая копирует данные из журнала в
 основную область устройства данных. Их число настраивается динамически между
@@ -232,7 +211,6 @@ Flusher - это микро-поток (корутина), которая коп

 - Тип: целое число
 - Значение по умолчанию: 256
- Можно менять на лету: да

 Максимальное число микро-потоков очистки журнала (см. выше min_flusher_count).

@@ -292,7 +270,6 @@ Flusher - это микро-поток (корутина), которая коп

 - Тип: булево (да/нет)
 - Значение по умолчанию: false
- Можно менять на лету: да

 Разрешить мягкое ограничение скорости журналируемой записи. Полезно для
 гибридных OSD с быстрыми устройствами метаданных и медленными устройствами
@@ -311,7 +288,6 @@ Flusher - это микро-поток (корутина), которая коп

 - Тип: целое число
 - Значение по умолчанию: 100
- Можно менять на лету: да

 Расчётное максимальное число ограничиваемых операций в секунду при условии
 отсутствия свободного места в журнале. Устанавливайте приблизительно равным
@@ -322,7 +298,6 @@ Flusher - это микро-поток (корутина), которая коп

 - Тип: целое число
 - Значение по умолчанию: 100
- Можно менять на лету: да

 Расчётный максимальный размер в МБ/с ограничиваемых операций в секунду при
 условии отсутствия свободного места в журнале. Устанавливайте приблизительно
@@ -333,7 +308,6 @@ Flusher - это микро-поток (корутина), которая коп

 - Тип: целое число
 - Значение по умолчанию: 1
- Можно менять на лету: да

 Расчётный максимальный параллелизм ограничиваемых операций в секунду при
 условии отсутствия свободного места в журнале. Устанавливайте приблизительно
@@ -344,7 +318,6 @@ Flusher - это микро-поток (корутина), которая коп

 - Тип: микросекунды
 - Значение по умолчанию: 50
- Можно менять на лету: да

 Минимальная применимая к ограничиваемым операциям задержка. Обычно не
 требует изменений.
@@ -354,113 +327,4 @@ Flusher - это микро-поток (корутина), которая коп
 - Тип: булево (да/нет)
 - Значение по умолчанию: false

-Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
-в пространство подкачки. Требует достаточного значения ulimit -l (лимита
-заблокированной памяти).
-
-## auto_scrub
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
-
-Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
-находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
-запускаются автоматически, так как являются новой функцией. Чтобы включить
-автоматическое планирование скрабов, установите данный параметр в true.
-
-Включённый параметр заставляет OSD автоматически планировать фоновую
-проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
-запустить или запланировать проверку вручную, установив значение ключа JSON
-`next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
-желаемой проверки.
-
-## no_scrub
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
-
-Временно отключить и остановить запущенные скрабы.
-
-## scrub_interval
-
- Тип: строка
- Значение по умолчанию: 30d
- Можно менять на лету: да
-
-Интервал автоматической фоновой проверки по умолчанию для всех пулов.
-Значения без указанной единицы измерения считаются в секундах, допустимые
-символы единиц измерения в конце: 's' (секунды),
-'m' (минуты), 'h' (часы), 'd' (дни), 'M' (месяца) или 'y' (годы).
-
-## scrub_queue_depth
-
- Тип: целое число
- Значение по умолчанию: 1
- Можно менять на лету: да
-
-Число параллельных операций фоновой проверки на один OSD.
-
-## scrub_sleep
-
- Тип: миллисекунды
- Значение по умолчанию: 0
- Можно менять на лету: да
-
-Дополнительный интервал ожидания после фоновой проверки каждого объекта на
-одном OSD. Может использоваться для замедления скраба, если он слишком
-сильно влияет на пользовательскую нагрузку.
-
-## scrub_list_limit
-
- Тип: целое число
- Значение по умолчанию: 1000
- Можно менять на лету: да
-
-Размер загружаемых за одну операцию списков объектов в процессе фоновой
-проверки.
-
-## scrub_find_best
-
- Тип: булево (да/нет)
- Значение по умолчанию: true
- Можно менять на лету: да
-
-Находить и автоматически восстанавливать "лучшие версии" объектов с
-несовпадающими копиями/частями. При использовании репликации "лучшая"
-версия - версия, доступная в большем числе экземпляров, чем другие. При
-использовании кодов коррекции ошибок "лучшая" версия - это подмножество
-частей данных и чётности, полностью соответствующих друг другу.
-
-Гипотетическая ситуация, в которой вы можете захотеть отключить этот
-поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
-незаметно и одинаково повредить данные одного и того же объекта, например,
-занулив его, и только 1 диск останется неповреждённым. В этой ситуации
-отключение этого параметра поможет вам восстановить данные! Смотрите также
-описание следующего параметра - scrub_ec_max_bruteforce.
-
-## scrub_ec_max_bruteforce
-
- Тип: целое число
- Значение по умолчанию: 100
- Можно менять на лету: да
-
-Vitastor старается определить повреждённые части объектов при использовании
-EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
-всех возможных комбинаций ошибочных частей. Данное значение конфигурации
-ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
-его, если используете схему кодирования EC N+K с N и K, достаточно большими
-для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
-было больше, чем стандартное значение 100.
-
-Если возможных комбинаций слишком много или если корректная комбинаций не
-определяется однозначно, объекты помечаются неконсистентными (inconsistent)
-и не восстанавливаются автоматически.
-
-При использовании репликации перебор не нужен, Vitastor просто предполагает,
-что вариант объекта с наибольшим количеством одинаковых копий корректен.
-Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
-считается некорректной. Однако, если "лучшую" версию с числом доступных
-копий большим, чем у всех других версий, найти невозможно, то объект тоже
-маркируется неконсистентным.
+Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку в пространство подкачки. Требует достаточного значения ulimit -l (лимита заблокированной памяти).
--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@@ -40,7 +40,6 @@ Parameters:
 - [root_node](#root_node)
 - [osd_tags](#osd_tags)
 - [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)

 Examples:

@@ -273,13 +272,6 @@ Specifies OSD tags to prefer putting primary OSDs in this pool to.
 Note that for EC/XOR pools Vitastor always prefers to put primary OSD on one
 of the OSDs containing a data chunk for a PG.

-## scrub_interval
-
- Type: time interval (number + unit s/m/h/d/M/y)
-
-Automatic scrubbing interval for this pool. Overrides
-[global scrub_interval setting](osd.en.md#scrub_interval).
-
 # Examples

 ## Replicated pool
--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@@ -39,7 +39,6 @@
 - [root_node](#root_node)
 - [osd_tags](#osd_tags)
 - [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)

 Примеры:

@@ -277,13 +276,6 @@ PG в Vitastor эферемерны, то есть вы можете менят
 для PG этого пула. Имейте в виду, что для EC-пулов Vitastor также всегда
 предпочитает помещать первичный OSD на один из OSD с данными, а не с чётностью.

-## scrub_interval
-
- Тип: временной интервал (число + единица измерения s/m/h/d/M/y)
-
-Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
-Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
-
 # Примеры

 ## Реплицированный пул
--- a/docs/config/src/common.yml
+++ b/docs/config/src/common.yml
@@ -11,21 +11,13 @@
 - name: etcd_address
  type: string or array of strings
  type_ru: строка или массив строк
-  online: true
  info: |
    etcd connection endpoint(s). Multiple endpoints may be delimited by "," or
    specified in a JSON array `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
    Note that https is not supported for etcd connections yet.
-
-    etcd connection endpoints can be changed online by updating global
-    configuration in etcd itself - this allows to switch the cluster to new
-    etcd addresses without downtime.
  info_ru: |
    Адрес(а) подключения к etcd. Несколько адресов могут разделяться запятой
    или указываться в виде JSON-массива `["10.0.115.10:2379/v3","10.0.115.11:2379/v3"]`.
-
-    Адреса подключения к etcd можно поменять на лету, обновив конфигурацию в
-    самом etcd - это позволяет переключить кластер на новые etcd без остановки.
 - name: etcd_prefix
  type: string
  default: "/vitastor"
@@ -39,6 +31,5 @@
 - name: log_level
  type: int
  default: 0
-  online: true
  info: Log level. Raise if you want more verbose output.
  info_ru: Уровень логгирования. Повысьте, если хотите более подробный вывод.
--- a/docs/config/src/make.js
+++ b/docs/config/src/make.js
@@ -14,7 +14,6 @@ const L = {
        toc_config: '[Configuration](../config.en.md)',
        toc_usage: 'Usage',
        toc_performance: 'Performance',
-        online: 'Can be changed online: yes',
    },
    ru: {
        Documentation: 'Документация',
@@ -29,7 +28,6 @@ const L = {
        toc_config: '[Конфигурация](../config.ru.md)',
        toc_usage: 'Использование',
        toc_performance: 'Производительность',
-        online: 'Можно менять на лету: да',
    },
 };
 const types = {
@@ -72,8 +70,6 @@ for (const file of params_files)
                out += `- ${L[lang]['Default'] || 'Default'}: ${c.default}\n`;
            if (c.min !== undefined)
                out += `- ${L[lang]['Minimum'] || 'Minimum'}: ${c.min}\n`;
-            if (c.online)
-                out += `- ${L[lang]['online'] || 'Can be changed online: yes'}\n`;
            out += `\n`+(c["info_"+lang] || c["info"]).replace(/\s+$/, '');
        }
        const head = fs.readFileSync(__dirname+'/'+file+'.'+lang+'.md', { encoding: 'utf-8' });
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@@ -164,21 +164,18 @@
  type: sec
  min: 1
  default: 5
-  online: true
  info: Interval before attempting to reconnect to an unavailable OSD.
  info_ru: Время ожидания перед повторной попыткой соединиться с недоступным OSD.
 - name: peer_connect_timeout
  type: sec
  min: 1
  default: 5
-  online: true
  info: Timeout for OSD connection attempts.
  info_ru: Максимальное время ожидания попытки соединения с OSD.
 - name: osd_idle_timeout
  type: sec
  min: 1
  default: 5
-  online: true
  info: |
    OSD connection inactivity time after which clients and other OSDs send
    keepalive requests to check state of the connection.
@@ -189,7 +186,6 @@
  type: sec
  min: 1
  default: 5
-  online: true
  info: |
    Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
    within this time, the connection to it is dropped and a reconnection attempt
@@ -202,7 +198,6 @@
  type: ms
  min: 50
  default: 500
-  online: true
  info: |
    OSDs respond to clients with a special error code when they receive I/O
    requests for a PG that's not synchronized and started. This parameter sets
@@ -216,7 +211,6 @@
 - name: max_etcd_attempts
  type: int
  default: 5
-  online: true
  info: |
    Maximum number of attempts for etcd requests which can't be retried
    indefinitely.
@@ -226,7 +220,6 @@
 - name: etcd_quick_timeout
  type: ms
  default: 1000
-  online: true
  info: |
    Timeout for etcd requests which should complete quickly, like lease refresh.
  info_ru: |
@@ -235,7 +228,6 @@
 - name: etcd_slow_timeout
  type: ms
  default: 5000
-  online: true
  info: Timeout for etcd requests which are allowed to wait for some time.
  info_ru: |
    Максимальное время выполнения запросов к etcd, для которых не обязательно
@@ -243,7 +235,6 @@
 - name: etcd_keepalive_timeout
  type: sec
  default: max(30, etcd_report_interval*2)
-  online: true
  info: |
    Timeout for etcd connection HTTP Keep-Alive. Should be higher than
    etcd_report_interval to guarantee that keepalive actually works.
@@ -253,7 +244,6 @@
 - name: etcd_ws_keepalive_timeout
  type: sec
  default: 30
-  online: true
  info: |
    etcd websocket ping interval required to keep the connection alive and
    detect disconnections quickly.
@@ -262,7 +252,6 @@
 - name: client_dirty_limit
  type: int
  default: 33554432
-  online: true
  info: |
    Without immediate_commit=all this parameter sets the limit of "dirty"
    (not committed by fsync) data allowed by the client before forcing an
--- a/docs/config/src/osd.en.md
+++ b/docs/config/src/osd.en.md
@@ -1,5 +1,4 @@
 # Runtime OSD Parameters

 These parameters only apply to OSDs, are not fixed at the moment of OSD drive
-initialization and can be changed - either with an OSD restart or, for some of
-them, even without restarting by updating configuration in etcd.
+initialization and can be changed with an OSD restart.
--- a/docs/config/src/osd.ru.md
+++ b/docs/config/src/osd.ru.md
@@ -2,5 +2,4 @@

 Данные параметры используются только OSD, но, в отличие от дисковых параметров,
 не фиксируются в момент инициализации дисков OSD и могут быть изменены в любой
-момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
-изменения конфигурации в etcd.
+момент с помощью перезапуска OSD.
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@@ -66,7 +66,6 @@
 - name: autosync_interval
  type: sec
  default: 5
-  online: true
  info: |
    Time interval at which automatic fsyncs/flushes are issued by each OSD when
    the immediate_commit mode if disabled. fsyncs are required because without
@@ -84,7 +83,6 @@
 - name: autosync_writes
  type: int
  default: 128
-  online: true
  info: |
    Same as autosync_interval, but sets the maximum number of uncommitted write
    operations before issuing an fsync operation internally.
@@ -95,7 +93,6 @@
 - name: recovery_queue_depth
  type: int
  default: 4
-  online: true
  info: |
    Maximum recovery operations per one primary OSD at any given moment of time.
    Currently it's the only parameter available to tune the speed or recovery
@@ -108,7 +105,6 @@
 - name: recovery_pg_switch
  type: int
  default: 128
-  online: true
  info: |
    Number of recovery operations before switching to recovery of the next PG.
    The idea is to mix all PGs during recovery for more even space and load
@@ -123,7 +119,6 @@
 - name: recovery_sync_batch
  type: int
  default: 16
-  online: true
  info: Maximum number of recovery operations before issuing an additional fsync.
  info_ru: Максимальное число операций восстановления перед дополнительным fsync.
 - name: readonly
@@ -138,7 +133,6 @@
 - name: no_recovery
  type: bool
  default: false
-  online: true
  info: |
    Disable automatic background recovery of objects. Note that it doesn't
    affect implicit recovery of objects happening during writes - a write is
@@ -151,7 +145,6 @@
 - name: no_rebalance
  type: bool
  default: false
-  online: true
  info: |
    Disable background movement of data between different OSDs. Disabling it
    means that PGs in the `has_misplaced` state will be left in it indefinitely.
@@ -162,7 +155,6 @@
 - name: print_stats_interval
  type: sec
  default: 3
-  online: true
  info: |
    Time interval at which OSDs print simple human-readable operation
    statistics on stdout.
@@ -172,7 +164,6 @@
 - name: slow_log_interval
  type: sec
  default: 10
-  online: true
  info: |
    Time interval at which OSDs dump slow or stuck operations on stdout, if
    they're any. Also it's the time after which an operation is considered
@@ -184,7 +175,6 @@
 - name: inode_vanish_time
  type: sec
  default: 60
-  online: true
  info: |
    Number of seconds after which a deleted inode is removed from OSD statistics.
  info_ru: |
@@ -192,7 +182,6 @@
 - name: max_write_iodepth
  type: int
  default: 128
-  online: true
  info: |
    Parallel client write operation limit per one OSD. Operations that exceed
    this limit are pushed to a temporary queue instead of being executed
@@ -204,7 +193,6 @@
 - name: min_flusher_count
  type: int
  default: 1
-  online: true
  info: |
    Flusher is a micro-thread that moves data from the journal to the data
    area of the device. Their number is auto-tuned between minimum and maximum.
@@ -216,7 +204,6 @@
 - name: max_flusher_count
  type: int
  default: 256
-  online: true
  info: |
    Maximum number of journal flushers (see above min_flusher_count).
  info_ru: |
@@ -297,7 +284,6 @@
 - name: throttle_small_writes
  type: bool
  default: false
-  online: true
  info: |
    Enable soft throttling of small journaled writes. Useful for hybrid OSDs
    with fast journal/metadata devices and slow data devices. The idea is that
@@ -326,7 +312,6 @@
 - name: throttle_target_iops
  type: int
  default: 100
-  online: true
  info: |
    Target maximum number of throttled operations per second under the condition
    of full journal. Set it to approximate random write iops of your data devices
@@ -339,7 +324,6 @@
 - name: throttle_target_mbs
  type: int
  default: 100
-  online: true
  info: |
    Target maximum bandwidth in MB/s of throttled operations per second under
    the condition of full journal. Set it to approximate linear write
@@ -352,7 +336,6 @@
 - name: throttle_target_parallelism
  type: int
  default: 1
-  online: true
  info: |
    Target maximum parallelism of throttled operations under the condition of
    full journal. Set it to approximate internal parallelism of your data
@@ -365,7 +348,6 @@
 - name: throttle_threshold_us
  type: us
  default: 50
-  online: true
  info: |
    Minimal computed delay to be applied to throttled operations. Usually
    doesn't need to be changed.
@@ -375,151 +357,10 @@
 - name: osd_memlock
  type: bool
  default: false
-  info: |
+  info: >
    Lock all OSD memory to prevent it from being unloaded into swap with
    mlockall(). Requires sufficient ulimit -l (max locked memory).
-  info_ru: |
+  info_ru: >
    Блокировать всю память OSD с помощью mlockall, чтобы запретить её выгрузку
    в пространство подкачки. Требует достаточного значения ulimit -l (лимита
    заблокированной памяти).
- name: auto_scrub
-  type: bool
-  default: false
-  online: true
-  info: |
-    Data scrubbing is the process of background verification of copies to find
-    and repair corrupted blocks. It's not run automatically by default since
-    it's a new feature. Set this parameter to true to enable automatic scrubs.
-
-    This parameter makes OSDs automatically schedule data scrubbing of clean PGs
-    every `scrub_interval` (see below). You can also start/schedule scrubbing
-    manually by setting `next_scrub` JSON key to the desired UNIX time of the
-    next scrub in `/pg/history/...` values in etcd.
-  info_ru: |
-    Скраб - процесс фоновой проверки копий данных, предназначенный, чтобы
-    находить и исправлять повреждённые блоки. По умолчанию эти проверки ещё не
-    запускаются автоматически, так как являются новой функцией. Чтобы включить
-    автоматическое планирование скрабов, установите данный параметр в true.
-
-    Включённый параметр заставляет OSD автоматически планировать фоновую
-    проверку чистых PG раз в `scrub_interval` (см. ниже). Вы также можете
-    запустить или запланировать проверку вручную, установив значение ключа JSON
-    `next_scrub` внутри ключей etcd `/pg/history/...` в UNIX-время следующей
-    желаемой проверки.
- name: no_scrub
-  type: bool
-  default: false
-  online: true
-  info: |
-    Temporarily disable scrubbing and stop running scrubs.
-  info_ru: |
-    Временно отключить и остановить запущенные скрабы.
- name: scrub_interval
-  type: string
-  default: 30d
-  online: true
-  info: |
-    Default automatic scrubbing interval for all pools. Numbers without suffix
-    are treated as seconds, possible unit suffixes include 's' (seconds),
-    'm' (minutes), 'h' (hours), 'd' (days), 'M' (months) and 'y' (years).
-  info_ru: |
-    Интервал автоматической фоновой проверки по умолчанию для всех пулов.
-    Значения без указанной единицы измерения считаются в секундах, допустимые
-    символы единиц измерения в конце: 's' (секунды),
-    'm' (минуты), 'h' (часы), 'd' (дни), 'M' (месяца) или 'y' (годы).
- name: scrub_queue_depth
-  type: int
-  default: 1
-  online: true
-  info: |
-    Number of parallel scrubbing operations per one OSD.
-  info_ru: |
-    Число параллельных операций фоновой проверки на один OSD.
- name: scrub_sleep
-  type: ms
-  default: 0
-  online: true
-  info: |
-    Additional interval between two consecutive scrubbing operations on one OSD.
-    Can be used to slow down scrubbing if it affects user load too much.
-  info_ru: |
-    Дополнительный интервал ожидания после фоновой проверки каждого объекта на
-    одном OSD. Может использоваться для замедления скраба, если он слишком
-    сильно влияет на пользовательскую нагрузку.
- name: scrub_list_limit
-  type: int
-  default: 1000
-  online: true
-  info: |
-    Number of objects to list in one listing operation during scrub.
-  info_ru: |
-    Размер загружаемых за одну операцию списков объектов в процессе фоновой
-    проверки.
- name: scrub_find_best
-  type: bool
-  default: true
-  online: true
-  info: |
-    Find and automatically restore best versions of objects with unmatched
-    copies. In replicated setups, the best version is the version with most
-    matching replicas. In EC setups, the best version is the subset of data
-    and parity chunks without mismatches.
-
-    The hypothetical situation where you might want to disable it is when
-    you have 3 replicas and you are paranoid that 2 HDDs out of 3 may silently
-    corrupt an object in the same way (for example, zero it out) and only
-    1 HDD will remain good. In this case disabling scrub_find_best may help
-    you to recover the data! See also scrub_ec_max_bruteforce below.
-  info_ru: |
-    Находить и автоматически восстанавливать "лучшие версии" объектов с
-    несовпадающими копиями/частями. При использовании репликации "лучшая"
-    версия - версия, доступная в большем числе экземпляров, чем другие. При
-    использовании кодов коррекции ошибок "лучшая" версия - это подмножество
-    частей данных и чётности, полностью соответствующих друг другу.
-
-    Гипотетическая ситуация, в которой вы можете захотеть отключить этот
-    поиск - это если у вас 3 реплики и вы боитесь, что 2 диска из 3 могут
-    незаметно и одинаково повредить данные одного и того же объекта, например,
-    занулив его, и только 1 диск останется неповреждённым. В этой ситуации
-    отключение этого параметра поможет вам восстановить данные! Смотрите также
-    описание следующего параметра - scrub_ec_max_bruteforce.
- name: scrub_ec_max_bruteforce
-  type: int
-  default: 100
-  online: true
-  info: |
-    Vitastor can locate corrupted chunks in EC setups with more than 1 parity
-    chunk by brute-forcing all possible error locations. This configuration
-    value limits the maximum number of checked combinations. You can try to
-    increase it if you have EC N+K setup with N and K large enough for
-    combination count `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!` to be greater
-    than the default 100.
-
-    If there are too many possible combinations or if multiple combinations give
-    correct results then objects are marked inconsistent and aren't recovered
-    automatically.
-
-    In replicated setups bruteforcing isn't needed, Vitastor just assumes that
-    the variant with most available equal copies is correct. For example, if
-    you have 3 replicas and 1 of them differs, this one is considered to be
-    corrupted. But if there is no "best" version with more copies than all
-    others have then the object is also marked as inconsistent.
-  info_ru: |
-    Vitastor старается определить повреждённые части объектов при использовании
-    EC (кодов коррекции ошибок) с более, чем 1 диском чётности, путём перебора
-    всех возможных комбинаций ошибочных частей. Данное значение конфигурации
-    ограничивает число перебираемых комбинаций. Вы можете попробовать поднять
-    его, если используете схему кодирования EC N+K с N и K, достаточно большими
-    для того, чтобы число сочетаний `C(N+K-1, K-1) = (N+K-1)! / (K-1)! / N!`
-    было больше, чем стандартное значение 100.
-
-    Если возможных комбинаций слишком много или если корректная комбинаций не
-    определяется однозначно, объекты помечаются неконсистентными (inconsistent)
-    и не восстанавливаются автоматически.
-
-    При использовании репликации перебор не нужен, Vitastor просто предполагает,
-    что вариант объекта с наибольшим количеством одинаковых копий корректен.
-    Например, если вы используете 3 реплики и 1 из них отличается, эта 1 копия
-    считается некорректной. Однако, если "лучшую" версию с числом доступных
-    копий большим, чем у всех других версий, найти невозможно, то объект тоже
-    маркируется неконсистентным.
--- a/docs/installation/packages.en.md
+++ b/docs/installation/packages.en.md
@@ -45,10 +45,3 @@
 - etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
  for example [#12402](https://github.com/etcd-io/etcd/pull/12402).
 - node.js 10 or newer
-
-## Version archive
-
-All previous Vitastor and other components (QEMU, etcd...) package builds
-can be found here:
-
-https://vitastor.io/archive/
--- a/docs/installation/packages.ru.md
+++ b/docs/installation/packages.ru.md
@@ -44,10 +44,3 @@
 - etcd 3.4.15 или новее. Более старые версии не будут работать из-за разных багов,
  например, [#12402](https://github.com/etcd-io/etcd/pull/12402).
 - node.js 10 или новее
-
-## Архив предыдущих версий
-
-Все предыдущие сборки пакетов Vitastor и других компонентов, таких, как QEMU
-и etcd, можно скачать по следующей ссылке:
-
-https://vitastor.io/archive/
--- a/docs/intro/features.en.md
+++ b/docs/intro/features.en.md
@@ -29,7 +29,6 @@
 - Snapshots and copy-on-write image clones
 - [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
 - [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)

 ## Plugins and tools

@@ -55,6 +54,7 @@ The following features are planned for the future:
 - iSCSI proxy
 - Multi-threaded client
 - Faster failover
+- Scrubbing without checksums (verification of replicas)
 - Checksums
 - Tiered storage (SSD caching)
 - NVDIMM support
--- a/docs/intro/features.ru.md
+++ b/docs/intro/features.ru.md
@@ -31,7 +31,6 @@
 - Снапшоты и copy-on-write клоны
 - [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
 - [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)

 ## Драйверы и инструменты

@@ -55,6 +54,7 @@
 - iSCSI-прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
+- Фоновая проверка целостности без контрольных сумм (сверка реплик)
 - Контрольные суммы
 - Поддержка SSD-кэширования (tiered storage)
 - Поддержка NVDIMM
--- a/docs/usage/cli.en.md
+++ b/docs/usage/cli.en.md
@@ -20,8 +20,6 @@ It supports the following commands:
 - [flatten](#flatten)
 - [rm-data](#rm-data)
 - [merge-data](#merge-data)
- [describe](#describe)
- [fix](#fix)
 - [alloc-osd](#alloc-osd)
 - [rm-osd](#rm-osd)

@@ -176,51 +174,6 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
 `<to>` must be a child of `<from>` and `<target>` may be one of the layers between
 `<from>` and `<to>`, including `<from>` and `<to>`.

-## describe
-
-`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
-    [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
-    [--min-offset <offset>] [--max-offset <offset>]`
-
-Describe unclean object locations in the cluster.
-
-```
--osds <osds>
-    Only list objects from primary OSD(s) <osds>.
--object-state <states>
-    Only list objects in given state(s). State(s) may include:
-    degraded, misplaced, incomplete, corrupted, inconsistent.
--pool <pool name or number>
-    Only list objects in the given pool.
--inode, --min-inode, --max-inode
-    Restrict listing to specific inode numbers.
--min-offset, --max-offset
-    Restrict listing to specific offsets inside inodes.
-```
-
-## fix
-
-`vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]`
-
-Fix inconsistent objects in the cluster by deleting some copies.
-
-```
--objects <objects>
-    Objects to fix, either in plain text or JSON format. If not specified,
-    object list will be read from STDIN in one of the same formats.
-    Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...
-    JSON format: [{"inode":"0x...","stripe":"0x..."},...]
--bad-osds <osds>
-    Remove inconsistent copies/parts of objects from these OSDs, effectively
-    marking them bad and allowing Vitastor to recover objects from other copies.
--part <number>
-    Only remove EC part <number> (from 0 to pg_size-1), required for extreme
-    edge cases where one OSD has multiple parts of a EC object.
--check no
-    Do not recheck that requested objects are actually inconsistent,
-    delete requested copies/parts anyway.
-```
-
 ## alloc-osd

 `vitastor-cli alloc-osd`
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@@ -184,59 +184,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 в целевой образ `<target>`. `<to>` должен быть дочерним образом `<from>`, а `<target>`
 должен быть одним из слоёв между `<from>` и `<to>`, включая сами `<from>` и `<to>`.

-## describe
-
-`vitastor-cli describe [--osds <osds>] [--object-state <состояния>] [--pool <пул>]
-    [--inode <номер>] [--min-inode <номер>] [--max-inode <номер>]
-    [--min-offset <смещение>] [--max-offset <смещение>]`
-
-Описать состояние "грязных" объектов в кластере, то есть таких объектов, копии
-или части которых хранятся на наборе OSD, не равном целевому.
-
-```
--osds <osds>
-    Перечислять только объекты с первичных OSD из списка <osds>.
--object-state <состояния>
-    Перечислять только объекты в указанных состояниях. Возможные состояния
-    объектов:
-    - degraded - деградированная избыточность
-    - misplaced - перемещённый
-    - incomplete - нечитаемый из-за потери большего числа частей, чем допустимо
-    - corrupted - с одной или более повреждённой частью
-    - inconsistent - неконсистентный, с неоднозначным расхождением копий/частей
--pool <имя или ID пула>
-    Перечислять только объекты из заданного пула.
--inode, --min-inode, --max-inode
-    Перечислять только объекты из указанных номеров инодов (образов).
--min-offset, --max-offset
-    Перечислять только объекты с заданных смещений внутри образов.
-```
-
-## fix
-
-`vitastor-cli fix [--objects <объекты>] [--bad-osds <osds>] [--part <номер>] [--check no]`
-
-Исправить неконсистентные (неоднозначные) объекты путём удаления части копий.
-
-```
--objects <объекты>
-    Объекты для исправления - в простом текстовом или JSON формате. Если опция
-    не указана, список объектов читается со стандартного ввода в тех же форматах.
-    Простой формат: 0x<инод>:0x<смещение> <любой разделитель> 0x<инод>:0x<смещение> ...
-    Формат JSON: [{"inode":"0x<инод>","stripe":"0x<смещение>"},...]
--bad-osds <osds>
-    Удалить неконсистентные копии/части объектов с данных OSD, таким образом
-    признавая потерю этих копий и позволяя Vitastor-у восстановить объекты из
-    других копий.
--part <номер>
-    Удалить только части EC с заданным номером (от 0 до pg_size-1). Нужно только
-    в редких граничных случаях, когда один и тот же OSD содержит несколько частей
-    одного EC-объекта.
--check no
-    Не перепроверять, что заданные объекты действительно в неконсистентном
-    состоянии и просто удалять заданные части.
-```
-
 ## alloc-osd

 `vitastor-cli alloc-osd`
--- a/docs/usage/nbd.en.md
+++ b/docs/usage/nbd.en.md
@@ -25,23 +25,6 @@ It will output a block device name like /dev/nbd0 which you can then use as a no

 You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.

-Additional options for map command:
-
-* `--nbd_timeout 30` \
-  Timeout for I/O operations in seconds after exceeding which the kernel stops
-  the device. You can set it to 0 to disable the timeout, but beware that you
-  won't be able to stop the device at all if vitastor-nbd process dies.
-* `--nbd_max_devices 64 --nbd_max_part 3` \
-  Options for the `nbd` kernel module when modprobing it (`nbds_max` and `max_part`).
-  note that maximum allowed (nbds_max)*(1+max_part) is 256.
-* `--logfile /path/to/log/file.txt` \
-  Write log messages to the specified file instead of dropping them (in background mode)
-  or printing them to the standard output (in foreground mode).
-* `--dev_num N` \
-  Use the specified device /dev/nbdN instead of automatic selection.
-* `--foreground 1` \
-  Stay in foreground, do not daemonize.
-
 ## Unmap image

 To unmap the device run:
@@ -49,27 +32,3 @@ To unmap the device run:
 ```
 vitastor-nbd unmap /dev/nbd0
 ```
-
-## List mapped images
-
-```
-vitastor-nbd ls [--json]
-```
-
-Example output (normal format):
-
-```
-/dev/nbd0
-image: bench
-pid: 584536
-
-/dev/nbd1
-image: bench1
-pid: 584546
-```
-
-Example output (JSON format):
-
-```
-{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
-```
--- a/docs/usage/nbd.ru.md
+++ b/docs/usage/nbd.ru.md
@@ -30,27 +30,6 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 Для обращения по номеру инода, аналогично другим командам, можно использовать опции
 `--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.

-Дополнительные опции для команды подключения NBD-устройства:
-
-* `--nbd_timeout 30` \
-  Максимальное время выполнения любой операции чтения/записи в секундах, при
-  превышении которого ядро остановит NBD-устройство. Вы можете установить опцию
-  в 0, чтобы отключить ограничение времени, но имейте в виду, что в этом случае
-  вы вообще не сможете отключить NBD-устройство при нештатном завершении процесса
-  vitastor-nbd.
-* `--nbd_max_devices 64 --nbd_max_part 3` \
-  Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd
-  (`nbds_max` и `max_part`). Имейте в виду, что (nbds_max)*(1+max_part)
-  обычно не должно превышать 256.
-* `--logfile /path/to/log/file.txt` \
-  Писать сообщения о процессе работы в заданный файл, вместо пропуска их
-  при фоновом режиме запуска или печати на стандартный вывод при запуске
-  в консоли с `--foreground 1`.
-* `--dev_num N` \
-  Использовать заданное устройство `/dev/nbdN` вместо автоматического подбора.
-* `--foreground 1` \
-  Не уводить процесс в фоновый режим.
-
 ## Отключить устройство

 Для отключения устройства выполните:
@@ -58,27 +37,3 @@ vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 ```
 vitastor-nbd unmap /dev/nbd0
 ```
-
-## Вывести подключённые устройства
-
-```
-vitastor-nbd ls [--json]
-```
-
-Пример вывода в обычном формате:
-
-```
-/dev/nbd0
-image: bench
-pid: 584536
-
-/dev/nbd1
-image: bench1
-pid: 584546
-```
-
-Пример вывода в JSON-формате:
-
-```
-{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
-```
--- a/mon/lp-optimizer.js
+++ b/mon/lp-optimizer.js
@@ -50,7 +50,8 @@ async function lp_solve(text)
    return { score, vars };
 }

-async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
+async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, hier_sizes = null,
+    max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
 {
    if (!pg_count || !osd_tree)
    {
@@ -58,7 +59,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
    }
    const all_weights = Object.assign({}, ...Object.values(osd_tree));
    const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
-    const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
+    const all_pgs = Object.values(random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout));
    const pg_per_osd = {};
    for (const pg of all_pgs)
    {
@@ -216,39 +217,45 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
    return move_weights;
 }

-function add_valid_previous(osd_tree, prev_weights, all_pgs)
+function build_parent_per_leaf(osd_tree, res = {}, parents = []) // maps every leaf key to the list of its ancestor keys, outermost first
+{
+    for (const item in osd_tree)
+    {
+        if (osd_tree[item] instanceof Object) // nested node: recurse with this key appended to the ancestor path
+            build_parent_per_leaf(osd_tree[item], res, [ ...parents, item ]);
+        else
+            res[item] = parents; // leaf (non-object value): record the ancestor path accumulated so far
+    }
+    return res;
+}
+
+function add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes) // hier_sizes[i] caps how many OSDs of one PG may share a level-i ancestor
 {
     // Add previous combinations that are still valid
-    const hosts = Object.keys(osd_tree).sort();
-    const host_per_osd = {};
-    for (const host in osd_tree)
-    {
-        for (const osd in osd_tree[host])
-        {
-            host_per_osd[osd] = host;
-        }
-    }
+    const parent_per_osd = build_parent_per_leaf(osd_tree); // OSD => [ ancestor keys ]
     skip_pg: for (const pg_name in prev_weights)
     {
-        const seen_hosts = {};
+        const seen = {}; // ancestor key => number of this PG's OSDs found under it
         const pg = pg_name.substr(3).split(/_/);
         for (const osd of pg)
         {
-            if (!host_per_osd[osd] || seen_hosts[host_per_osd[osd]])
-            {
+            if (!parent_per_osd[osd]) // OSD is not present in the current tree
                 continue skip_pg;
+            for (let i = 0; i < parent_per_osd[osd].length; i++)
+            {
+                seen[parent_per_osd[osd][i]] = (seen[parent_per_osd[osd][i]] || 0) + 1; // start at 0: undefined++ is NaN, and NaN > limit is never true
+                if (seen[parent_per_osd[osd][i]] > hier_sizes[i])
+                    continue skip_pg;
             }
-            seen_hosts[host_per_osd[osd]] = true;
         }
         if (!all_pgs[pg_name])
-        {
             all_pgs[pg_name] = pg;
-        }
     }
 }

 // Try to minimize data movement
-async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
+async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2,
+    hier_sizes = null, max_combinations = 10000, parity_space = 1, ordered = false, seq_layout = false })
 {
    if (!osd_tree)
    {
@@ -273,10 +280,10 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
    }
    const old_pg_size = prev_int_pgs[0].length;
    // Get all combinations
-    let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
+    let all_pgs = random_hier_combinations(osd_tree, hier_sizes || [ pg_size, 1 ], max_combinations, parity_space > 1, seq_layout);
    if (old_pg_size == pg_size)
    {
-        add_valid_previous(osd_tree, prev_weights, all_pgs);
+        add_valid_previous(osd_tree, prev_weights, all_pgs, hier_sizes || [ pg_size, 1 ]);
    }
    all_pgs = Object.values(all_pgs);
    const pg_per_osd = {};
@@ -502,41 +509,147 @@ function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
    }
 }

-// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
+// Convert multi-level tree_node = { level: number|string, id?: string, size?: number, children?: tree_node[] }
 // levels = { string: number }
-// to a two-level osd_tree suitable for all_combinations()
-function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
+// to a multi-level OSD tree suitable for random_hier_combinations()
+// (or in case of just 2 levels - for all_combinations() / random_combinations())
+//
+// Example:
+// tree_node = { level: 'dc', children: [ { level: 'rack', children: [ { level: 'host', children: [ { level: 'osd', size: 10 } ] } ] } ] }
+// extract_levels = [ 'rack', 'osd' ]
+// level_defs = { dc: 1, rack: 2, host: 3, osd: 4 }
+//
+// Result:
+// { rack1: { osd1: 10 } }
+function extract_tree_levels(tree_node, extract_levels, level_defs, new_tree = { idx: 1, items: {} })
 {
-    osd_level = levels[osd_level] || osd_level;
-    failure_domain_level = levels[failure_domain_level] || failure_domain_level;
-    for (const node of osd_tree)
+    const next_level = Number(level_defs[extract_levels[0]] || extract_levels[0]) || 0; // numeric rank of the first wanted level (0 if not numeric)
+    const level_name = level_defs[extract_levels[0]] ? extract_levels[0] : 'l'+extract_levels[0]+'_'; // key prefix: level name if defined, else 'l<level>_'
+    const is_leaf = extract_levels.length == 1; // last wanted level => store scalar sizes instead of subtrees
+    if ((level_defs[tree_node.level] || tree_node.level) >= next_level) // node's level number has reached the wanted one
    {
-        if ((levels[node.level] || node.level) < failure_domain_level)
+        if (!is_leaf)
        {
-            flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
+            // Insert a (possibly fake) level
+            const nt = { idx: 1, items: {} }; // every inserted level numbers its own children starting from 1
+            new_tree.items[level_name+(new_tree.idx++)] = nt.items;
+            extract_tree_levels(tree_node, extract_levels.slice(1), level_defs, nt); // same node is matched against the remaining levels
        }
        else
        {
-            domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
+            // Insert a leaf node
+            const leaf_id = tree_node.id || (level_name+(new_tree.idx++)); // idx is consumed only when the node has no id
+            new_tree.items[leaf_id] = tree_node.size;
        }
    }
-    return domains;
+    else
+    {
+        for (const child_node of tree_node.children||[]) // node is above the wanted level: search its children
+        {
+            extract_tree_levels(child_node, extract_levels, level_defs, new_tree);
+        }
+    }
+    return new_tree.items; // plain { key: subtree | size } map, without the { idx, items } wrapper
 }

-function extract_osds(osd_tree, levels, osd_level, osds = {})
+// generate random PGs with hierarchical failure domains, i.e. for example 3 DC each with 2 HOSTS
+// osd_tree = { level3_id: { level2_id: { level1_id: scalar_value } }, ... }
+//   osd_tree may contain arbitrary number of levels, but level count must be the same across the whole tree
+// size_per_level = number of items to select on each level, for example [3, 2, 1].
+//   must have the same number of items as the osd_tree level count.
+// count = PG count to generate
+// ordered = don't treat (x,y) and (y,x) as equal
+// seq_layout = true for the [DC1,DC1,DC2,DC2,DC3,DC3] layout, false for [DC1,DC2,DC3,DC1,DC2,DC3] layout
+function random_hier_combinations(osd_tree, size_per_level, count, ordered, seq_layout) // returns { 'pg_<a>_<b>...': [ a, b, ... ] }
 {
-    for (const node of osd_tree)
+    let seed = 0x5f020e43; // fixed seed: the generated sequence is the same on every call
+    const rng = () =>
    {
-        if ((levels[node.level] || node.level) >= osd_level)
+        seed ^= seed << 13;
+        seed ^= seed >> 17;
+        seed ^= seed << 5;
+        return seed + 2147483648; // shift the signed 32-bit value into the non-negative range
+    };
+    const get_max_level = (o) =>
+    {
+        let lvl = 0;
+        while (o instanceof Object)
        {
-            osds[node.id] = node.size;
+            // Follow the first child on every level; an empty level ends the descent
+            const first = Object.keys(o)[0];
+            if (first === undefined)
+                break;
+            lvl++;
+            o = o[first];
        }
-        else
+        return lvl;
+    };
+    const max_level = get_max_level(osd_tree) || size_per_level.length; // empty tree: still produce a PG padded with NO_OSD
+    const gen_pg = (select) =>
+    {
+        let pg = [ osd_tree ];
+        for (let level = 0; level < max_level; level++)
        {
-            extract_osds(node.children||[], levels, osd_level, osds);
+            let npg = [];
+            for (let i = 0; i < pg.length; i++)
+            {
+                const keys = pg[i] instanceof Object ? Object.keys(pg[i]) : []; // non-objects (e.g. NO_OSD) have no children
+                const max_keys = keys.length < size_per_level[level] ? keys.length : size_per_level[level];
+                for (let j = 0; j < max_keys; j++)
+                {
+                    const r = select(level, i, j, (ordered ? keys.length : (keys.length - (max_keys - j - 1))));
+                    const el = pg[i][keys[r]] instanceof Object ? pg[i][keys[r]] : keys[r]; // subtree, or the leaf's key on the last level
+                    npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = el;
+                    keys.splice(ordered ? r : 0, ordered ? 1 : (r+1)); // unordered: drop everything up to the pick, so picks follow key order
+                }
+                for (let j = max_keys; j < size_per_level[level]; j++)
+                    npg[seq_layout ? i*size_per_level[level]+j : j*pg.length+i] = NO_OSD;
+            }
+            pg = npg;
        }
+        return pg;
+    };
+    const r = {};
+    // Generate random combinations including each OSD at least once
+    let has_next = true;
+    let ctr = [];
+    while (has_next)
+    {
+        let pg = gen_pg((level, i, j, n) =>
+        {
+            if (i == 0 && j == 0)
+            {
+                // Select a pre-determined OSD in the first position on each level
+                const r = ctr[level] == null || ctr[level][1] != n ? 0 : ctr[level][0];
+                ctr[level] = [ r, n ];
+                return r;
+            }
+            return rng() % n;
+        });
+        for (let i = ctr.length-1; i >= 0; i--)
+        {
+            ctr[i][0]++;
+            if (ctr[i][0] < ctr[i][1])
+                break;
+            else
+                ctr[i] = null;
+        }
+        has_next = ctr[0] != null;
+        const cyclic_pgs = [ pg ];
+        if (ordered)
+            for (let i = 1; i < pg.size; i++) // NOTE(review): arrays have no .size (undefined), so this never runs - .length intended? confirm
+                cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
+        for (const pg of cyclic_pgs)
+            r['pg_'+pg.join('_')] = pg;
    }
-    return osds;
+    // Generate purely random combinations
+    while (count > 0)
+    {
+        let pg = gen_pg((l, i, j, n) => rng() % n);
+        r['pg_'+pg.join('_')] = pg;
+        count--;
+    }
+    return r;
 }

 // ordered = don't treat (x,y) and (y,x) as equal
@@ -752,11 +865,12 @@ module.exports = {
    pg_weights_space_efficiency,
    pg_list_space_efficiency,
    pg_per_osd_space_efficiency,
-    flatten_tree,
+    extract_tree_levels,

    lp_solve,
    make_int_pgs,
    align_pgs,
    random_combinations,
+    random_hier_combinations,
    all_combinations,
 };
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -104,21 +104,12 @@ const etcd_tree = {
            autosync_writes: 128,
            client_queue_depth: 128, // unused
            recovery_queue_depth: 4,
-            recovery_pg_switch: 128,
            recovery_sync_batch: 16,
            no_recovery: false,
            no_rebalance: false,
            print_stats_interval: 3,
            slow_log_interval: 10,
            inode_vanish_time: 60,
-            auto_scrub: false,
-            no_scrub: false,
-            scrub_interval: '30d', // 1s/1m/1h/1d
-            scrub_queue_depth: 1,
-            scrub_sleep: 0, // milliseconds
-            scrub_list_limit: 1000, // objects to list on one scrub iteration
-            scrub_find_best: true,
-            scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterators
            // blockstore - fixed in superblock
            block_size,
            disk_alignment,
@@ -168,6 +159,10 @@ const etcd_tree = {
                // number of parity chunks, required for EC
                parity_chunks?: 1,
                pg_count: 100,
+                // failure_domain = string | { string: int }
+                // the second case specifies multiple failure domains. example:
+                // { datacenter: 3, host: 2 } - means 3 datacenters with 2 hosts each, for EC 4+2
+                // guarantees availability on outage of either 1 datacenter or 2 hosts
                failure_domain: 'host',
                max_osd_combinations: 10000,
                // block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
@@ -181,8 +176,6 @@ const etcd_tree = {
                osd_tags?: 'nvme' | [ 'nvme', ... ],
                // prefer to put primary on OSD with these tags
                primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
-                // scrub interval
-                scrub_interval?: '30d',
            },
            ...
        }, */
@@ -278,7 +271,7 @@ const etcd_tree = {
                    primary: osd_num_t,
                    state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
-                        "has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
+                        "has_invalid"|"left_on_dead")[],
                }
            }, */
        },
@@ -300,7 +293,6 @@ const etcd_tree = {
                    osd_sets: osd_num_t[][],
                    all_peers: osd_num_t[],
                    epoch: uint64_t,
-                    next_scrub: uint64_t,
                },
            }, */
        },
@@ -1039,6 +1031,32 @@ class Mon
        pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
        pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
        pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
+        if (pool_cfg.failure_domain instanceof Object) // multi-level form: { level name: item count }
+        {
+            for (const key in pool_cfg.failure_domain)
+            {
+                const cnt = parseInt(pool_cfg.failure_domain[key]);
+                if (!cnt || cnt <= 0) // NaN, zero and negative counts are all rejected
+                {
+                    if (warn)
+                        console.log('Pool '+pool_id+' specifies invalid item count for failure domain \"'+key+'\"');
+                    return false;
+                }
+                if (key !== 'host' && key != 'osd' && !(key in (this.config.placement_levels||{}))) // `in` binds tighter than `||`
+                {
+                    if (warn)
+                        console.log('Pool '+pool_id+' uses invalid failure domain \"'+key+'\"');
+                    return false;
+                }
+            }
+        }
+        else if (pool_cfg.failure_domain !== 'host' && pool_cfg.failure_domain != 'osd' &&
+            !(pool_cfg.failure_domain in (this.config.placement_levels||{}))) // parenthesised so a missing config means "no extra levels"
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' uses invalid failure domain \"'+pool_cfg.failure_domain+'\"');
+            return false;
+        }
        pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
        if (!/^[1-9]\d*$/.exec(''+pool_id))
        {
@@ -1124,27 +1142,23 @@ class Mon
    filter_osds_by_tags(orig_tree, flat_tree, tags)
    {
        if (!tags)
-        {
-            return;
-        }
+            return 1; // no tag filter: report the subtree as kept
        for (const tag of (tags instanceof Array ? tags : [ tags ]))
        {
-            for (const host in flat_tree)
+            for (const item in flat_tree)
            {
-                let found = 0;
-                for (const osd in flat_tree[host])
+                if (flat_tree[item] instanceof Object) // nested level: filter it recursively
                {
-                    if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
-                        delete flat_tree[host][osd];
-                    else
-                        found++;
-                }
-                if (!found)
-                {
-                    delete flat_tree[host];
+                    if (!this.filter_osds_by_tags(orig_tree, flat_tree[item], tags)) // class method: must be called through `this`
+                        delete flat_tree[item]; // drop levels left without any item
                }
+                else if (!orig_tree[item].tags || !orig_tree[item].tags[tag]) // leaf: looked up in orig_tree by its key
+                    delete flat_tree[item];
            }
        }
+        for (const item in flat_tree) // 1 if at least one item survived the filtering, else 0
+            return 1;
+        return 0;
    }

    get_affinity_osds(pool_cfg, up_osds, osd_tree)
@@ -1203,9 +1217,11 @@ class Mon
                {
                    continue;
                }
-                let pool_tree = osd_tree[pool_cfg.root_node || ''];
-                pool_tree = pool_tree ? pool_tree.children : [];
-                pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
+                let pool_tree = osd_tree[pool_cfg.root_node || ''] || {};
+                const failure_domains = pool_cfg.failure_domain instanceof Object
+                    ? [ ...Object.keys(pool_cfg.failure_domain), 'osd' ]
+                    : [ pool_cfg.failure_domain, 'osd' ];
+                pool_tree = LPOptimizer.extract_tree_levels(pool_tree, failure_domains, levels);
                this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
                // These are for the purpose of building history.osd_sets
                const real_prev_pgs = [];
@@ -1232,6 +1248,9 @@ class Mon
                    pg_count: pool_cfg.pg_count,
                    pg_size: pool_cfg.pg_size,
                    pg_minsize: pool_cfg.pg_minsize,
+                    hier_sizes: pool_cfg.failure_domain instanceof Object
+                        ? [ ...Object.values(pool_cfg.failure_domain), 1 ]
+                        : null,
                    max_combinations: pool_cfg.max_osd_combinations,
                    ordered: pool_cfg.scheme != 'replicated',
                };
@@ -1287,7 +1306,7 @@ class Mon
                    } });
                }
                LPOptimizer.print_change_stats(optimize_result);
-                const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
+                const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length); // FIXME requires hier support too
                this.state.pool.stats[pool_id] = {
                    used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
                    total_raw_tb: optimize_result.space,
--- a/mon/test-optimize-undersized.js
+++ b/mon/test-optimize-undersized.js
@@ -36,7 +36,7 @@ const crush_tree = [
    ] },
 ];

-const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
+const osd_tree = LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {});
 console.log(osd_tree);

 async function run()
@@ -47,32 +47,32 @@ async function run()
    LPOptimizer.print_change_stats(res, false);
    assert(res.space == 0);
    console.log('\nAdding 1st failure domain:');
-    cur_tree['dom1'] = osd_tree['dom1'];
+    cur_tree['l1_1'] = osd_tree['l1_1'];
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
    LPOptimizer.print_change_stats(res, false);
    assert(res.space == 12 && res.total_space == 12);
    console.log('\nAdding 2nd failure domain:');
-    cur_tree['dom2'] = osd_tree['dom2'];
+    cur_tree['l1_2'] = osd_tree['l1_2'];
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
    LPOptimizer.print_change_stats(res, false);
    assert(res.space == 24 && res.total_space == 24);
    console.log('\nAdding 3rd failure domain:');
-    cur_tree['dom3'] = osd_tree['dom3'];
+    cur_tree['l1_3'] = osd_tree['l1_3'];
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
    LPOptimizer.print_change_stats(res, false);
    assert(res.space == 36 && res.total_space == 36);
    console.log('\nRemoving 3rd failure domain:');
-    delete cur_tree['dom3'];
+    delete cur_tree['l1_3'];
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
    LPOptimizer.print_change_stats(res, false);
    assert(res.space == 24 && res.total_space == 24);
    console.log('\nRemoving 2nd failure domain:');
-    delete cur_tree['dom2'];
+    delete cur_tree['l1_2'];
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
    LPOptimizer.print_change_stats(res, false);
    assert(res.space == 12 && res.total_space == 12);
    console.log('\nRemoving 1st failure domain:');
-    delete cur_tree['dom1'];
+    delete cur_tree['l1_1'];
    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
    LPOptimizer.print_change_stats(res, false);
    assert(res.space == 0);
--- a/mon/test-optimize.js
+++ b/mon/test-optimize.js
@@ -108,7 +108,11 @@ async function run()
    LPOptimizer.print_change_stats(res, false);

    console.log('\n256 PGs, size=3, failure domain=rack');
-    res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 });
+    res = await LPOptimizer.optimize_initial({
+        osd_tree: LPOptimizer.extract_tree_levels({ level: -Infinity, children: crush_tree }, [ 1, 3 ], {}),
+        pg_size: 3,
+        pg_count: 256,
+    });
    LPOptimizer.print_change_stats(res, false);
 }

--- a/mon/test-random-hier.js
+++ b/mon/test-random-hier.js
@@ -0,0 +1,56 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+const LPOptimizer = require('./lp-optimizer.js');
+
+const osd_tree = {
+    100: { 110: { 111: 1, 112: 1 }, 120: { 121: 1, 122: 1 } },
+    200: { 210: { 211: 1, 212: 1 }, 220: { 221: 1, 222: 1 } },
+    300: { 310: { 311: 1, 312: 1 }, 320: { 321: 1, 322: 1 } },
+    400: { 410: { 411: 1, 412: 1 }, 420: { 421: 1, 422: 1 } },
+    500: { 510: { 511: 1, 512: 1 }, 520: { 521: 1, 522: 1 } },
+};
+
+const osd_tree2 = {
+    100: { 111: 1, 112: 1, 121: 1, 122: 1 },
+    200: { 211: 1, 212: 1, 221: 1, 222: 1 },
+    300: { 311: 1, 312: 1, 321: 1, 322: 1 },
+    400: { 411: 1, 412: 1, 421: 1, 422: 1 },
+    500: { 511: 1, 512: 1, 521: 1, 522: 1 },
+};
+
+const osd_tree3 = {
+    100: { 111: 1, 112: 1, 121: 1, 122: 1 },
+    200: { 211: 1, 212: 1, 221: 1, 222: 1 },
+    300: { 311: 1, 312: 1, 321: 1, 322: 1 },
+    400: { 411: 1, 412: 1, 421: 1, 422: 1 },
+    500: { 511: 1 },
+};
+
+async function run()
+{
+    let r;
+    console.log(r = LPOptimizer.random_hier_combinations(osd_tree, [ 3, 2, 1 ], 10000, false, true));
+    console.log(r = LPOptimizer.random_hier_combinations(osd_tree2, [ 3, 2 ], 0, false, true));
+    // Will contain 'Z':
+    console.log(r = LPOptimizer.random_combinations(osd_tree2, 6, 0, true));
+    console.log(r = LPOptimizer.extract_tree_levels(
+        { level: 'dc', children: [
+            { level: 'rack', children: [
+                { level: 'host', children: [
+                    { level: 'osd', id: 'OSD5', size: 10 },
+                ] },
+            ] },
+            { level: 'osd', id: 'OSD10', size: 10 },
+        ] },
+        [ 'rack', 'osd' ],
+        { dc: 1, rack: 2, host: 3, osd: 4 }
+    ));
+    if (JSON.stringify(r) != '{"rack1":{"OSD5":10},"rack2":{"OSD10":10}}')
+        throw new Error('extract_tree_levels failed');
+    // should not contain Z:
+    console.log(r = LPOptimizer.random_hier_combinations(osd_tree3, [ 3, 2 ], 0, false, true));
+    console.log('OK');
+}
+
+run().catch(console.error);
--- a/patches/VitastorPlugin.pm
+++ b/patches/VitastorPlugin.pm
@@ -388,6 +388,8 @@ sub unmap_volume
    my ($class, $storeid, $scfg, $volname, $snapname) = @_;
    my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';

+    return 1 if !$scfg->{vitastor_nbd};
+
    my ($vtype, $name, $vmid) = $class->parse_volname($volname);
    $name .= '@'.$snapname if $snapname;

@@ -411,7 +413,7 @@ sub activate_volume
 sub deactivate_volume
 {
    my ($class, $storeid, $scfg, $volname, $snapname, $cache) = @_;
-    $class->unmap_volume($storeid, $scfg, $volname, $snapname) if $scfg->{vitastor_nbd};
+    $class->unmap_volume($storeid, $scfg, $volname, $snapname);
    return 1;
 }

--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '0.9.1'
+VERSION = '0.8.9'

 LOG = logging.getLogger(__name__)

--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.9.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.1$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.8.9/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.9$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.1.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.8.9.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.1
+Version:        0.8.9
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.1.el7.tar.gz
+Source0:        vitastor-0.8.9.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.1.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.8.9.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.1
+Version:        0.8.9
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.1.el8.tar.gz
+Source0:        vitastor-0.8.9.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el9.Dockerfile
+++ b/rpm/vitastor-el9.Dockerfile
@@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.9.1.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.8.9.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el9.spec
+++ b/rpm/vitastor-el9.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.9.1
+Version:        0.8.9
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.9.1.el9.tar.gz
+Source0:        vitastor-0.8.9.el9.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
@@ -73,7 +73,7 @@ Vitastor library headers for development.
 Summary:        Vitastor - fio drivers
 Group:          Development/Libraries
 Requires:       vitastor-client = %{version}-%{release}
-Requires:       fio = 3.27-8.el9
+Requires:       fio = 3.27-7.el9


 %description -n vitastor-fio
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="0.9.1")
+add_definitions(-DVERSION="0.8.9")
 add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
 	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
 add_executable(vitastor-osd
 	osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
 	osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
-	osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp osd_primary_describe.cpp
+	osd_cluster.cpp osd_rmw.cpp
 )
 target_link_libraries(vitastor-osd
 	vitastor_common
@@ -141,8 +141,6 @@ add_library(vitastor_client SHARED
 	cli_common.cpp
 	cli_alloc_osd.cpp
 	cli_status.cpp
-	cli_describe.cpp
-	cli_fix.cpp
 	cli_df.cpp
 	cli_ls.cpp
 	cli_create.cpp
@@ -301,7 +299,7 @@ add_executable(test_cluster_client
 	EXCLUDE_FROM_ALL
 	test_cluster_client.cpp
 	pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
-	etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
+	etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
 )
 target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
 target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
--- a/src/blockstore.h
+++ b/src/blockstore.h
@@ -73,10 +73,7 @@ Input:
  write request is copied into the metadata area bitwise and stored there.

 Output:
- retval = number of bytes actually read/written or negative error number
-  -EINVAL = invalid input parameters
-  -ENOENT = requested object/version does not exist for reads
-  -ENOSPC = no space left in the store for writes
+- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
 - version = the version actually read or written

 ## BS_OP_DELETE
@@ -125,14 +122,11 @@ Output:
 Get a list of all objects in this Blockstore.

 Input:
- pg_alignment = PG alignment
- pg_count = PG count or 0 to list all objects
- pg_number = PG number
- list_stable_limit = max number of clean objects in the reply
-  it's guaranteed that dirty objects are returned from the same interval,
-  i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
- min_oid = min inode/stripe or 0 to list all objects
- max_oid = max inode/stripe or 0 to list all objects
+- oid.stripe = PG alignment
+- len = PG count or 0 to list all objects
+- offset = PG number
+- oid.inode = min inode number or 0 to list all inodes
+- version = max inode number or 0 to list all inodes

 Output:
 - retval = total obj_ver_id count
@@ -149,27 +143,10 @@ struct blockstore_op_t
    uint64_t opcode;
    // finish callback
    std::function<void (blockstore_op_t*)> callback;
-    union __attribute__((__packed__))
-    {
-        // R/W
-        struct __attribute__((__packed__))
-        {
-            object_id oid;
-            uint64_t version;
-            uint32_t offset;
-            uint32_t len;
-        };
-        // List
-        struct __attribute__((__packed__))
-        {
-            object_id min_oid;
-            object_id max_oid;
-            uint32_t pg_alignment;
-            uint32_t pg_count;
-            uint32_t pg_number;
-            uint32_t list_stable_limit;
-        };
-    };
+    object_id oid;
+    uint64_t version;
+    uint32_t offset;
+    uint32_t len;
    void *buf;
    void *bitmap;
    int retval;
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@@ -536,27 +536,14 @@ resume_1:
                return false;
            }
            // zero out old metadata entry
-            {
-                clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
-                if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
-                {
-                    printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %lu (%lx:%lx v%lu) as old location of %lx:%lx\n",
-                        old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
-                        old_entry->version, cur.oid.inode, cur.oid.stripe);
-                    exit(1);
-                }
-            }
            memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
-            if (meta_old.sector != meta_new.sector)
-            {
-                await_sqe(15);
-                data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
-                data->callback = simple_callback_w;
-                my_uring_prep_writev(
-                    sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
-                );
-                wait_count++;
-            }
+            await_sqe(15);
+            data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
+            data->callback = simple_callback_w;
+            my_uring_prep_writev(
+                sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_old.sector
+            );
+            wait_count++;
        }
        if (has_delete)
        {
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -462,11 +462,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint

 void blockstore_impl_t::process_list(blockstore_op_t *op)
 {
-    uint32_t list_pg = op->pg_number+1;
-    uint32_t pg_count = op->pg_count;
-    uint64_t pg_stripe_size = op->pg_alignment;
-    uint64_t min_inode = op->min_oid.inode;
-    uint64_t max_inode = op->max_oid.inode;
+    uint32_t list_pg = op->offset+1;
+    uint32_t pg_count = op->len;
+    uint64_t pg_stripe_size = op->oid.stripe;
+    uint64_t min_inode = op->oid.inode;
+    uint64_t max_inode = op->version;
    // Check PG
    if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
    {
@@ -513,13 +513,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
            stable_alloc += clean_db.size();
        }
    }
-    if (op->list_stable_limit > 0)
-    {
-        stable_alloc = op->list_stable_limit;
-        if (stable_alloc > 1024*1024)
-            stable_alloc = 1024*1024;
-    }
-    if (stable_alloc < 32768)
+    else
    {
        stable_alloc = 32768;
    }
@@ -530,22 +524,22 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        FINISH_OP(op);
        return;
    }
-    auto max_oid = op->max_oid;
-    bool limited = false;
-    pool_pg_id_t last_shard_id = 0;
    for (auto shard_it = clean_db_shards.lower_bound(first_shard);
        shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
        shard_it++)
    {
        auto & clean_db = shard_it->second;
        auto clean_it = clean_db.begin(), clean_end = clean_db.end();
-        if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
+        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
        {
-            clean_it = clean_db.lower_bound(op->min_oid);
-        }
-        if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
-        {
-            clean_end = clean_db.upper_bound(max_oid);
+            clean_it = clean_db.lower_bound({
+                .inode = min_inode,
+                .stripe = 0,
+            });
+            clean_end = clean_db.upper_bound({
+                .inode = max_inode,
+                .stripe = UINT64_MAX,
+            });
        }
        for (; clean_it != clean_end; clean_it++)
        {
@@ -564,29 +558,11 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
                .oid = clean_it->first,
                .version = clean_it->second.version,
            };
-            if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
-            {
-                if (!limited)
-                {
-                    limited = true;
-                    max_oid = stable[stable_count-1].oid;
-                }
-                break;
-            }
        }
-        if (op->list_stable_limit > 0)
-        {
-            // To maintain the order, we have to include objects in the same range from other shards
-            if (last_shard_id != 0 && last_shard_id != shard_it->first)
-                std::sort(stable, stable+stable_count);
-            if (stable_count > op->list_stable_limit)
-                stable_count = op->list_stable_limit;
-        }
-        last_shard_id = shard_it->first;
    }
-    if (op->list_stable_limit == 0 && first_shard != last_shard)
+    if (first_shard != last_shard)
    {
-        // If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
+        // If that's not a per-PG listing, sort clean entries
        std::sort(stable, stable+stable_count);
    }
    int clean_stable_count = stable_count;
@@ -595,17 +571,20 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
    obj_ver_id *unstable = NULL;
    {
        auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
-        if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
+        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
        {
            dirty_it = dirty_db.lower_bound({
-                .oid = op->min_oid,
+                .oid = {
+                    .inode = min_inode,
+                    .stripe = 0,
+                },
                .version = 0,
            });
-        }
-        if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
-        {
            dirty_end = dirty_db.upper_bound({
-                .oid = max_oid,
+                .oid = {
+                    .inode = max_inode,
+                    .stripe = UINT64_MAX,
+                },
                .version = UINT64_MAX,
            });
        }
@@ -649,11 +628,6 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
                            stable[stable_count++] = dirty_it->first;
                        }
                    }
-                    if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
-                    {
-                        // Stop here
-                        break;
-                    }
                }
                else
                {
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -124,8 +124,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
    bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
    if (!clean_found && !dirty_found)
    {
+        // region is not allocated - return zeroes
+        memset(read_op->buf, 0, read_op->len);
        read_op->version = 0;
-        read_op->retval = -ENOENT;
+        read_op->retval = read_op->len;
        FINISH_OP(read_op);
        return 2;
    }
@@ -138,16 +140,14 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        {
            dirty_entry& dirty = dirty_it->second;
            bool version_ok = !IS_IN_FLIGHT(dirty.state) && read_op->version >= dirty_it->first.version;
+            if (IS_SYNCED(dirty.state))
+            {
+                if (!version_ok && read_op->version != 0)
+                    read_op->version = dirty_it->first.version;
+                version_ok = true;
+            }
            if (version_ok)
            {
-                if (IS_DELETE(dirty.state))
-                {
-                    assert(!result_version);
-                    read_op->version = 0;
-                    read_op->retval = -ENOENT;
-                    FINISH_OP(read_op);
-                    return 2;
-                }
                if (!result_version)
                {
                    result_version = dirty_it->first.version;
@@ -234,19 +234,12 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
            }
        }
    }
-    if (!result_version)
-    {
-        // May happen if there are entries in dirty_db but all of them are !version_ok
-        read_op->version = 0;
-        read_op->retval = -ENOENT;
-        FINISH_OP(read_op);
-        return 2;
-    }
-    if (fulfilled < read_op->len)
+    else if (fulfilled < read_op->len)
    {
+        // fill remaining parts with zeroes
        assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
-        assert(fulfilled == read_op->len);
    }
+    assert(fulfilled == read_op->len);
    read_op->version = result_version;
    if (!PRIV(read_op)->pending_ops)
    {
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@@ -179,7 +179,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
    {
        object_id oid = dirty_it->first.oid;
 #ifdef BLOCKSTORE_DEBUG
-        printf("Unblock writes-after-delete %lx:%lx v%lu\n", oid.inode, oid.stripe, dirty_it->first.version);
+        printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
 #endif
        dirty_it = dirty_end;
        // Unblock operations blocked by delete flushing
--- a/src/blockstore_stable.cpp
+++ b/src/blockstore_stable.cpp
@@ -103,7 +103,7 @@ blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
    blockstore_op_t *sync_op = new blockstore_op_t;
    sync_op->opcode = BS_OP_SYNC;
    sync_op->buf = NULL;
-    sync_op->callback = [](blockstore_op_t *sync_op)
+    sync_op->callback = [this](blockstore_op_t *sync_op)
    {
        delete sync_op;
    };
@@ -244,7 +244,7 @@ int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_
        // Make a wrapped callback
        int *split_op_counter = (int*)malloc_or_die(sizeof(int));
        *split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
-        auto cb = [op, good_items = good_vers.items,
+        auto cb = [this, op, good_items = good_vers.items,
            bad_items = bad_vers.items, split_op_counter,
            orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
        {
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@@ -6,7 +6,7 @@
 bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
    // Check or assign version number
-    bool found = false, deleted = false, unsynced = false, is_del = (op->opcode == BS_OP_DELETE);
+    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
    bool wait_big = false, wait_del = false;
    void *bmp = NULL;
    uint64_t version = 1;
@@ -26,7 +26,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
-            unsynced = !IS_SYNCED(dirty_it->second.state);
            wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
            wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
                ? !IS_SYNCED(dirty_it->second.state)
@@ -82,28 +81,10 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            wait_del = true;
            PRIV(op)->real_version = op->version;
            op->version = version;
-            if (unsynced)
-            {
-                // Issue an additional sync so the delete reaches the journal
-                blockstore_op_t *sync_op = new blockstore_op_t;
-                sync_op->opcode = BS_OP_SYNC;
-                sync_op->callback = [this, op](blockstore_op_t *sync_op)
-                {
-                    flusher->unshift_flush((obj_ver_id){
-                        .oid = op->oid,
-                        .version = op->version-1,
-                    }, true);
-                    delete sync_op;
-                };
-                enqueue_op(sync_op);
-            }
-            else
-            {
-                flusher->unshift_flush((obj_ver_id){
-                    .oid = op->oid,
-                    .version = version-1,
-                }, true);
-            }
+            flusher->unshift_flush((obj_ver_id){
+                .oid = op->oid,
+                .version = version-1,
+            }, true);
        }
        else
        {
--- a/src/cli.cpp
+++ b/src/cli.cpp
@@ -73,37 +73,6 @@ static const char* help_text =
    "  <to> must be a child of <from> and <target> may be one of the layers between\n"
    "  <from> and <to>, including <from> and <to>.\n"
    "\n"
-    "vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>] [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>] [--min-offset <offset>] [--max-offset <offset>]\n"
-    "  Describe unclean object locations in the cluster.\n"
-    "  --osds <osds>\n"
-    "      Only list objects from primary OSD(s) <osds>.\n"
-    "  --object-state <states>\n"
-    "      Only list objects in given state(s). State(s) may include:\n"
-    "      degraded, misplaced, incomplete, corrupted, inconsistent.\n"
-    "  --pool <pool name or number>\n"
-    "      Only list objects in the given pool.\n"
-    "  --inode, --min-inode, --max-inode\n"
-    "      Restrict listing to specific inode numbers.\n"
-    "  --min-offset, --max-offset\n"
-    "      Restrict listing to specific offsets inside inodes.\n"
-    "\n"
-    "vitastor-cli fix [--objects <objects>] [--bad-osds <osds>] [--part <part>] [--check no]\n"
-    "  Fix inconsistent objects in the cluster by deleting some copies.\n"
-    "  --objects <objects>\n"
-    "      Objects to fix, either in plain text or JSON format. If not specified,\n"
-    "      object list will be read from STDIN in one of the same formats.\n"
-    "      Plain text format: 0x<inode>:0x<stripe> <any delimiter> 0x<inode>:0x<stripe> ...\n"
-    "      JSON format: [{\"inode\":\"0x...\",\"stripe\":\"0x...\"},...]\n"
-    "  --bad-osds <osds>\n"
-    "      Remove inconsistent copies/parts of objects from these OSDs, effectively\n"
-    "      marking them bad and allowing Vitastor to recover objects from other copies.\n"
-    "  --part <number>\n"
-    "      Only remove EC part <number> (from 0 to pg_size-1), required for extreme\n"
-    "      edge cases where one OSD has multiple parts of a EC object.\n"
-    "  --check no\n"
-    "      Do not recheck that requested objects are actually inconsistent,\n"
-    "      delete requested copies/parts anyway.\n"
-    "\n"
    "vitastor-cli alloc-osd\n"
    "  Allocate a new OSD number and reserve it by creating empty /osd/stats/<n> key.\n"
    "\n"
@@ -199,7 +168,6 @@ static json11::Json::object parse_args(int narg, const char *args[])
 static int run(cli_tool_t *p, json11::Json::object cfg)
 {
    cli_result_t result = {};
-    p->is_command_line = true;
    p->parse_config(cfg);
    json11::Json::array cmd = cfg["command"].array_items();
    cfg.erase("command");
@@ -308,16 +276,6 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
        }
        action_cb = p->start_rm(cfg);
    }
-    else if (cmd[0] == "describe")
-    {
-        // Describe unclean objects
-        action_cb = p->start_describe(cfg);
-    }
-    else if (cmd[0] == "fix")
-    {
-        // Fix inconsistent objects (by deleting some copies)
-        action_cb = p->start_fix(cfg);
-    }
    else if (cmd[0] == "alloc-osd")
    {
        // Allocate a new OSD number
--- a/src/cli.h
+++ b/src/cli.h
@@ -34,12 +34,12 @@ public:
    bool list_first = false;
    bool json_output = false;
    int log_level = 0;
-    bool is_command_line = false;
    bool color = false;

    ring_loop_t *ringloop = NULL;
    epoll_manager_t *epmgr = NULL;
    cluster_client_t *cli = NULL;
+    bool no_recovery = false, no_rebalance = false, readonly = false;

    int waiting = 0;
    cli_result_t etcd_err;
@@ -56,8 +56,6 @@ public:
    friend struct snap_remover_t;

    std::function<bool(cli_result_t &)> start_status(json11::Json);
-    std::function<bool(cli_result_t &)> start_describe(json11::Json);
-    std::function<bool(cli_result_t &)> start_fix(json11::Json);
    std::function<bool(cli_result_t &)> start_df(json11::Json);
    std::function<bool(cli_result_t &)> start_ls(json11::Json);
    std::function<bool(cli_result_t &)> start_create(json11::Json);
--- a/src/cli_describe.cpp
+++ b/src/cli_describe.cpp
@@ -1,256 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include "cli_fix.h"
-#include "cluster_client.h"
-#include "pg_states.h"
-#include "str_util.h"
-
-std::vector<uint64_t> parse_uint64_list(json11::Json val)
-{
-    std::vector<uint64_t> ret;
-    if (val.is_number())
-        ret.push_back(val.uint64_value());
-    else if (val.is_string())
-    {
-        const std::string & s = val.string_value();
-        for (int i = 0, p = -1; i <= s.size(); i++)
-        {
-            if (p < 0 && i < s.size() && (isdigit(s[i]) || s[i] == 'x'))
-                p = i;
-            else if (p >= 0 && (i >= s.size() || !isdigit(s[i]) && s[i] != 'x'))
-            {
-                ret.push_back(stoull_full(s.substr(p, i-p), 0));
-                p = -1;
-            }
-        }
-    }
-    else if (val.is_array())
-    {
-        for (auto & pg_num: val.array_items())
-            ret.push_back(pg_num.uint64_value());
-    }
-    return ret;
-}
-
-struct cli_describe_t
-{
-    uint64_t object_state = 0;
-    pool_id_t only_pool = 0;
-    std::vector<uint64_t> only_osds;
-    uint64_t min_inode = 0, max_inode = 0;
-    uint64_t min_offset = 0, max_offset = 0;
-
-    cli_tool_t *parent = NULL;
-    int state = 0;
-    int count = 0;
-
-    json11::Json options;
-    cli_result_t result;
-    json11::Json::array describe_items;
-
-    bool is_done()
-    {
-        return state == 100;
-    }
-
-    void parse_options(json11::Json cfg)
-    {
-        only_pool = cfg["pool"].uint64_value();
-        if (!only_pool && cfg["pool"].is_string())
-        {
-            for (auto & pp: parent->cli->st_cli.pool_config)
-            {
-                if (pp.second.name == cfg["pool"].string_value())
-                {
-                    only_pool = pp.first;
-                    break;
-                }
-            }
-        }
-        min_inode = cfg["inode"].uint64_value();
-        if (min_inode)
-        {
-            if (!INODE_POOL(min_inode))
-                min_inode |= (uint64_t)only_pool << (64-POOL_ID_BITS);
-            max_inode = min_inode;
-            min_offset = max_offset = 0;
-        }
-        else
-        {
-            min_inode = stoull_full(cfg["min_inode"].string_value(), 0); // to support 0x...
-            max_inode = stoull_full(cfg["max_inode"].string_value(), 0);
-            min_offset = stoull_full(cfg["min_offset"].string_value(), 0);
-            max_offset = stoull_full(cfg["max_offset"].string_value(), 0);
-            if (!min_inode && !max_inode && only_pool)
-            {
-                min_inode = (uint64_t)only_pool << (64-POOL_ID_BITS);
-                max_inode = ((uint64_t)only_pool << (64-POOL_ID_BITS)) |
-                    (((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
-            }
-        }
-        only_osds = parse_uint64_list(cfg["osds"]);
-        object_state = stoull_full(cfg["object_state"].string_value(), 0);
-        if (!object_state && cfg["object_state"].is_string())
-        {
-            if (cfg["object_state"].string_value().find("inconsistent") != std::string::npos)
-                object_state |= OBJ_INCONSISTENT;
-            if (cfg["object_state"].string_value().find("corrupted") != std::string::npos)
-                object_state |= OBJ_CORRUPTED;
-            if (cfg["object_state"].string_value().find("incomplete") != std::string::npos)
-                object_state |= OBJ_INCOMPLETE;
-            if (cfg["object_state"].string_value().find("degraded") != std::string::npos)
-                object_state |= OBJ_DEGRADED;
-            if (cfg["object_state"].string_value().find("misplaced") != std::string::npos)
-                object_state |= OBJ_MISPLACED;
-        }
-    }
-
-    void loop()
-    {
-        if (state == 1)
-            goto resume_1;
-        if (state == 100)
-            return;
-        parse_options(options);
-        if (min_inode && !INODE_POOL(min_inode))
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
-            state = 100;
-            return;
-        }
-        if (!only_osds.size())
-        {
-            uint64_t min_pool = min_inode >> (64-POOL_ID_BITS);
-            uint64_t max_pool = max_inode >> (64-POOL_ID_BITS);
-            for (auto & pp: parent->cli->st_cli.pool_config)
-            {
-                if (pp.first >= min_pool && (!max_pool || pp.first <= max_pool))
-                {
-                    for (auto & pgp: pp.second.pg_config)
-                        only_osds.push_back(pgp.second.cur_primary);
-                }
-            }
-        }
-        remove_duplicates(only_osds);
-        parent->cli->init_msgr();
-        if (parent->json_output && parent->is_command_line)
-        {
-            printf("[\n");
-        }
-        for (int i = 0; i < only_osds.size(); i++)
-        {
-            osd_op_t *op = new osd_op_t;
-            op->req = (osd_any_op_t){
-                .describe = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = parent->cli->next_op_id(),
-                        .opcode = OSD_OP_DESCRIBE,
-                    },
-                    .object_state = object_state,
-                    .min_inode = min_inode,
-                    .min_offset = min_offset,
-                    .max_inode = max_inode,
-                    .max_offset = max_offset,
-                },
-            };
-            op->callback = [this, osd_num = only_osds[i]](osd_op_t *op)
-            {
-                if (op->reply.hdr.retval < 0)
-                {
-                    fprintf(
-                        stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n",
-                        osd_num, op->reply.hdr.retval
-                    );
-                }
-                else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
-                {
-                    fprintf(
-                        stderr, "Invalid response size from OSD %lu (expected %lu bytes, got %lu bytes)\n",
-                        osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
-                    );
-                }
-                else
-                {
-                    osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
-                    for (int i = 0; i < op->reply.hdr.retval; i++)
-                    {
-                        if (!parent->json_output || parent->is_command_line)
-                        {
-#define FMT "{\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"part\":%u,\"osd_num\":%lu%s%s%s}"
-                            printf(
-                                (parent->json_output
-                                    ? (count > 0 ? ",\n  " FMT : "  " FMT)
-                                    : "%lx:%lx part %u on OSD %lu%s%s%s\n"),
-#undef FMT
-                                items[i].inode, items[i].stripe,
-                                items[i].role, items[i].osd_num,
-                                (items[i].loc_bad & LOC_CORRUPTED ? (parent->json_output ? ",\"corrupted\":true" : " corrupted") : ""),
-                                (items[i].loc_bad & LOC_INCONSISTENT ? (parent->json_output ? ",\"inconsistent\":true" : " inconsistent") : ""),
-                                (items[i].loc_bad & LOC_OUTDATED ? (parent->json_output ? ",\"outdated\":true" : " outdated") : "")
-                            );
-                        }
-                        else
-                        {
-                            auto json_item = json11::Json::object {
-                                { "inode", (uint64_t)items[i].inode },
-                                { "stripe", (uint64_t)items[i].stripe },
-                                { "part", (uint64_t)items[i].role },
-                                { "osd_num", (uint64_t)items[i].osd_num },
-                            };
-                            if (items[i].loc_bad & LOC_CORRUPTED)
-                                json_item["corrupted"] = true;
-                            if (items[i].loc_bad & LOC_INCONSISTENT)
-                                json_item["inconsistent"] = true;
-                            if (items[i].loc_bad & LOC_OUTDATED)
-                                json_item["outdated"] = true;
-                            describe_items.push_back(json_item);
-                        }
-                        count++;
-                    }
-                }
-                delete op;
-                parent->waiting--;
-                if (!parent->waiting)
-                    loop();
-            };
-            parent->waiting++;
-            parent->cli->execute_raw(only_osds[i], op);
-        }
-    resume_1:
-        state = 1;
-        if (parent->waiting > 0)
-        {
-            return;
-        }
-        if (parent->json_output && parent->is_command_line)
-        {
-            printf(count > 0 ? "\n]\n" : "]\n");
-        }
-        else
-        {
-            result.data = describe_items;
-        }
-        state = 100;
-        describe_items.clear();
-    }
-};
-
-std::function<bool(cli_result_t &)> cli_tool_t::start_describe(json11::Json cfg)
-{
-    auto describer = new cli_describe_t();
-    describer->parent = this;
-    describer->options = cfg;
-    return [describer](cli_result_t & result)
-    {
-        describer->loop();
-        if (describer->is_done())
-        {
-            result = describer->result;
-            delete describer;
-            return true;
-        }
-        return false;
-    };
-}
--- a/src/cli_fix.cpp
+++ b/src/cli_fix.cpp
@@ -1,313 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include "cli_fix.h"
-#include "cluster_client.h"
-#include "pg_states.h"
-#include "str_util.h"
-
-struct cli_fix_t
-{
-    std::vector<object_id> objects;
-    int part = -1;
-    int processed_count = 0;
-    std::set<osd_num_t> bad_osds;
-    bool no_check = false;
-
-    cli_tool_t *parent = NULL;
-    int state = 0;
-
-    json11::Json options;
-    cli_result_t result;
-    json11::Json::array fix_result;
-
-    bool is_done()
-    {
-        return state == 100;
-    }
-
-    void parse_objects_str(std::string str)
-    {
-        str = trim(str);
-        if (str[0] == '[')
-        {
-            std::string json_err;
-            json11::Json list = json11::Json::parse(str, json_err);
-            if (json_err != "")
-                fprintf(stderr, "Invalid JSON object list input: %s\n", json_err.c_str());
-            else
-                parse_object_list(list);
-        }
-        else
-        {
-            const char *s = str.c_str();
-            char *e = NULL;
-            int len = str.size();
-            object_id oid;
-            for (int p = 0; p < len; p++)
-            {
-                if (isdigit(s[p]))
-                {
-                    int p0 = p;
-                    oid.inode = strtoull(s+p, &e, 0);
-                    p = e-s;
-                    while (p < len && !isdigit(s[p]) && s[p] != ':')
-                        p++;
-                    if (s[p] != ':')
-                    {
-                        fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
-                        continue;
-                    }
-                    p++;
-                    while (p < len && !isdigit(s[p]))
-                        p++;
-                    oid.stripe = strtoull(s+p, &e, 0) & ~STRIPE_MASK;
-                    p = e-s;
-                    if (oid.inode)
-                        objects.push_back(oid);
-                    else
-                        fprintf(stderr, "Invalid object ID in input: %s\n", std::string(s+p0, p-p0).c_str());
-                }
-            }
-        }
-    }
-
-    void parse_object_list(json11::Json list)
-    {
-        for (auto & obj: list.array_items())
-        {
-            object_id oid = (object_id){
-                .inode = stoull_full(obj["inode"].string_value(), 0),
-                .stripe = stoull_full(obj["stripe"].string_value(), 0) & ~STRIPE_MASK,
-            };
-            if (oid.inode)
-                objects.push_back(oid);
-            else
-                fprintf(stderr, "Invalid JSON object ID in input: %s, bad or missing \"inode\" field\n", obj.dump().c_str());
-        }
-    }
-
-    void parse_options(json11::Json cfg)
-    {
-        json11::Json object_list;
-        if (cfg["objects"].is_null())
-            parse_objects_str(read_all_fd(0));
-        else if (cfg["objects"].is_string())
-            parse_objects_str(cfg["objects"].string_value());
-        else
-            parse_object_list(cfg["objects"].array_items());
-        for (auto osd_num: parse_uint64_list(cfg["bad_osds"]))
-            bad_osds.insert(osd_num);
-        no_check = json_is_false(cfg["check"]);
-        if (cfg["part"].is_number() || cfg["part"].is_string())
-            part = cfg["part"].uint64_value();
-    }
-
-    void loop()
-    {
-        if (state == 1)
-            goto resume_1;
-        if (state == 100)
-            return;
-        parse_options(options);
-        if (!objects.size())
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Object list is not specified" };
-            state = 100;
-            return;
-        }
-        if (!bad_osds.size())
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "OSDs are not specified" };
-            state = 100;
-            return;
-        }
-        remove_duplicates(objects);
-        parent->cli->init_msgr();
-    resume_1:
-        state = 1;
-        while (processed_count < objects.size())
-        {
-            if (parent->waiting >= parent->iodepth*parent->parallel_osds)
-            {
-                return;
-            }
-            auto & obj = objects[processed_count++];
-            auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
-            if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
-            {
-                fprintf(stderr, "Object %lx:%lx is from unknown pool\n", obj.inode, obj.stripe);
-                continue;
-            }
-            auto & pool_cfg = pool_cfg_it->second;
-            pg_num_t pg_num = (obj.stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
-            auto pg_it = pool_cfg.pg_config.find(pg_num);
-            if (pg_it == pool_cfg.pg_config.end() ||
-                !pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
-            {
-                fprintf(
-                    stderr, "Object %lx:%lx is from PG %u/%u which is not currently active\n",
-                    obj.inode, obj.stripe, pool_cfg_it->first, pg_num
-                );
-                continue;
-            }
-            osd_num_t primary_osd = pg_it->second.cur_primary;
-            // Describe -> Remove some copies -> Scrub again
-            osd_op_t *op = new osd_op_t;
-            op->req = (osd_any_op_t){
-                .describe = {
-                    .header = {
-                        .magic = SECONDARY_OSD_OP_MAGIC,
-                        .id = parent->cli->next_op_id(),
-                        .opcode = OSD_OP_DESCRIBE,
-                    },
-                    .min_inode = obj.inode,
-                    .min_offset = obj.stripe,
-                    .max_inode = obj.inode,
-                    .max_offset = obj.stripe,
-                },
-            };
-            op->callback = [this, primary_osd, &obj](osd_op_t *op)
-            {
-                if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
-                {
-                    fprintf(stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n", primary_osd, op->reply.hdr.retval);
-                    parent->waiting--;
-                    loop();
-                }
-                else
-                {
-                    osd_reply_describe_item_t *items = (osd_reply_describe_item_t *)op->buf;
-                    int *rm_count = (int*)malloc_or_die(sizeof(int));
-                    *rm_count = 1; // just in case if anything gets called instantly
-                    for (int i = 0; i < op->reply.hdr.retval; i++)
-                    {
-                        if (((items[i].loc_bad & LOC_INCONSISTENT) || no_check) &&
-                            bad_osds.find(items[i].osd_num) != bad_osds.end() &&
-                            (part == -1 || items[i].role == part))
-                        {
-                            // Remove
-                            uint64_t rm_osd_num = items[i].osd_num;
-                            osd_op_t *rm_op = new osd_op_t;
-                            rm_op->req = (osd_any_op_t){
-                                .sec_del = {
-                                    .header = {
-                                        .magic = SECONDARY_OSD_OP_MAGIC,
-                                        .id = parent->cli->next_op_id(),
-                                        .opcode = OSD_OP_SEC_DELETE,
-                                    },
-                                    .oid = {
-                                        .inode = op->req.describe.min_inode,
-                                        .stripe = op->req.describe.min_offset | items[i].role,
-                                    },
-                                    .version = 0,
-                                },
-                            };
-                            rm_op->callback = [this, primary_osd, rm_osd_num, rm_count, &obj](osd_op_t *rm_op)
-                            {
-                                (*rm_count)--;
-                                if (rm_op->reply.hdr.retval < 0)
-                                {
-                                    fprintf(
-                                        stderr, "Failed to remove object %lx:%lx from OSD %lu (retval=%ld)\n",
-                                        rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
-                                        rm_osd_num, rm_op->reply.hdr.retval
-                                    );
-                                }
-                                else if (parent->json_output)
-                                {
-                                    fix_result.push_back(json11::Json::object {
-                                        { "inode", (uint64_t)rm_op->req.sec_del.oid.inode },
-                                        { "stripe", (uint64_t)rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK },
-                                        { "part", (uint64_t)rm_op->req.sec_del.oid.stripe & STRIPE_MASK },
-                                        { "osd_num", (uint64_t)rm_osd_num },
-                                    });
-                                }
-                                else
-                                {
-                                    printf(
-                                        "Removed %lx:%lx (part %lu) from OSD %lu\n",
-                                        rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
-                                        rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
-                                    );
-                                }
-                                delete rm_op;
-                                if (!(*rm_count))
-                                {
-                                    // Scrub
-                                    free(rm_count);
-                                    osd_op_t *scrub_op = new osd_op_t;
-                                    scrub_op->req = (osd_any_op_t){
-                                        .rw = {
-                                            .header = {
-                                                .magic = SECONDARY_OSD_OP_MAGIC,
-                                                .id = parent->cli->next_op_id(),
-                                                .opcode = OSD_OP_SCRUB,
-                                            },
-                                            .inode = obj.inode,
-                                            .offset = obj.stripe,
-                                            .len = 0,
-                                        },
-                                    };
-                                    scrub_op->callback = [this, primary_osd, &obj](osd_op_t *scrub_op)
-                                    {
-                                        if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
-                                        {
-                                            fprintf(
-                                                stderr, "Failed to scrub %lx:%lx on OSD %lu (retval=%ld)\n",
-                                                obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
-                                            );
-                                        }
-                                        delete scrub_op;
-                                        parent->waiting--;
-                                        loop();
-                                    };
-                                    parent->cli->execute_raw(primary_osd, scrub_op);
-                                }
-                            };
-                            (*rm_count)++;
-                            parent->cli->execute_raw(rm_osd_num, rm_op);
-                        }
-                    }
-                    (*rm_count)--;
-                    if (!*rm_count)
-                    {
-                        free(rm_count);
-                        parent->waiting--;
-                        loop();
-                    }
-                }
-                delete op;
-            };
-            parent->waiting++;
-            parent->cli->execute_raw(primary_osd, op);
-        }
-        if (parent->waiting > 0)
-        {
-            return;
-        }
-        if (parent->json_output)
-        {
-            result.data = fix_result;
-        }
-        state = 100;
-    }
-};
-
-std::function<bool(cli_result_t &)> cli_tool_t::start_fix(json11::Json cfg)
-{
-    auto fixer = new cli_fix_t();
-    fixer->parent = this;
-    fixer->options = cfg;
-    return [fixer](cli_result_t & result)
-    {
-        fixer->loop();
-        if (fixer->is_done())
-        {
-            result = fixer->result;
-            delete fixer;
-            return true;
-        }
-        return false;
-    };
-}
--- a/src/cli_fix.h
+++ b/src/cli_fix.h
@@ -1,26 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#pragma once
-
-#include "cli.h"
-#include <algorithm>
-
-std::vector<uint64_t> parse_uint64_list(json11::Json val);
-
-template<class T> void remove_duplicates(std::vector<T> & ret)
-{
-    if (!ret.size())
-        return;
-    std::sort(ret.begin(), ret.end());
-    int j = 0;
-    for (int i = 1; i < ret.size(); i++)
-    {
-        if (ret[i] != ret[j])
-            ret[++j] = ret[i];
-    }
-    ret.resize(j+1);
-}
-
-// from http_client.cpp...
-bool json_is_false(const json11::Json & val);
--- a/src/cli_rm_osd.cpp
+++ b/src/cli_rm_osd.cpp
@@ -410,17 +410,14 @@ struct rm_osd_t
                        parent->cli->st_cli.etcd_prefix+"/pg/history/"+
                        std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
                    );
-                    auto hist = json11::Json::object {
-                        { "epoch", pg_cfg.epoch },
-                        { "all_peers", pg_cfg.all_peers },
-                        { "osd_sets", pg_cfg.target_history },
-                    };
-                    if (pg_cfg.next_scrub)
-                        hist["next_scrub"] = pg_cfg.next_scrub;
                    history_updates.push_back(json11::Json::object {
                        { "request_put", json11::Json::object {
                            { "key", history_key },
-                            { "value", base64_encode(json11::Json(hist).dump()) },
+                            { "value", base64_encode(json11::Json(json11::Json::object {
+                                { "epoch", pg_cfg.epoch },
+                                { "all_peers", pg_cfg.all_peers },
+                                { "osd_sets", pg_cfg.target_history },
+                            }).dump()) },
                        } },
                    });
                    history_checks.push_back(json11::Json::object {
--- a/src/cli_status.cpp
+++ b/src/cli_status.cpp
@@ -201,7 +201,6 @@ resume_2:
        bool readonly = json_is_true(parent->cli->config["readonly"]);
        bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
        bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
-        bool no_scrub = json_is_true(parent->cli->config["no_scrub"]);
        if (parent->json_output)
        {
            // JSON output
@@ -220,7 +219,6 @@ resume_2:
                { "readonly", readonly },
                { "no_recovery", no_recovery },
                { "no_rebalance", no_rebalance },
-                { "no_scrub", no_scrub },
                { "pool_count", pool_count },
                { "active_pool_count", pools_active },
                { "pg_states", pgs_by_state },
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@@ -35,7 +35,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
            // peer_osd just connected
            continue_ops();
            continue_lists();
-            continue_raw_ops(peer_osd);
        }
        else if (dirty_buffers.size())
        {
@@ -105,19 +104,6 @@ cluster_op_t::~cluster_op_t()
    }
 }

-void cluster_client_t::continue_raw_ops(osd_num_t peer_osd)
-{
-    auto it = raw_ops.find(peer_osd);
-    while (it != raw_ops.end() && it->first == peer_osd)
-    {
-        auto op = it->second;
-        op->op_type = OSD_OP_OUT;
-        op->peer_fd = msgr.osd_peer_fds.at(peer_osd);
-        msgr.outbox_push(op);
-        raw_ops.erase(it++);
-    }
-}
-
 void cluster_client_t::init_msgr()
 {
    if (msgr_initialized)
@@ -526,23 +512,6 @@ void cluster_client_t::execute(cluster_op_t *op)
    }
 }

-void cluster_client_t::execute_raw(osd_num_t osd_num, osd_op_t *op)
-{
-    auto fd_it = msgr.osd_peer_fds.find(osd_num);
-    if (fd_it != msgr.osd_peer_fds.end())
-    {
-        op->op_type = OSD_OP_OUT;
-        op->peer_fd = fd_it->second;
-        msgr.outbox_push(op);
-    }
-    else
-    {
-        if (msgr.wanted_peers.find(osd_num) == msgr.wanted_peers.end())
-            msgr.connect_peer(osd_num, st_cli.peer_states[osd_num]);
-        raw_ops.emplace(osd_num, op);
-    }
-}
-
 void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
 {
    // Save operation for replay when one of PGs goes out of sync
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@@ -103,7 +103,6 @@ class cluster_client_t
    ring_consumer_t consumer;
    std::vector<std::function<void(void)>> on_ready_hooks;
    std::vector<inode_list_t*> lists;
-    std::multimap<osd_num_t, osd_op_t*> raw_ops;
    int continuing_ops = 0;
    bool msgr_initialized = false;

@@ -119,7 +118,6 @@ public:
    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
    ~cluster_client_t();
    void execute(cluster_op_t *op);
-    void execute_raw(osd_num_t osd_num, osd_op_t *op);
    bool is_ready();
    void on_ready(std::function<void(void)> fn);

@@ -155,5 +153,4 @@ protected:
    void continue_lists();
    void continue_listing(inode_list_t *lst);
    void send_list(inode_list_osd_t *cur_list);
-    void continue_raw_ops(osd_num_t peer_osd);
 };
--- a/src/disk_tool_utils.cpp
+++ b/src/disk_tool_utils.cpp
@@ -55,6 +55,23 @@ std::string realpath_str(std::string path, bool nofail)
    return rp;
 }

+// Read everything from file descriptor <fd> until EOF and return it as a string.
+// Reads failing with EINTR or EAGAIN are retried immediately (so a non-blocking
+// FD is busy-polled); any other read error stops the loop and the data received
+// so far is returned. The descriptor is not closed here.
+std::string read_all_fd(int fd)
+{
+    const size_t chunk_size = 1024;
+    // size_t instead of int: the total may exceed INT_MAX
+    size_t res_size = 0;
+    std::string res;
+    while (1)
+    {
+        // Make room for one more chunk after the data received so far
+        res.resize(res_size+chunk_size);
+        // read() returns ssize_t - keep the full width instead of truncating to int.
+        // &res[pos] gives a writable pointer without casting away const from data()
+        ssize_t r = read(fd, &res[res_size], res.size()-res_size);
+        if (r > 0)
+            res_size += r;
+        else if (!r || (errno != EAGAIN && errno != EINTR))
+            break; // EOF or a non-retryable error
+    }
+    // Cut off the unused tail of the last chunk
+    res.resize(res_size);
+    return res;
+}
+
 std::string read_file(std::string file, bool allow_enoent)
 {
    std::string res;
--- a/src/etcd_state_client.cpp
+++ b/src/etcd_state_client.cpp
@@ -7,8 +7,8 @@
 #ifndef __MOCK__
 #include "addr_util.h"
 #include "http_client.h"
-#endif
 #include "str_util.h"
+#endif

 etcd_state_client_t::~etcd_state_client_t()
 {
@@ -777,10 +777,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
                continue;
            }
-            // Scrub Interval
-            pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
-            if (!pc.scrub_interval)
-                pc.scrub_interval = 0;
            // Immediate Commit Mode
            pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
                ? (pool_item.second["immediate_commit"].string_value() == "all"
@@ -923,8 +919,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
            }
            // Read epoch
            pg_cfg.epoch = value["epoch"].uint64_value();
-            // Next scrub timestamp (0 or empty = scrub is not needed)
-            pg_cfg.next_scrub = value["next_scrub"].uint64_value();
            if (on_change_pg_history_hook != NULL)
            {
                on_change_pg_history_hook(pool_id, pg_num);
--- a/src/etcd_state_client.h
+++ b/src/etcd_state_client.h
@@ -39,7 +39,6 @@ struct pg_config_t
    osd_num_t cur_primary;
    int cur_state;
    uint64_t epoch;
-    uint64_t next_scrub;
 };

 struct pool_config_t
@@ -56,7 +55,6 @@ struct pool_config_t
    uint64_t max_osd_combinations;
    uint64_t pg_stripe_size;
    std::map<pg_num_t, pg_config_t> pg_config;
-    uint64_t scrub_interval;
 };

 struct inode_config_t
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -251,10 +251,6 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
        return;
    }
    clients[peer_fd] = new osd_client_t();
-    if (log_level > 0)
-    {
-        fprintf(stderr, "Connecting to OSD %lu at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
-    }
    clients[peer_fd]->peer_addr = addr;
    clients[peer_fd]->peer_port = peer_port;
    clients[peer_fd]->peer_fd = peer_fd;
@@ -317,10 +313,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
    if (epoll_events & EPOLLRDHUP)
    {
        // Stop client
-        if (log_level > 0)
-        {
-            fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
-        }
+        fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
        stop_client(peer_fd, true);
    }
    else if (epoll_events & EPOLLIN)
--- a/src/messenger.h
+++ b/src/messenger.h
@@ -50,7 +50,7 @@ struct osd_client_t

    sockaddr_storage peer_addr;
    int peer_port;
-    int peer_fd = -1;
+    int peer_fd;
    int peer_state;
    int connect_timeout_id = -1;
    int ping_time_remaining = 0;
@@ -87,7 +87,11 @@ struct osd_client_t
    std::vector<iovec> send_list, next_send_list;
    std::vector<msgr_sendp_t> outbox, next_outbox;

-    ~osd_client_t();
+    // Releases in_buf only. In this revision the peer FD and the RDMA
+    // connection are released by osd_messenger_t::stop_client(), not here
+    ~osd_client_t()
+    {
+        if (in_buf != NULL)
+        {
+            free(in_buf);
+            in_buf = NULL;
+        }
+    }
 };

 struct osd_wanted_peer_t
--- a/src/msgr_receive.cpp
+++ b/src/msgr_receive.cpp
@@ -251,6 +251,10 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
        }
        cl->read_remaining = cur_op->req.sec_read_bmp.len;
    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_READ)
+    {
+        cl->read_remaining = 0;
+    }
    else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
    {
        if (cur_op->req.rw.len > 0)
@@ -270,12 +274,6 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
        }
        cl->read_remaining = cur_op->req.show_conf.json_len;
    }
-    /*else if (cur_op->req.hdr.opcode == OSD_OP_READ ||
-        cur_op->req.hdr.opcode == OSD_OP_SCRUB ||
-        cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
-    {
-        cl->read_remaining = 0;
-    }*/
    if (cl->read_remaining > 0)
    {
        // Read data
@@ -369,16 +367,6 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
        op->buf = malloc_or_die(op->reply.hdr.retval);
        cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
    }
-    else if (op->reply.hdr.opcode == OSD_OP_DESCRIBE && op->reply.hdr.retval > 0)
-    {
-        delete cl->read_op;
-        cl->read_op = op;
-        cl->read_state = CL_READ_REPLY_DATA;
-        cl->read_remaining = op->reply.describe.result_bytes;
-        free(op->buf);
-        op->buf = malloc_or_die(op->reply.describe.result_bytes);
-        cl->recv_list.push_back(op->buf, op->reply.describe.result_bytes);
-    }
    else
    {
 reuse:
--- a/src/msgr_send.cpp
+++ b/src/msgr_send.cpp
@@ -73,8 +73,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
        ? (cur_op->req.hdr.opcode == OSD_OP_READ ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
-        cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG ||
-        cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
+        cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
        : (cur_op->req.hdr.opcode == OSD_OP_WRITE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
--- a/src/msgr_stop.cpp
+++ b/src/msgr_stop.cpp
@@ -122,6 +122,17 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
        // Cancel outbound operations
        cancel_osd_ops(cl);
    }
+#ifndef __MOCK__
+    // And close the FD only when everything is done
+    // ...because peer_fd number can get reused after close()
+    close(peer_fd);
+#ifdef WITH_RDMA
+    if (cl->rdma_conn)
+    {
+        delete cl->rdma_conn;
+    }
+#endif
+#endif
    // Find the item again because it can be invalidated at this point
    it = clients.find(peer_fd);
    if (it != clients.end())
@@ -134,25 +145,3 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
        delete cl;
    }
 }
-
-osd_client_t::~osd_client_t()
-{
-    free(in_buf);
-    in_buf = NULL;
-    if (peer_fd >= 0)
-    {
-        // Close the FD only when the client is actually destroyed
-        // Which only happens when all references are cleared
-        close(peer_fd);
-        peer_fd = -1;
-    }
-#ifndef __MOCK__
-#ifdef WITH_RDMA
-    if (rdma_conn)
-    {
-        delete rdma_conn;
-        rdma_conn = NULL;
-    }
-#endif
-#endif
-}
--- a/src/nbd_proxy.cpp
+++ b/src/nbd_proxy.cpp
@@ -137,19 +137,12 @@ public:
            "OPTIONS:\n"
            "  All usual Vitastor config options like --etcd_address <etcd_address> plus NBD-specific:\n"
            "  --nbd_timeout 30\n"
-            "    Timeout for I/O operations in seconds after exceeding which the kernel stops\n"
-            "    the device. You can set it to 0 to disable the timeout, but beware that you\n"
-            "    won't be able to stop the device at all if vitastor-nbd process dies.\n"
+            "    timeout in seconds after which the kernel will stop the device\n"
+            "    you can set it to 0, but beware that you won't be able to stop the device at all\n"
+            "    if vitastor-nbd process dies\n"
            "  --nbd_max_devices 64 --nbd_max_part 3\n"
-            "    Options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
-            "    note that maximum allowed (nbds_max)*(1+max_part) is 256.\n"
-            "  --logfile /path/to/log/file.txt\n"
-            "    Wite log messages to the specified file instead of dropping them (in background mode)\n"
-            "    or printing them to the standard output (in foreground mode).\n"
-            "  --dev_num N\n"
-            "    Use the specified device /dev/nbdN instead of automatic selection.\n"
-            "  --foreground 1\n"
-            "    Stay in foreground, do not daemonize.n",
+            "    options for the \"nbd\" kernel module when modprobing it (nbds_max and max_part).\n"
+            "    note that maximum allowed (nbds_max)*(1+max_part) is 256.\n",
            exe_name, exe_name, exe_name
        );
        exit(0);
--- a/src/osd.cpp
+++ b/src/osd.cpp
@@ -13,7 +13,6 @@
 #include "osd_primary.h"
 #include "osd.h"
 #include "http_client.h"
-#include "str_util.h"

 static blockstore_config_t json_to_bs(const json11::Json::object & config)
 {
@@ -169,8 +168,6 @@ void osd_t::parse_config(bool init)
    no_rebalance = json_is_true(config["no_rebalance"]);
    auto old_no_recovery = no_recovery;
    no_recovery = json_is_true(config["no_recovery"]);
-    auto old_no_scrub = no_scrub;
-    no_scrub = json_is_true(config["no_scrub"]);
    auto old_autosync_interval = autosync_interval;
    if (!config["autosync_interval"].is_null())
    {
@@ -210,38 +207,6 @@ void osd_t::parse_config(bool init)
    inode_vanish_time = config["inode_vanish_time"].uint64_value();
    if (!inode_vanish_time)
        inode_vanish_time = 60;
-    auto old_auto_scrub = auto_scrub;
-    auto_scrub = json_is_true(config["auto_scrub"]);
-    global_scrub_interval = parse_time(config["scrub_interval"].string_value());
-    if (!global_scrub_interval)
-        global_scrub_interval = 30*86400;
-    scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
-    if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
-        scrub_queue_depth = 1;
-    scrub_find_best = !json_is_false(config["scrub_find_best"]);
-    scrub_ec_max_bruteforce = config["scrub_ec_max_bruteforce"].uint64_value();
-    if (scrub_ec_max_bruteforce < 1)
-        scrub_ec_max_bruteforce = 100;
-    scrub_sleep_ms = config["scrub_sleep"].uint64_value();
-    scrub_list_limit = config["scrub_list_limit"].uint64_value();
-    if (!scrub_list_limit)
-        scrub_list_limit = 1000;
-    if (!old_auto_scrub && auto_scrub)
-    {
-        // Schedule scrubbing
-        for (auto & pgp: pgs)
-        {
-            plan_scrub(pgp.second);
-        }
-    }
-    if (old_no_scrub && !no_scrub)
-    {
-        // Wakeup scrubbing
-        for (auto & pgp: pgs)
-        {
-            schedule_scrub(pgp.second);
-        }
-    }
    if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
        !(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
    {
@@ -372,8 +337,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
        cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
        cur_op->req.hdr.opcode != OSD_OP_READ &&
        cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
-        cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
-        cur_op->req.hdr.opcode != OSD_OP_DESCRIBE &&
        cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
    {
        // Readonly mode
@@ -404,14 +367,6 @@ void osd_t::exec_op(osd_op_t *cur_op)
    {
        continue_primary_del(cur_op);
    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
-    {
-        continue_primary_scrub(cur_op);
-    }
-    else if (cur_op->req.hdr.opcode == OSD_OP_DESCRIBE)
-    {
-        continue_primary_describe(cur_op);
-    }
    else
    {
        exec_secondary(cur_op);
@@ -476,10 +431,6 @@ void osd_t::print_stats()
            recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
        }
    }
-    if (corrupted_objects > 0)
-    {
-        printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
-    }
    if (incomplete_objects > 0)
    {
        printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
@@ -547,11 +498,10 @@ void osd_t::print_slow()
                else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
                {
                    bufprintf(
-                        " oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
-                        op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
-                        op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
+                        " inode=%lx-%lx pg=%u/%u, stripe=%lu",
+                        op->req.sec_list.min_inode, op->req.sec_list.max_inode,
                        op->req.sec_list.list_pg, op->req.sec_list.pg_count,
-                        op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
+                        op->req.sec_list.pg_stripe_size
                    );
                }
                else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
--- a/src/osd.h
+++ b/src/osd.h
@@ -28,7 +28,6 @@
 #define OSD_PEERING_PGS 0x04
 #define OSD_FLUSHING_PGS 0x08
 #define OSD_RECOVERING 0x10
-#define OSD_SCRUBBING 0x20

 #define MAX_AUTOSYNC_INTERVAL 3600
 #define DEFAULT_AUTOSYNC_INTERVAL 5
@@ -99,7 +98,6 @@ class osd_t
    bool run_primary = false;
    bool no_rebalance = false;
    bool no_recovery = false;
-    bool no_scrub = false;
    std::string bind_address;
    int bind_port, listen_backlog = 128;
    // FIXME: Implement client queue depth limit
@@ -115,13 +113,6 @@ class osd_t
    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    int inode_vanish_time = 60;
    int log_level = 0;
-    bool auto_scrub = false;
-    uint64_t global_scrub_interval = 30*86400;
-    uint64_t scrub_queue_depth = 1;
-    uint64_t scrub_sleep_ms = 0;
-    uint32_t scrub_list_limit = 1000;
-    bool scrub_find_best = true;
-    uint64_t scrub_ec_max_bruteforce = 100;

    // cluster state

@@ -144,24 +135,15 @@ class osd_t
    std::set<pool_pg_num_t> dirty_pgs;
    std::set<osd_num_t> dirty_osds;
    int copies_to_delete_after_sync_count = 0;
-    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, inconsistent_objects = 0, corrupted_objects = 0;
+    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
    int peering_state = 0;
    std::map<object_id, osd_recovery_op_t> recovery_ops;
-    std::map<object_id, osd_op_t*> scrub_ops;
    bool recovery_last_degraded = true;
    pool_pg_num_t recovery_last_pg;
    object_id recovery_last_oid;
    int recovery_pg_done = 0, recovery_done = 0;
    osd_op_t *autosync_op = NULL;

-    // Scrubbing
-    uint64_t scrub_nearest_ts = 0;
-    int scrub_timer_id = -1;
-    pool_pg_num_t scrub_last_pg = {};
-    osd_op_t *scrub_list_op = NULL;
-    pg_list_result_t scrub_cur_list = {};
-    uint64_t scrub_list_pos = 0;
-
    // Unstable writes
    uint64_t unstable_write_count = 0;
    std::map<osd_object_id_t, uint64_t> unstable_writes;
@@ -239,14 +221,6 @@ class osd_t
    bool continue_recovery();
    pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);

-    // scrub
-    void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
-    int pick_next_scrub(object_id & next_oid);
-    void submit_scrub_op(object_id oid);
-    bool continue_scrub();
-    void plan_scrub(pg_t & pg, bool report_state = true);
-    void schedule_scrub(pg_t & pg);
-
    // op execution
    void exec_op(osd_op_t *cur_op);
    void finish_op(osd_op_t *cur_op, int retval);
@@ -261,19 +235,13 @@ class osd_t
    void autosync();
    bool prepare_primary_rw(osd_op_t *cur_op);
    void continue_primary_read(osd_op_t *cur_op);
-    void continue_primary_scrub(osd_op_t *cur_op);
-    void continue_primary_describe(osd_op_t *cur_op);
    void continue_primary_write(osd_op_t *cur_op);
    void cancel_primary_write(osd_op_t *cur_op);
    void continue_primary_sync(osd_op_t *cur_op);
    void continue_primary_del(osd_op_t *cur_op);
    bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
-    pg_osd_set_state_t* add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
-        uint64_t old_pg_state, int log_at_level);
-    void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
-    pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
-        osd_rmw_stripe_t *stripes, bool ref, bool inconsistent);
-    void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
+    void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
+    void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
    bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
    void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
    void handle_primary_bs_subop(osd_op_t *subop);
@@ -288,11 +256,10 @@ class osd_t
    int submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);

-    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
+    uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);

    void continue_chained_read(osd_op_t *cur_op);
    int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
-    void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
    void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
    std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
    int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@@ -337,8 +337,6 @@ void osd_t::report_statistics()
        pg_stats["misplaced_count"] = pg.misplaced_objects.size();
        pg_stats["degraded_count"] = pg.degraded_objects.size();
        pg_stats["incomplete_count"] = pg.incomplete_objects.size();
-        if (pg.corrupted_count)
-            pg_stats["corrupted_count"] = pg.corrupted_count;
        pg_stats["write_osd_set"] = pg.cur_set;
        txn.push_back(json11::Json::object {
            { "request_put", json11::Json::object {
@@ -694,11 +692,6 @@ void osd_t::apply_pg_config()
                            pg_it->second.all_peers == vec_all_peers)
                        {
                            // No change in osd_set and history
-                            if (pg_it->second.next_scrub != pg_cfg.next_scrub)
-                            {
-                                pg_it->second.next_scrub = pg_cfg.next_scrub;
-                                schedule_scrub(pg_it->second);
-                            }
                            continue;
                        }
                        else
@@ -750,7 +743,6 @@ void osd_t::apply_pg_config()
                    .reported_epoch = pg_cfg.epoch,
                    .target_history = pg_cfg.target_history,
                    .all_peers = vec_all_peers,
-                    .next_scrub = pg_cfg.next_scrub,
                    .target_set = pg_cfg.target_set,
                };
                if (pg.scheme == POOL_SCHEME_EC)
@@ -891,8 +883,6 @@ void osd_t::report_pg_states()
                    { "all_peers", pg.all_peers },
                    { "osd_sets", pg.target_history },
                };
-                if (pg.next_scrub)
-                    history_value["next_scrub"] = pg.next_scrub;
                checks.push_back(json11::Json::object {
                    { "target", "MOD" },
                    { "key", history_key },
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@@ -192,9 +192,7 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
                op->bs_op = NULL;
                delete op;
            },
-            {
-                .len = (uint32_t)count,
-            },
+            .len = (uint32_t)count,
            .buf = op->buf,
        });
        bs->enqueue_op(op->bs_op);
@@ -305,25 +303,27 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
    };
    if (log_level > 2)
    {
-        printf("Submitting recovery operation for %lx:%lx (%s)\n", op->oid.inode, op->oid.stripe, op->degraded ? "degraded" : "misplaced");
+        printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
    }
-    op->osd_op->peer_fd = -1;
    op->osd_op->callback = [this, op](osd_op_t *osd_op)
    {
        if (osd_op->reply.hdr.retval < 0)
        {
            // Error recovering object
-            // EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
-            printf(
-                "[PG %u/%u] Recovery operation failed with object %lx:%lx: error %ld\n",
-                INODE_POOL(op->oid.inode),
-                map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
-                op->oid.inode, op->oid.stripe, osd_op->reply.hdr.retval
-            );
-        }
-        else if (log_level > 2)
-        {
-            printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
+            if (osd_op->reply.hdr.retval == -EPIPE)
+            {
+                // PG is stopped or one of the OSDs is gone, error is harmless
+                printf(
+                    "[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
+                    INODE_POOL(op->oid.inode),
+                    map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
+                    op->oid.inode, op->oid.stripe
+                );
+            }
+            else
+            {
+                throw std::runtime_error("Failed to recover an object");
+            }
        }
        // CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
        op->osd_op = NULL;
--- a/src/osd_ops.cpp
+++ b/src/osd_ops.cpp
@@ -21,6 +21,4 @@ const char* osd_op_names[] = {
    "primary_delete",
    "ping",
    "sec_read_bmp",
-    "scrub",
-    "describe",
 };
--- a/src/osd_ops.h
+++ b/src/osd_ops.h
@@ -29,9 +29,7 @@
 #define OSD_OP_DELETE               14
 #define OSD_OP_PING                 15
 #define OSD_OP_SEC_READ_BMP         16
-#define OSD_OP_SCRUB                17
-#define OSD_OP_DESCRIBE             18
-#define OSD_OP_MAX                  18
+#define OSD_OP_MAX                  16
 #define OSD_RW_MAX                  64*1024*1024
 #define OSD_PROTOCOL_VERSION        1

@@ -45,11 +43,6 @@
 #define MEM_ALIGNMENT 4096
 #endif

-// Constants for osd_reply_describe_item_t.loc_bad
-#define LOC_OUTDATED 1
-#define LOC_CORRUPTED 2
-#define LOC_INCONSISTENT 4
-
 // common request and reply headers
 struct __attribute__((__packed__)) osd_op_header_t
 {
@@ -180,11 +173,6 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
    uint64_t pg_stripe_size;
    // inode range (used to select pools)
    uint64_t min_inode, max_inode;
-    // min/max oid stripe, added after inodes for backwards compatibility
-    // also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o
-    uint64_t min_stripe, max_stripe;
-    // max stable object count
-    uint32_t stable_limit;
 };

 struct __attribute__((__packed__)) osd_reply_sec_list_t
@@ -235,36 +223,6 @@ struct __attribute__((__packed__)) osd_reply_sync_t
    osd_reply_header_t header;
 };

-// describe unclean object states in detail
-struct __attribute__((__packed__)) osd_op_describe_t
-{
-    osd_op_header_t header;
-    // state mask to filter objects by state (0 or 0xfff..ff = all objects)
-    uint64_t object_state;
-    // minimum inode and offset
-    uint64_t min_inode, min_offset;
-    // maximum inode and offset
-    uint64_t max_inode, max_offset;
-    // limit
-    uint64_t limit;
-};
-
-struct __attribute__((__packed__)) osd_reply_describe_t
-{
-    osd_reply_header_t header;
-    // size of the resulting <osd_reply_describe_item_t> array in bytes
-    uint64_t result_bytes;
-};
-
-struct __attribute__((__packed__)) osd_reply_describe_item_t
-{
-    uint64_t inode;
-    uint64_t stripe;
-    uint32_t role;      // part number: 0 for replicas, 0..pg_size-1 for EC
-    uint32_t loc_bad;   // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
-    osd_num_t osd_num;  // OSD number
-};
-
 // FIXME it would be interesting to try to unify blockstore_op and osd_op formats
 union osd_any_op_t
 {
@@ -278,7 +236,6 @@ union osd_any_op_t
    osd_op_show_config_t show_conf;
    osd_op_rw_t rw;
    osd_op_sync_t sync;
-    osd_op_describe_t describe;
    uint8_t buf[OSD_PACKET_SIZE];
 };

@@ -294,7 +251,6 @@ union osd_any_reply_t
    osd_reply_show_config_t show_conf;
    osd_reply_rw_t rw;
    osd_reply_sync_t sync;
-    osd_reply_describe_t describe;
    uint8_t buf[OSD_PACKET_SIZE];
 };

--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@@ -25,7 +25,6 @@ void osd_t::handle_peers()
                {
                    p.second.calc_object_states(log_level);
                    report_pg_state(p.second);
-                    schedule_scrub(p.second);
                    incomplete_objects += p.second.incomplete_objects.size();
                    misplaced_objects += p.second.misplaced_objects.size();
                    // FIXME: degraded objects may currently include misplaced, too! Report them separately?
@@ -84,13 +83,6 @@ void osd_t::handle_peers()
            peering_state = peering_state & ~OSD_RECOVERING;
        }
    }
-    if (peering_state & OSD_SCRUBBING)
-    {
-        if (!continue_scrub())
-        {
-            peering_state = peering_state & ~OSD_SCRUBBING;
-        }
-    }
 }

 void osd_t::repeer_pgs(osd_num_t peer_osd)
@@ -136,11 +128,9 @@ void osd_t::reset_pg(pg_t & pg)
    pg.state_dict.clear();
    copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
    pg.copies_to_delete_after_sync.clear();
-    corrupted_objects -= pg.corrupted_count;
    incomplete_objects -= pg.incomplete_objects.size();
    misplaced_objects -= pg.misplaced_objects.size();
    degraded_objects -= pg.degraded_objects.size();
-    pg.corrupted_count = 0;
    pg.incomplete_objects.clear();
    pg.misplaced_objects.clear();
    pg.degraded_objects.clear();
@@ -216,7 +206,7 @@ void osd_t::start_pg_peering(pg_t & pg)
            pg.cur_loc_set.push_back({
                .role = (uint64_t)role,
                .osd_num = pg.cur_set[role],
-                .loc_bad = 0,
+                .outdated = false,
            });
        }
    }
@@ -329,12 +319,11 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
        op->bs_op = new blockstore_op_t();
        op->bs_op->opcode = BS_OP_LIST;
-        op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size;
-        op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
-        op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
-        op->bs_op->max_oid.stripe = UINT64_MAX;
-        op->bs_op->pg_count = pg_counts[ps->pool_id];
-        op->bs_op->pg_number = ps->pg_num-1;
+        op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
+        op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
+        op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
+        op->bs_op->len = pg_counts[ps->pool_id];
+        op->bs_op->offset = ps->pg_num-1;
        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
        {
            if (op->bs_op->retval < 0)
@@ -494,7 +483,6 @@ void osd_t::report_pg_state(pg_t & pg)
        pg.all_peers = pg.target_set;
        std::sort(pg.all_peers.begin(), pg.all_peers.end());
        pg.cur_peers = pg.target_set;
-        plan_scrub(pg, false);
        // Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
        auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
        pg_cfg.target_history = pg.target_history;
@@ -538,7 +526,6 @@ void osd_t::report_pg_state(pg_t & pg)
                pg.cur_peers.push_back(pg_osd);
            }
        }
-        plan_scrub(pg, false);
        auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
        pg_cfg.target_history = pg.target_history;
        pg_cfg.all_peers = pg.all_peers;
--- a/src/osd_peering_pg.cpp
+++ b/src/osd_peering_pg.cpp
@@ -255,7 +255,7 @@ void pg_obj_state_check_t::finish_object()
    }
    else if (n_mismatched > 0)
    {
-        if (log_level > 2)
+        if (log_level > 2 && (replicated || n_roles >= pg->pg_cursize))
        {
            printf("Object is misplaced: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
        }
@@ -280,7 +280,7 @@ void pg_obj_state_check_t::finish_object()
            osd_set.push_back((pg_obj_loc_t){
                .role = (list[i].oid.stripe & STRIPE_MASK),
                .osd_num = list[i].osd_num,
-                .loc_bad = 0,
+                .outdated = false,
            });
        }
    }
@@ -302,7 +302,7 @@ void pg_obj_state_check_t::finish_object()
                osd_set.push_back((pg_obj_loc_t){
                    .role = (list[i].oid.stripe & STRIPE_MASK),
                    .osd_num = list[i].osd_num,
-                    .loc_bad = LOC_OUTDATED,
+                    .outdated = true,
                });
                if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
                {
@@ -322,75 +322,65 @@ void pg_obj_state_check_t::finish_object()
    }
    else
    {
-        pg->add_object_to_state(oid, state, osd_set);
-    }
-}
-
-pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set)
-{
-    auto it = state_dict.find(osd_set);
-    if (it == state_dict.end())
-    {
-        std::vector<osd_num_t> read_target;
-        if (scheme == POOL_SCHEME_REPLICATED)
+        auto it = pg->state_dict.find(osd_set);
+        if (it == pg->state_dict.end())
        {
-            for (auto & o: osd_set)
+            std::vector<uint64_t> read_target;
+            if (replicated)
            {
-                if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
+                for (auto & o: osd_set)
                {
-                    read_target.push_back(o.osd_num);
+                    if (!o.outdated)
+                    {
+                        read_target.push_back(o.osd_num);
+                    }
+                }
+                while (read_target.size() < pg->pg_size)
+                {
+                    // FIXME: This is because we then use .data() and assume it's at least <pg_size> long
+                    read_target.push_back(0);
                }
            }
-            while (read_target.size() < pg_size)
+            else
            {
-                // FIXME: This is because we then use .data() and assume it's at least <pg_size> long
-                read_target.push_back(0);
+                read_target.resize(pg->pg_size);
+                for (int i = 0; i < pg->pg_size; i++)
+                {
+                    read_target[i] = 0;
+                }
+                for (auto & o: osd_set)
+                {
+                    if (!o.outdated)
+                    {
+                        read_target[o.role] = o.osd_num;
+                    }
+                }
            }
+            pg->state_dict[osd_set] = {
+                .read_target = read_target,
+                .osd_set = osd_set,
+                .state = state,
+                .object_count = 1,
+            };
+            it = pg->state_dict.find(osd_set);
        }
        else
        {
-            read_target.resize(pg_size);
-            for (int i = 0; i < pg_size; i++)
-            {
-                read_target[i] = 0;
-            }
-            for (auto & o: osd_set)
-            {
-                if (!(o.loc_bad & (LOC_OUTDATED | LOC_CORRUPTED)))
-                {
-                    read_target[o.role] = o.osd_num;
-                }
-            }
+            it->second.object_count++;
+        }
+        if (state & OBJ_INCOMPLETE)
+        {
+            pg->incomplete_objects[oid] = &it->second;
+        }
+        else if (state & OBJ_DEGRADED)
+        {
+            pg->degraded_objects[oid] = &it->second;
+        }
+        else
+        {
+            pg->misplaced_objects[oid] = &it->second;
        }
-        state_dict[osd_set] = {
-            .read_target = read_target,
-            .osd_set = osd_set,
-            .state = state,
-            .object_count = 1,
-        };
-        it = state_dict.find(osd_set);
    }
-    else
-    {
-        it->second.object_count++;
-    }
-    if (state & OBJ_INCONSISTENT)
-    {
-        inconsistent_objects[oid] = &it->second;
-    }
-    else if (state & OBJ_INCOMPLETE)
-    {
-        incomplete_objects[oid] = &it->second;
-    }
-    else if (state & OBJ_DEGRADED)
-    {
-        degraded_objects[oid] = &it->second;
-    }
-    else
-    {
-        misplaced_objects[oid] = &it->second;
-    }
-    return &it->second;
 }

 // FIXME: Write at least some tests for this function
@@ -456,9 +446,7 @@ void pg_t::calc_object_states(int log_level)
                osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
                    std::to_string(loc.osd_num)+
                    (st.replicated ? "" : "("+std::to_string(loc.role)+")")+
-                    (loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
-                    (loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "")+
-                    (loc.loc_bad & LOC_INCONSISTENT ? "(inconsistent)" : "");
+                    (loc.outdated ? "(old)" : "");
            }
            printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
        }
@@ -468,7 +456,7 @@ void pg_t::calc_object_states(int log_level)
 void pg_t::print_state()
 {
    printf(
-        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
+        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
        (state & PG_STARTING) ? "starting" : "",
        (state & PG_OFFLINE) ? "offline" : "",
        (state & PG_PEERING) ? "peering" : "",
@@ -477,15 +465,12 @@ void pg_t::print_state()
        (state & PG_REPEERING) ? "repeering" : "",
        (state & PG_STOPPING) ? "stopping" : "",
        (state & PG_DEGRADED) ? " + degraded" : "",
-        (state & PG_HAS_INCONSISTENT) ? " + has_inconsistent" : "",
-        (state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
        (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
        (state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
        (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
        (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
        (state & PG_HAS_INVALID) ? " + has_invalid" : "",
        (state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
-        (state & PG_SCRUBBING) ? " + scrubbing" : "",
        total_count
    );
 }
--- a/src/osd_peering_pg.h
+++ b/src/osd_peering_pg.h
@@ -17,7 +17,7 @@ struct pg_obj_loc_t
 {
    uint64_t role;
    osd_num_t osd_num;
-    uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED / LOC_INCONSISTENT
+    bool outdated;
 };

 typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@@ -30,7 +30,6 @@ struct pg_osd_set_state_t
    pg_osd_set_t osd_set;
    uint64_t state = 0;
    uint64_t object_count = 0;
-    uint64_t ref_count = 0;
 };

 struct pg_list_result_t
@@ -92,8 +91,6 @@ struct pg_t
    // target history and all potential peers
    std::vector<std::vector<osd_num_t>> target_history;
    std::vector<osd_num_t> all_peers;
-    // next scrub time
-    uint64_t next_scrub = 0;
    bool history_changed = false;
    // peer list from the last peering event
    std::vector<osd_num_t> cur_peers;
@@ -109,8 +106,7 @@ struct pg_t
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
-    uint64_t corrupted_count;
-    btree::btree_map<object_id, pg_osd_set_state_t*> inconsistent_objects, incomplete_objects, misplaced_objects, degraded_objects;
+    btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
    std::map<obj_piece_id_t, flush_action_t> flush_actions;
    std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
    btree::btree_map<object_id, uint64_t> ver_override;
@@ -120,16 +116,15 @@ struct pg_t
    int inflight = 0; // including write_queue
    std::multimap<object_id, osd_op_t*> write_queue;

-    pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
    void calc_object_states(int log_level);
    void print_state();
 };

 inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
 {
-    return a.loc_bad < b.loc_bad ||
-        a.loc_bad == b.loc_bad && a.role < b.role ||
-        a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num;
+    return a.outdated < b.outdated ||
+        a.outdated == b.outdated && a.role < b.role ||
+        a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
 }

 inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@@ -52,9 +52,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        finish_op(cur_op, -EINVAL);
        return false;
    }
-    // Scrub is similar to r/w, so it's also handled here
-    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
-        && cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
+    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
    int chain_size = 0;
    if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
    {
@@ -92,8 +90,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        chain_size * (
            // - copy of the chain
            sizeof(inode_t) +
-            // - object states for every chain item
-            sizeof(void*) +
            // - bitmap buffers for chained read
            stripe_count * clean_entry_bitmap_size +
            // - 'missing' flags for chained reads
@@ -121,8 +117,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    {
        op_data->read_chain = (inode_t*)data_buf;
        data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
-        op_data->chain_states = (pg_osd_set_state_t**)data_buf;
-        data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
        op_data->snapshot_bitmaps = data_buf;
        data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
        op_data->missing_flags = (uint8_t*)data_buf;
@@ -137,7 +131,6 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
            inode_it->second.parent_id != cur_op->req.rw.inode)
        {
            op_data->read_chain[chain_num++] = inode_it->second.parent_id;
-            op_data->chain_states[chain_num++] = NULL;
            inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
        }
    }
@@ -145,12 +138,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    return true;
 }

-uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
+uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
 {
    if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
    {
        *object_state = NULL;
-        return pg.cur_set.data();
+        return def;
    }
    auto st_it = pg.incomplete_objects.find(oid);
    if (st_it != pg.incomplete_objects.end())
@@ -171,7 +164,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t
        return st_it->second->read_target.data();
    }
    *object_state = NULL;
-    return pg.cur_set.data();
+    return def;
 }

 void osd_t::continue_primary_read(osd_op_t *cur_op)
@@ -190,7 +183,6 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        goto resume_1;
    else if (op_data->st == 2)
        goto resume_2;
-resume_0:
    cur_op->reply.rw.bitmap_len = 0;
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
@@ -214,17 +206,15 @@ resume_0:
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        // PG may have degraded or misplaced objects
-        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
+        op_data->prev_set = pg.cur_set.data();
+        if (pg.state != PG_ACTIVE)
+        {
+            // PG may be degraded or have misplaced objects
+            op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+        }
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
-            if (op_data->scheme == POOL_SCHEME_REPLICATED &&
-                op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
-            {
-                finish_op(cur_op, -EIO);
-                return;
-            }
            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
            op_data->st = 1;
@@ -250,14 +240,6 @@ resume_1:
 resume_2:
    if (op_data->errors > 0)
    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // I/O or checksum error
-            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
-            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
-            goto resume_0;
-        }
        finish_op(cur_op, op_data->errcode);
        return;
    }
@@ -296,284 +278,10 @@ resume_2:
    finish_op(cur_op, cur_op->req.rw.len);
 }

-pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state,
-    osd_rmw_stripe_t *stripes, bool ref, bool inconsistent)
-{
-    pg_osd_set_state_t *object_state = NULL;
-    get_object_osd_set(pg, oid, &object_state);
-    if (prev_object_state != object_state)
-    {
-        // Object state changed in between by a parallel I/O operation, skip marking as failed
-        if (ref)
-        {
-            deref_object_state(pg, &prev_object_state, ref);
-            if (object_state)
-                object_state->ref_count++;
-        }
-        return object_state;
-    }
-    pg_osd_set_t corrupted_set;
-    if (object_state)
-    {
-        corrupted_set = object_state->osd_set;
-    }
-    else
-    {
-        for (int i = 0; i < pg.cur_set.size(); i++)
-        {
-            corrupted_set.push_back((pg_obj_loc_t){
-                .role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
-                .osd_num = pg.cur_set[i],
-            });
-        }
-    }
-    // Mark object chunk(s) as corrupted
-    int changes = 0;
-    for (auto chunk_it = corrupted_set.begin(); chunk_it != corrupted_set.end(); )
-    {
-        auto & chunk = *chunk_it;
-        if (stripes[chunk.role].osd_num == chunk.osd_num)
-        {
-            if (stripes[chunk.role].not_exists)
-            {
-                changes++;
-                corrupted_set.erase(chunk_it, chunk_it+1);
-                continue;
-            }
-            if (stripes[chunk.role].read_error && chunk.loc_bad != LOC_CORRUPTED)
-            {
-                changes++;
-                chunk.loc_bad = LOC_CORRUPTED;
-            }
-            else if (stripes[chunk.role].read_end > 0 && !stripes[chunk.role].missing &&
-                (chunk.loc_bad & LOC_CORRUPTED))
-            {
-                changes++;
-                chunk.loc_bad &= ~LOC_CORRUPTED;
-            }
-        }
-        if (inconsistent && !chunk.loc_bad)
-        {
-            changes++;
-            chunk.loc_bad |= LOC_INCONSISTENT;
-        }
-        else if (!inconsistent && (chunk.loc_bad & LOC_INCONSISTENT))
-        {
-            changes++;
-            chunk.loc_bad &= ~LOC_INCONSISTENT;
-        }
-        chunk_it++;
-    }
-    if (!changes)
-    {
-        // No chunks newly marked as corrupted - object is already marked or moved
-        return object_state;
-    }
-    int old_pg_state = pg.state;
-    if (object_state)
-    {
-        remove_object_from_state(oid, &object_state, pg, false);
-        deref_object_state(pg, &object_state, ref);
-    }
-    // Insert object into the new state and retry
-    object_state = add_object_to_set(pg, oid, corrupted_set, old_pg_state, 2);
-    if (ref)
-    {
-        object_state->ref_count++;
-    }
-    return object_state;
-}
-
-pg_osd_set_state_t* osd_t::add_object_to_set(pg_t & pg, const object_id oid, const pg_osd_set_t & osd_set,
-    uint64_t old_pg_state, int log_at_level)
-{
-    // Object state will be calculated from <osd_set>
-    uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_invalid = 0, n_outdated = 0,
-        n_misplaced = 0, n_corrupted = 0, n_inconsistent = 0;
-    for (auto & chunk: osd_set)
-    {
-        if (chunk.role >= (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size))
-        {
-            n_invalid++;
-        }
-        else if (chunk.loc_bad & LOC_OUTDATED)
-        {
-            n_outdated++;
-        }
-        else
-        {
-            if (chunk.loc_bad & LOC_INCONSISTENT)
-            {
-                n_inconsistent++;
-            }
-            if (chunk.loc_bad & LOC_CORRUPTED)
-            {
-                n_corrupted++;
-            }
-            else if (pg.scheme == POOL_SCHEME_REPLICATED)
-            {
-                n_roles = 1;
-                int i;
-                for (i = 0; i < pg.cur_set.size() && pg.cur_set[i] != chunk.osd_num; i++) {}
-                if (i == pg.cur_set.size())
-                {
-                    n_misplaced++;
-                }
-            }
-            else
-            {
-                if (!(has_roles & (1 << chunk.role)))
-                {
-                    n_roles++;
-                    has_roles |= (1 << chunk.role);
-                }
-                if (pg.cur_set[chunk.role] != chunk.osd_num)
-                {
-                    n_misplaced++;
-                }
-            }
-            n_copies++;
-        }
-    }
-    uint64_t obj_state = 0;
-    int pg_state_bits = 0;
-    if (n_corrupted > 0)
-    {
-        this->corrupted_objects++;
-        pg.corrupted_count++;
-        obj_state |= OBJ_CORRUPTED;
-        pg_state_bits |= PG_HAS_CORRUPTED;
-    }
-    if (n_invalid > 0 || n_inconsistent > 0)
-    {
-        this->inconsistent_objects++;
-        obj_state |= OBJ_INCONSISTENT;
-        pg_state_bits |= PG_HAS_INCONSISTENT;
-    }
-    else if (n_roles < pg.pg_data_size)
-    {
-        this->incomplete_objects++;
-        obj_state |= OBJ_INCOMPLETE;
-        pg_state_bits = PG_HAS_INCOMPLETE;
-    }
-    else if (n_roles < pg.pg_cursize)
-    {
-        this->degraded_objects++;
-        obj_state |= OBJ_DEGRADED;
-        pg_state_bits = PG_HAS_DEGRADED;
-    }
-    else if (n_misplaced > 0 || n_outdated > 0)
-    {
-        this->misplaced_objects++;
-        obj_state |= OBJ_MISPLACED;
-        pg_state_bits = PG_HAS_MISPLACED;
-    }
-    if (this->log_level >= log_at_level)
-    {
-        printf("Marking object %lx:%lx ", oid.inode, oid.stripe);
-        for (int i = 0, j = 0; i < object_state_bit_count; i++)
-        {
-            if ((obj_state & object_state_bits[i]) || object_state_bits[i] == 0 && obj_state == 0)
-            {
-                printf((j++) ? "+%s" : "%s", object_state_names[i]);
-            }
-        }
-        if (pg.scheme == POOL_SCHEME_REPLICATED)
-        {
-            printf(": %lu copies available", n_copies);
-        }
-        else
-        {
-            printf(": %lu parts / %lu copies available", n_roles, n_copies);
-        }
-        if (n_invalid > 0)
-        {
-            printf(", %lu invalid", n_invalid);
-        }
-        if (n_outdated > 0)
-        {
-            printf(", %lu outdated", n_outdated);
-        }
-        if (n_misplaced > 0)
-        {
-            printf(", %lu misplaced", n_misplaced);
-        }
-        if (n_corrupted > 0)
-        {
-            printf(", %lu corrupted", n_corrupted);
-        }
-        if (n_inconsistent > 0)
-        {
-            printf(", %lu inconsistent", n_inconsistent);
-        }
-        printf("\n");
-    }
-    pg.state |= pg_state_bits;
-    if (pg.state != old_pg_state)
-    {
-        report_pg_state(pg);
-        if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
-            (old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
-        {
-            peering_state = peering_state | OSD_RECOVERING;
-            if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
-            {
-                // Restart recovery from degraded objects
-                recovery_last_degraded = true;
-                recovery_last_pg = {};
-                recovery_last_oid = {};
-            }
-            ringloop->wakeup();
-        }
-    }
-    if (!obj_state)
-    {
-        // Object is clean
-        return NULL;
-    }
-    // Insert object into the new state and retry
-    return pg.add_object_to_state(oid, obj_state, osd_set);
-}
-
 // Decrement pg_osd_set_state_t's object_count and change PG state accordingly
-void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
+void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
 {
-    if (!*object_state)
-    {
-        return;
-    }
-    pg_osd_set_state_t *recheck_state = NULL;
-    get_object_osd_set(pg, oid, &recheck_state);
-    if (recheck_state != *object_state)
-    {
-        recheck_state->ref_count++;
-        (*object_state)->ref_count--;
-        *object_state = recheck_state;
-        return;
-    }
-    bool changed = false;
-    (*object_state)->object_count--;
-    if ((*object_state)->state & OBJ_CORRUPTED)
-    {
-        this->corrupted_objects--;
-        pg.corrupted_count--;
-        if (!pg.corrupted_count)
-        {
-            pg.state = pg.state & ~PG_HAS_CORRUPTED;
-            changed = true;
-        }
-    }
-    if ((*object_state)->state & OBJ_INCONSISTENT)
-    {
-        this->inconsistent_objects--;
-        pg.inconsistent_objects.erase(oid);
-        if (!pg.inconsistent_objects.size())
-        {
-            pg.state = pg.state & ~PG_HAS_INCONSISTENT;
-            changed = true;
-        }
-    }
-    else if ((*object_state)->state & OBJ_INCOMPLETE)
+    if (object_state->state & OBJ_INCOMPLETE)
    {
        // Successful write means that object is not incomplete anymore
        this->incomplete_objects--;
@@ -581,52 +289,41 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
        if (!pg.incomplete_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_INCOMPLETE;
-            changed = true;
+            report_pg_state(pg);
        }
    }
-    else if ((*object_state)->state & OBJ_DEGRADED)
+    else if (object_state->state & OBJ_DEGRADED)
    {
        this->degraded_objects--;
        pg.degraded_objects.erase(oid);
        if (!pg.degraded_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_DEGRADED;
-            changed = true;
+            report_pg_state(pg);
        }
    }
-    else if ((*object_state)->state & OBJ_MISPLACED)
+    else if (object_state->state & OBJ_MISPLACED)
    {
        this->misplaced_objects--;
        pg.misplaced_objects.erase(oid);
        if (!pg.misplaced_objects.size())
        {
            pg.state = pg.state & ~PG_HAS_MISPLACED;
-            changed = true;
+            report_pg_state(pg);
        }
    }
    else
    {
-        throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
-    }
-    if (changed && report)
-    {
-        report_pg_state(pg);
+        throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
    }
 }

-void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
+void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
 {
-    if (*object_state)
+    if (*object_state && !(--(*object_state)->object_count))
    {
-        if (deref)
-        {
-            (*object_state)->ref_count--;
-        }
-        if (!(*object_state)->object_count && !(*object_state)->ref_count)
-        {
-            pg.state_dict.erase((*object_state)->osd_set);
-            *object_state = NULL;
-        }
+        pg.state_dict.erase((*object_state)->osd_set);
+        *object_state = NULL;
    }
 }

@@ -656,28 +353,21 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
    }
 resume_1:
    // Determine which OSDs contain this object and delete it
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
-    if (op_data->object_state)
-    {
-        op_data->object_state->ref_count++;
-    }
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    // Submit 1 read to determine the actual version number
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
-    op_data->prev_set = NULL;
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
    if (op_data->errors > 0)
    {
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Check CAS version
    if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
    {
-        deref_object_state(pg, &op_data->object_state, true);
        cur_op->reply.hdr.retval = -EINTR;
        cur_op->reply.rw.version = op_data->fact_ver;
        goto continue_others;
@@ -693,7 +383,6 @@ resume_4:
 resume_5:
    if (op_data->errors > 0)
    {
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
@@ -706,8 +395,8 @@ resume_5:
    }
    else
    {
-        remove_object_from_state(op_data->oid, &op_data->object_state, pg);
-        deref_object_state(pg, &op_data->object_state, true);
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
+        free_object_state(pg, &op_data->object_state);
    }
    pg.total_count--;
    cur_op->reply.hdr.retval = 0;
--- a/src/osd_primary.h
+++ b/src/osd_primary.h
@@ -9,7 +9,6 @@
 #define SUBMIT_READ 0
 #define SUBMIT_RMW_READ 1
 #define SUBMIT_WRITE 2
-#define SUBMIT_SCRUB_READ 3

 struct unstable_osd_num_t
 {
@@ -51,7 +50,6 @@ struct osd_primary_op_data_t
            // for read_bitmaps
            void *snapshot_bitmaps;
            inode_t *read_chain;
-            pg_osd_set_state_t **chain_states;
            uint8_t *missing_flags;
            int chain_size;
            osd_chain_read_t *chain_reads;
--- a/src/osd_primary_chain.cpp
+++ b/src/osd_primary_chain.cpp
@@ -40,24 +40,10 @@ resume_3:
 resume_4:
    if (op_data->errors > 0)
    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // Handle corrupted reads and retry...
-            check_corrupted_chained(pg, cur_op);
-            free(cur_op->buf);
-            cur_op->buf = NULL;
-            free(op_data->chain_reads);
-            op_data->chain_reads = NULL;
-            // FIXME: We can in theory retry only specific parts instead of the whole operation
-            goto resume_1;
-        }
-        else
-        {
-            free(op_data->chain_reads);
-            op_data->chain_reads = NULL;
-            finish_op(cur_op, op_data->errcode);
-            return;
-        }
+        free(op_data->chain_reads);
+        op_data->chain_reads = NULL;
+        finish_op(cur_op, op_data->errcode);
+        return;
    }
    send_chained_read_results(pg, cur_op);
    finish_op(cur_op, cur_op->req.rw.len);
@@ -145,7 +131,8 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
        object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
        auto vo_it = pg.ver_override.find(cur_oid);
        uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
+        pg_osd_set_state_t *object_state;
+        uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
        if (pg.scheme == POOL_SCHEME_REPLICATED)
        {
            osd_num_t read_target = 0;
@@ -260,7 +247,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                osd_op_t *subop = op_data->subops+subop_idx;
                subop->op_type = OSD_OP_OUT;
                // FIXME: Use the pre-allocated buffer
-                assert(!subop->buf);
                subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
                subop->req = (osd_any_op_t){
                    .sec_read_bmp = {
@@ -389,8 +375,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
    op_data->chain_read_count = chain_reads.size();
    op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
        1, sizeof(osd_chain_read_t) * chain_reads.size()
-        // FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
-        // (but it's slightly harder to handle in send_chained_read_results())
        + sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
    );
    osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
@@ -419,7 +403,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
        uint64_t *cur_set = pg.cur_set.data();
        if (pg.state != PG_ACTIVE)
        {
-            cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
+            pg_osd_set_state_t *object_state;
+            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
            if (op_data->scheme != POOL_SCHEME_REPLICATED)
            {
                if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
@@ -431,17 +416,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
                }
                op_data->degraded = 1;
            }
-            else
-            {
-                auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
-                if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
-                {
-                    free(op_data->chain_reads);
-                    op_data->chain_reads = NULL;
-                    finish_op(cur_op, -EIO);
-                    return -1;
-                }
-            }
        }
        if (op_data->scheme == POOL_SCHEME_REPLICATED)
        {
@@ -459,7 +433,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
            }
        }
    }
-    assert(!cur_op->buf);
    cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
    void *cur_buf = cur_op->buf;
    for (int cri = 0; cri < chain_reads.size(); cri++)
@@ -495,8 +468,12 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
        object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
        auto vo_it = pg.ver_override.find(cur_oid);
        uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
-        uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
+        uint64_t *cur_set = pg.cur_set.data();
+        if (pg.state != PG_ACTIVE)
+        {
+            pg_osd_set_state_t *object_state;
+            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
+        }
        int zero_read = -1;
        if (op_data->scheme == POOL_SCHEME_REPLICATED)
        {
@@ -510,33 +487,6 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
    return 0;
 }

-void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    int stripe_count = (pg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_size);
-    osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
-        (uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count
-    );
-    for (int cri = 0; cri < op_data->chain_read_count; cri++)
-    {
-        object_id cur_oid = { .inode = op_data->chain_reads[cri].inode, .stripe = op_data->oid.stripe };
-        osd_rmw_stripe_t *stripes = chain_stripes + op_data->chain_reads[cri].chain_pos*stripe_count;
-        bool corrupted = false;
-        for (int i = 0; i < stripe_count; i++)
-        {
-            if (stripes[i].read_error)
-            {
-                corrupted = true;
-                break;
-            }
-        }
-        if (corrupted)
-        {
-            mark_object_corrupted(pg, cur_oid, op_data->chain_states[op_data->chain_reads[cri].chain_pos], stripes, false, false);
-        }
-    }
-}
-
 void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
--- a/src/osd_primary_describe.cpp
+++ b/src/osd_primary_describe.cpp
@@ -1,128 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include <queue>
-#include "osd_primary.h"
-
-struct unclean_list_t
-{
-    btree::btree_map<object_id, pg_osd_set_state_t*>::iterator it, end;
-    uint64_t state_mask, state;
-};
-
-struct desc_item_list_t
-{
-    int alloc, size;
-    osd_reply_describe_item_t *items;
-};
-
-static void include_list(std::vector<unclean_list_t> & lists,
-    btree::btree_map<object_id, pg_osd_set_state_t*> & from,
-    osd_op_describe_t & desc, uint64_t state_mask, uint64_t state)
-{
-    auto it = desc.min_inode || desc.min_offset ? from.lower_bound((object_id){
-        .inode = desc.min_inode,
-        .stripe = desc.min_offset,
-    }) : from.begin();
-    auto end_it = desc.max_inode || desc.max_offset ? from.upper_bound((object_id){
-        .inode = desc.max_inode,
-        .stripe = desc.max_offset,
-    }) : from.end();
-    lists.push_back((unclean_list_t){
-        .it = it,
-        .end = end_it,
-        .state_mask = state_mask,
-        .state = state,
-    });
-}
-
-struct obj_list_t
-{
-    object_id oid;
-    int list_id;
-};
-
-static inline bool operator < (const obj_list_t & a, const obj_list_t & b)
-{
-    return b.oid < a.oid;
-}
-
-static void scan_lists(std::vector<unclean_list_t> & lists, uint64_t limit, desc_item_list_t & res)
-{
-    if (limit > 1048576)
-    {
-        limit = 1048576;
-    }
-    std::priority_queue<obj_list_t> min;
-    for (int i = 0; i < lists.size(); i++)
-    {
-        if (lists[i].it != lists[i].end)
-        {
-            min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
-        }
-    }
-    while (min.size() && (!limit || res.size < limit))
-    {
-        auto i = min.top().list_id;
-        min.pop();
-        for (auto & chunk: lists[i].it->second->osd_set)
-        {
-            if (res.size >= res.alloc)
-            {
-                res.alloc = !res.alloc ? 128 : (res.alloc*2);
-                res.items = (osd_reply_describe_item_t*)realloc_or_die(res.items, res.alloc * sizeof(osd_reply_describe_item_t));
-            }
-            res.items[res.size++] = (osd_reply_describe_item_t){
-                .inode   = lists[i].it->first.inode,
-                .stripe  = lists[i].it->first.stripe,
-                .role    = (uint32_t)chunk.role,
-                .loc_bad = chunk.loc_bad,
-                .osd_num = chunk.osd_num,
-            };
-        }
-        lists[i].it++;
-        if (lists[i].it != lists[i].end)
-        {
-            min.push((obj_list_t){ .oid = lists[i].it->first, .list_id = i });
-        }
-    }
-}
-
-// Describe unclean objects
-void osd_t::continue_primary_describe(osd_op_t *cur_op)
-{
-    auto & desc = cur_op->req.describe;
-    if (!desc.object_state)
-        desc.object_state = ~desc.object_state;
-    std::vector<unclean_list_t> lists;
-    for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
-    {
-        auto & pg = pg_it->second;
-        if (desc.object_state & OBJ_INCONSISTENT)
-            include_list(lists, pg.inconsistent_objects, desc, 0, 0);
-        if (desc.object_state & OBJ_CORRUPTED)
-        {
-            if (!(desc.object_state & OBJ_INCOMPLETE))
-                include_list(lists, pg.incomplete_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
-            if (!(desc.object_state & OBJ_DEGRADED))
-                include_list(lists, pg.degraded_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
-            if (!(desc.object_state & OBJ_MISPLACED))
-                include_list(lists, pg.misplaced_objects, desc, OBJ_CORRUPTED, OBJ_CORRUPTED);
-        }
-        uint64_t skip_corrupted = !(desc.object_state & OBJ_CORRUPTED) ? OBJ_CORRUPTED : 0;
-        if (desc.object_state & OBJ_INCOMPLETE)
-            include_list(lists, pg.incomplete_objects, desc, skip_corrupted, 0);
-        if (desc.object_state & OBJ_DEGRADED)
-            include_list(lists, pg.degraded_objects, desc, skip_corrupted, 0);
-        if (desc.object_state & OBJ_MISPLACED)
-            include_list(lists, pg.misplaced_objects, desc, skip_corrupted, 0);
-    }
-    desc_item_list_t res = {};
-    scan_lists(lists, desc.limit, res);
-    assert(!cur_op->buf);
-    cur_op->buf = res.items;
-    cur_op->reply.describe.result_bytes = res.size * sizeof(osd_reply_describe_item_t);
-    if (res.items)
-        cur_op->iov.push_back(res.items, res.size * sizeof(osd_reply_describe_item_t));
-    finish_op(cur_op, res.size);
-}
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@@ -9,7 +9,6 @@ void osd_t::autosync()
    {
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
-        autosync_op->peer_fd = -1;
        autosync_op->req = (osd_any_op_t){
            .sync = {
                .header = {
@@ -81,11 +80,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
        free(cur_op->op_data);
        cur_op->op_data = NULL;
    }
-    cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
-    cur_op->reply.hdr.id = cur_op->req.hdr.id;
-    cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
-    cur_op->reply.hdr.retval = retval;
-    if (cur_op->peer_fd == -1)
+    if (!cur_op->peer_fd)
    {
        // Copy lambda to be unaffected by `delete op`
        std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
@@ -96,6 +91,10 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
        auto cl_it = msgr.clients.find(cur_op->peer_fd);
        if (cl_it != msgr.clients.end())
        {
+            cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
+            cur_op->reply.hdr.id = cur_op->req.hdr.id;
+            cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
+            cur_op->reply.hdr.retval = retval;
            msgr.outbox_push(cur_op);
        }
        else
@@ -143,50 +142,43 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
    for (int role = 0; role < op_data->pg_size; role++)
    {
        // We always submit zero-length writes to all replicas, even if the stripe is not modified
-        if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
+        if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
        {
            continue;
        }
        osd_num_t role_osd_num = osd_set[role];
-        int stripe_num = rep ? 0 : role;
-        osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
        if (role_osd_num != 0)
        {
+            int stripe_num = rep ? 0 : role;
            osd_op_t *subop = op_data->subops + i;
            uint32_t subop_len = wr
-                ? si->write_end - si->write_start
-                : si->read_end - si->read_start;
-            if (!wr && si->read_end == UINT32_MAX)
+                ? stripes[stripe_num].write_end - stripes[stripe_num].write_start
+                : stripes[stripe_num].read_end - stripes[stripe_num].read_start;
+            if (!wr && stripes[stripe_num].read_end == UINT32_MAX)
            {
                subop_len = 0;
            }
-            si->osd_num = role_osd_num;
-            si->read_error = false;
-            subop->bitmap = si->bmp_buf;
-            subop->bitmap_len = clean_entry_bitmap_size;
-            // Using rmw_buf to pass pointer to stripes. Dirty but should work
-            subop->rmw_buf = si;
            if (role_osd_num == this->osd_num)
            {
                clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
                subop->op_type = (uint64_t)cur_op;
-                subop->bs_op = new blockstore_op_t((blockstore_op_t){
+                subop->bitmap = stripes[stripe_num].bmp_buf;
+                subop->bitmap_len = clean_entry_bitmap_size;
+                subop->bs_op = new blockstore_op_t({
                    .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
                    .callback = [subop, this](blockstore_op_t *bs_subop)
                    {
                        handle_primary_bs_subop(subop);
                    },
-                    { {
-                        .oid = (object_id){
-                            .inode = inode,
-                            .stripe = op_data->oid.stripe | stripe_num,
-                        },
-                        .version = op_version,
-                        .offset = wr ? si->write_start : si->read_start,
-                        .len = subop_len,
-                    } },
-                    .buf = wr ? si->write_buf : si->read_buf,
-                    .bitmap = si->bmp_buf,
+                    .oid = {
+                        .inode = inode,
+                        .stripe = op_data->oid.stripe | stripe_num,
+                    },
+                    .version = op_version,
+                    .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
+                    .len = subop_len,
+                    .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
+                    .bitmap = stripes[stripe_num].bmp_buf,
                });
 #ifdef OSD_DEBUG
                printf(
@@ -200,6 +192,8 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
            else
            {
                subop->op_type = OSD_OP_OUT;
+                subop->bitmap = stripes[stripe_num].bmp_buf;
+                subop->bitmap_len = clean_entry_bitmap_size;
                subop->req.sec_rw = {
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
@@ -211,7 +205,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
                        .stripe = op_data->oid.stripe | stripe_num,
                    },
                    .version = op_version,
-                    .offset = wr ? si->write_start : si->read_start,
+                    .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
                    .len = subop_len,
                    .attr_len = wr ? clean_entry_bitmap_size : 0,
                };
@@ -224,16 +218,16 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
 #endif
                if (wr)
                {
-                    if (si->write_end > si->write_start)
+                    if (stripes[stripe_num].write_end > stripes[stripe_num].write_start)
                    {
-                        subop->iov.push_back(si->write_buf, si->write_end - si->write_start);
+                        subop->iov.push_back(stripes[stripe_num].write_buf, stripes[stripe_num].write_end - stripes[stripe_num].write_start);
                    }
                }
                else
                {
                    if (subop_len > 0)
                    {
-                        subop->iov.push_back(si->read_buf, subop_len);
+                        subop->iov.push_back(stripes[stripe_num].read_buf, subop_len);
                    }
                }
                subop->callback = [cur_op, this](osd_op_t *subop)
@@ -256,10 +250,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
            }
            i++;
        }
-        else
-        {
-            si->osd_num = 0;
-        }
    }
    return i-subop_idx;
 }
@@ -344,45 +334,14 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
    else
        expected = 0;
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (retval == -ENOENT && opcode == OSD_OP_SEC_READ)
-    {
-        // ENOENT is not an error for almost all reads, except scrub
-        retval = expected;
-        memset(((osd_rmw_stripe_t*)subop->rmw_buf)->read_buf, 0, expected);
-        ((osd_rmw_stripe_t*)subop->rmw_buf)->not_exists = true;
-    }
-    if (retval == expected && (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE))
-    {
-        uint64_t version = subop->reply.sec_rw.version;
-#ifdef OSD_DEBUG
-        uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
-            ? msgr.clients[subop->peer_fd]->osd_num : osd_num;
-        printf("subop %s %lx:%lx from osd %lu: version = %lu\n", osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, peer_osd, version);
-#endif
-        if (op_data->fact_ver != UINT64_MAX)
-        {
-            if (op_data->fact_ver != 0 && op_data->fact_ver != version)
-            {
-                fprintf(
-                    stderr, "different fact_versions returned from %s subops: %lu vs %lu\n",
-                    osd_op_names[opcode], version, op_data->fact_ver
-                );
-                retval = -ERANGE;
-            }
-            else
-                op_data->fact_ver = version;
-        }
-    }
    if (retval != expected)
    {
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
            printf(
-                subop->peer_fd >= 0
-                    ? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
-                    : "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
+                "%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
                osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
-                retval, expected, subop->peer_fd
+                subop->peer_fd, retval, expected
            );
        }
        else
@@ -392,33 +351,43 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
                osd_op_names[opcode], subop->peer_fd, retval, expected
            );
        }
-        if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
-        {
-            // We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
-            ((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
-        }
-        subop->rmw_buf = NULL;
-        // Error priority: ENOSPC and others > EIO > EDOM > EPIPE
-        if (op_data->errcode == 0 ||
-            retval == -EIO && (op_data->errcode == -EDOM || op_data->errcode == -EPIPE) ||
-            retval == -EDOM && (op_data->errcode == -EPIPE) ||
-            retval != -EIO && retval != -EDOM && retval != -EPIPE)
+        // Error priority: EIO > ENOSPC > EPIPE
+        if (op_data->errcode == 0 || retval == -EIO ||
+            retval == -ENOSPC && op_data->errcode == -EPIPE)
        {
            op_data->errcode = retval;
        }
        op_data->errors++;
-        if (subop->peer_fd >= 0 && retval != -EDOM && retval != -ERANGE &&
-            (retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
-            (retval != -EIO || opcode != OSD_OP_SEC_READ))
+        if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
+            retval != -ENOSPC))
        {
-            // Drop connection on unexpected errors
+            // Drop connection on any error except ENOSPC from a write subop
            msgr.stop_client(subop->peer_fd);
        }
    }
    else
    {
-        subop->rmw_buf = NULL;
        op_data->done++;
+        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
+        {
+            uint64_t version = subop->reply.sec_rw.version;
+#ifdef OSD_DEBUG
+            uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
+                ? msgr.clients[subop->peer_fd]->osd_num : osd_num;
+            printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
+#endif
+            if (op_data->fact_ver != UINT64_MAX)
+            {
+                if (op_data->fact_ver != 0 && op_data->fact_ver != version)
+                {
+                    throw std::runtime_error(
+                        "different fact_versions returned from "+std::string(osd_op_names[opcode])+
+                        " subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver)
+                    );
+                }
+                op_data->fact_ver = version;
+            }
+        }
    }
    if ((op_data->errors + op_data->done) >= op_data->n_subops)
    {
@@ -441,10 +410,6 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
        {
            continue_primary_del(cur_op);
        }
-        else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
-        {
-            continue_primary_scrub(cur_op);
-        }
        else
        {
            throw std::runtime_error("BUG: unknown opcode");
@@ -533,10 +498,8 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
                {
                    handle_primary_bs_subop(subop);
                },
-                { {
-                    .oid = chunk.oid,
-                    .version = chunk.version,
-                } },
+                .oid = chunk.oid,
+                .version = chunk.version,
            });
            bs->enqueue_op(subops[i].bs_op);
        }
@@ -650,9 +613,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
                {
                    handle_primary_bs_subop(subop);
                },
-                {
-                    .len = (uint32_t)stab_osd.len,
-                },
+                .len = (uint32_t)stab_osd.len,
                .buf = (void*)(op_data->unstable_writes + stab_osd.start),
            });
            bs->enqueue_op(subops[i].bs_op);
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@@ -58,13 +58,7 @@ resume_1:
    // Determine blocks to read and write
    // Missing chunks are allowed to be overwritten even in incomplete objects
    // FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
-    op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
-    if (op_data->object_state)
-    {
-        // Protect object_state from being freed by a parallel read operation changing it
-        op_data->object_state->ref_count++;
-    }
-retry_1:
+    op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Simplified algorithm
@@ -74,12 +68,6 @@ retry_1:
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
-            if (op_data->object_state->state & OBJ_INCOMPLETE)
-            {
-                // Refuse partial overwrite of an incomplete (corrupted) object
-                cur_op->reply.hdr.retval = -EIO;
-                goto continue_others;
-            }
            // Object is degraded/misplaced and will be moved to <write_osd_set>
            op_data->stripes[0].read_start = 0;
            op_data->stripes[0].read_end = bs_block_size;
@@ -98,61 +86,19 @@ retry_1:
        }
    }
    // Read required blocks
-    {
-        if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
-        {
-            // Allow to read version number (just version number!) from corrupted chunks
-            // to allow full overwrite of a corrupted object
-            bool found = false;
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
-                {
-                    found = true;
-                    break;
-                }
-            }
-            if (!found)
-            {
-                osd_num_t corrupted_target[op_data->pg_size];
-                for (int role = 0; role < op_data->pg_size; role++)
-                {
-                    corrupted_target[role] = 0;
-                }
-                for (auto & loc: op_data->object_state->osd_set)
-                {
-                    if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
-                    {
-                        corrupted_target[loc.role] = loc.osd_num;
-                    }
-                }
-                submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
-                goto resume_2;
-            }
-        }
-        submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
-    }
+    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
 resume_2:
    op_data->st = 2;
    return;
 resume_3:
    if (op_data->errors > 0)
    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // Mark object corrupted and retry
-            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true, false);
-            op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
-            goto retry_1;
-        }
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
    // Check CAS version
    if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
    {
-        deref_object_state(pg, &op_data->object_state, true);
        cur_op->reply.hdr.retval = -EINTR;
        cur_op->reply.rw.version = op_data->fact_ver;
        goto continue_others;
@@ -236,7 +182,6 @@ resume_10:
    // Recheck PG state after reporting history - maybe it's already stopping/restarting
    if (pg.state & (PG_STOPPING|PG_REPEERING))
    {
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
        return;
    }
@@ -257,7 +202,6 @@ resume_5:
        // to overwrite the same version number which will result in EEXIST.
        // To fix it, we should mark the object as degraded for replicas,
        // and rollback successful part updates in case of EC.
-        deref_object_state(pg, &op_data->object_state, true);
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
        return;
    }
@@ -266,7 +210,7 @@ resume_5:
        // We must forget the unclean state of the object before deleting it
        // so the next reads don't accidentally read a deleted version
        // And it should be done at the same time as the removal of the version override
-        remove_object_from_state(op_data->oid, &op_data->object_state, pg);
+        remove_object_from_state(op_data->oid, op_data->object_state, pg);
        pg.clean_count++;
    }
 resume_6:
@@ -321,12 +265,12 @@ resume_7:
                    copies_to_delete_after_sync_count++;
                }
            }
-            deref_object_state(pg, &op_data->object_state, true);
+            free_object_state(pg, &op_data->object_state);
        }
        else
        {
            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
-            deref_object_state(pg, &op_data->object_state, true);
+            free_object_state(pg, &op_data->object_state);
            if (op_data->n_subops > 0)
            {
 resume_8:
--- a/src/osd_rmw.cpp
+++ b/src/osd_rmw.cpp
@@ -1084,180 +1084,3 @@ void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
    }
    calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
 }
-
-// Generate subsets of k items each in {0..n-1}
-static bool first_combination(int *subset, int k, int n)
-{
-    if (k > n)
-        return false;
-    for (int i = 0; i < k; i++)
-        subset[i] = i;
-    return true;
-}
-
-static bool next_combination(int *subset, int k, int n)
-{
-    int pos = k-1;
-    while (true)
-    {
-        subset[pos]++;
-        if (subset[pos] >= n-(k-1-pos))
-        {
-            if (pos == 0)
-                return false;
-            pos--;
-        }
-        else
-            break;
-    }
-    for (pos++; pos < k; pos++)
-    {
-        subset[pos] = subset[pos-1]+1;
-    }
-    return true;
-}
-
-static int c_n_k(int n, int k)
-{
-    int c = 1;
-    for (int i = n; i > k; i--)
-        c *= i;
-    for (int i = 2; i <= (n-k); i++)
-        c /= i;
-    return c;
-}
-
-std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
-    uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce)
-{
-    std::vector<int> found_valid;
-    int cur_live[pg_size], live_count = 0, exists_count = 0;
-    osd_num_t fake_osd_set[pg_size];
-    for (int role = 0; role < pg_size; role++)
-    {
-        if (!stripes[role].missing)
-        {
-            if (!stripes[role].not_exists)
-                exists_count++;
-            cur_live[live_count++] = role;
-            fake_osd_set[role] = role+1;
-        }
-    }
-    if (live_count <= pg_minsize)
-    {
-        return std::vector<int>();
-    }
-    if (exists_count <= pg_minsize)
-    {
-        // Special case: user manually deleted some chunks
-        for (int role = 0; role < pg_size; role++)
-            if (!stripes[role].missing && !stripes[role].not_exists)
-                found_valid.push_back(role);
-        return found_valid;
-    }
-    // Try to locate errors using brute force if there isn't too many combinations
-    osd_rmw_stripe_t brute_stripes[pg_size];
-    int out_count = live_count-pg_minsize;
-    bool brute_force = out_count > 1 && c_n_k(live_count-1, out_count-1) <= max_bruteforce;
-    int subset[pg_minsize], outset[out_count];
-    // Select all combinations with items except the last one (== anything to compare)
-    first_combination(subset, pg_minsize, live_count-1);
-    uint8_t *tmp_buf = (uint8_t*)malloc_or_die(pg_size*chunk_size);
-    do
-    {
-        memcpy(brute_stripes, stripes, sizeof(osd_rmw_stripe_t)*pg_size);
-        int i = 0, j = 0, k = 0;
-        for (; i < pg_minsize; i++, j++)
-            while (j < subset[i])
-                outset[k++] = j++;
-        while (j < pg_size)
-            outset[k++] = j++;
-        for (int i = 0; i < out_count; i++)
-        {
-            brute_stripes[cur_live[outset[i]]].missing = true;
-            brute_stripes[cur_live[outset[i]]].read_buf = tmp_buf+cur_live[outset[i]]*chunk_size;
-        }
-        for (int i = 0; i < pg_minsize; i++)
-        {
-            brute_stripes[i].write_buf = brute_stripes[i].read_buf;
-            brute_stripes[i].req_start = 0;
-            brute_stripes[i].req_end = chunk_size;
-        }
-        for (int i = pg_minsize; i < pg_size; i++)
-        {
-            brute_stripes[i].write_buf = tmp_buf+i*chunk_size;
-        }
-        if (is_xor)
-        {
-            assert(pg_size == pg_minsize+1);
-            reconstruct_stripes_xor(brute_stripes, pg_size, bitmap_size);
-        }
-        else
-        {
-            reconstruct_stripes_ec(brute_stripes, pg_size, pg_minsize, bitmap_size);
-            calc_rmw_parity_ec(brute_stripes, pg_size, pg_minsize, fake_osd_set, fake_osd_set, chunk_size, bitmap_size);
-        }
-        for (int i = pg_minsize; i < pg_size; i++)
-        {
-            brute_stripes[i].read_buf = brute_stripes[i].write_buf;
-        }
-        int valid_count = 0;
-        for (int i = 0; i < out_count; i++)
-        {
-            if (memcmp(brute_stripes[cur_live[outset[i]]].read_buf,
-                    stripes[cur_live[outset[i]]].read_buf, chunk_size) == 0)
-            {
-                brute_stripes[cur_live[outset[i]]].missing = false;
-                valid_count++;
-            }
-        }
-        if (valid_count > 0)
-        {
-            if (found_valid.size())
-            {
-                // Check if we found the same set from the different point of view,
-                // like 1 2 3 -> valid 4 5 and 1 3 4 -> valid 2 5
-                for (int i = 0, j = 0; i < pg_size; i++)
-                {
-                    if (!brute_stripes[i].missing)
-                    {
-                        if (j >= found_valid.size() || found_valid[j] != i)
-                        {
-                            // Ambiguity: we found multiple valid sets and don't know which one is correct
-                            found_valid.clear();
-                            break;
-                        }
-                        j++;
-                    }
-                }
-                if (!found_valid.size())
-                {
-                    break;
-                }
-            }
-            else
-            {
-                for (int i = 0; i < pg_size; i++)
-                {
-                    if (!brute_stripes[i].missing)
-                    {
-                        found_valid.push_back(i);
-                    }
-                }
-            }
-            if (valid_count == out_count)
-            {
-                // All chunks are good
-                break;
-            }
-        }
-        if (!brute_force)
-        {
-            // Do not attempt brute force if there are too many combinations because even
-            // if we find it we won't be able to check that it's the only good one
-            break;
-        }
-    } while (out_count > 1 && next_combination(subset, pg_minsize, live_count-1));
-    free(tmp_buf);
-    return found_valid;
-}
--- a/src/osd_rmw.h
+++ b/src/osd_rmw.h
@@ -4,7 +4,6 @@
 #pragma once

 #include <stdint.h>
-#include <vector>
 #include "object_id.h"
 #include "osd_id.h"

@@ -27,10 +26,7 @@ struct osd_rmw_stripe_t
    // read_end=UINT32_MAX means to only read bitmap, but not data
    uint32_t read_start, read_end;
    uint32_t write_start, write_end;
-    osd_num_t osd_num;
-    bool missing: 1;
-    bool read_error: 1;
-    bool not_exists: 1;
+    bool missing;
 };

 // Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
@@ -56,6 +52,3 @@ void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsi

 void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size);
-
-std::vector<int> ec_find_good(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, bool is_xor,
-    uint32_t chunk_size, uint32_t bitmap_size, int max_bruteforce);
--- a/src/osd_rmw_test.cpp
+++ b/src/osd_rmw_test.cpp
@@ -28,7 +28,6 @@ void test14();
 void test15(bool second);
 void test16();
 void test_recover_22_d2();
-void test_ec43_error_bruteforce();

 int main(int narg, char *args[])
 {
@@ -65,8 +64,6 @@ int main(int narg, char *args[])
    test16();
    // Test 17
    test_recover_22_d2();
-    // Error bruteforce
-    test_ec43_error_bruteforce();
    // End
    printf("all ok\n");
    return 0;
@@ -1109,72 +1106,3 @@ void test_recover_22_d2()
    // Done
    use_ec(4, 2, false);
 }
-
-/***
-
-EC 4+2 error location bruteforce
-
-***/
-
-static void assert_eq_vec(const std::vector<int> & b, const std::vector<int> & a)
-{
-    printf("Expect [");
-    for (int i = 0; i < a.size(); i++)
-        printf(" %d", a[i]);
-    printf(" ] have [");
-    for (int i = 0; i < b.size(); i++)
-        printf(" %d", b[i]);
-    printf(" ]\n");
-    assert(a == b);
-}
-
-void test_ec43_error_bruteforce()
-{
-    use_ec(7, 4, true);
-    osd_num_t osd_set[7] = { 1, 2, 3, 4, 5, 6, 7 };
-    osd_rmw_stripe_t stripes[7] = {};
-    split_stripes(4, 4096, 0, 4096 * 4, stripes);
-    uint8_t *write_buf = (uint8_t*)malloc_or_die(4096 * 7);
-    set_pattern(write_buf+0*4096, 4096, PATTERN0);
-    set_pattern(write_buf+1*4096, 4096, PATTERN1);
-    set_pattern(write_buf+2*4096, 4096, PATTERN2);
-    set_pattern(write_buf+3*4096, 4096, PATTERN3);
-    uint8_t *rmw_buf = (uint8_t*)calc_rmw(write_buf, stripes, osd_set, 7, 4, 7, osd_set, 4096, 0);
-    calc_rmw_parity_ec(stripes, 7, 4, osd_set, osd_set, 4096, 0);
-    check_pattern(stripes[4].write_buf, 4096, PATTERN0^PATTERN1^PATTERN2^PATTERN3);
-    check_pattern(stripes[5].write_buf, 4096, 0xfcee568ba36371ac); // 2nd EC chunk
-    check_pattern(stripes[6].write_buf, 4096, 0x139274739ae6f387); // 3rd EC chunk
-    memcpy(write_buf+4*4096, stripes[4].write_buf, 4096);
-    memcpy(write_buf+5*4096, stripes[5].write_buf, 4096);
-    memcpy(write_buf+6*4096, stripes[6].write_buf, 4096);
-    // Try to locate errors
-    for (int i = 0; i < 7; i++)
-    {
-        stripes[i].read_start = 0;
-        stripes[i].read_end = 4096;
-        stripes[i].read_buf = write_buf+i*4096;
-        stripes[i].write_buf = NULL;
-    }
-    // All good chunks
-    auto res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
-    assert_eq_vec(res, std::vector<int>({0, 1, 2, 3, 4, 5, 6}));
-    // 1 missing chunk
-    set_pattern(write_buf+1*4096, 4096, 0);
-    res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
-    assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 5, 6}));
-    // 2 missing chunks
-    set_pattern(write_buf+1*4096, 4096, 0);
-    set_pattern(write_buf+5*4096, 4096, 0);
-    res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
-    assert_eq_vec(res, std::vector<int>({0, 2, 3, 4, 6}));
-    // 3 missing chunks
-    set_pattern(write_buf+1*4096, 4096, 0);
-    set_pattern(write_buf+5*4096, 4096, 0);
-    set_pattern(write_buf+6*4096, 4096, 0);
-    res = ec_find_good(stripes, 7, 4, false, 4096, 0, 100);
-    assert_eq_vec(res, std::vector<int>());
-    // Done
-    free(rmw_buf);
-    free(write_buf);
-    use_ec(7, 4, false);
-}
--- a/src/osd_scrub.cpp
+++ b/src/osd_scrub.cpp
@@ -1,623 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include "osd_primary.h"
-
-#define SELF_FD -1
-
-void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
-{
-    pool_id_t pool_id = pg_id.pool_id;
-    pg_num_t pg_num = pg_id.pg_num;
-    assert(!scrub_list_op);
-    if (role_osd == this->osd_num)
-    {
-        // Self
-        osd_op_t *op = new osd_op_t();
-        op->op_type = 0;
-        op->peer_fd = SELF_FD;
-        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t();
-        op->bs_op->opcode = BS_OP_LIST;
-        op->bs_op->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
-        if (min_oid.inode != 0 || min_oid.stripe != 0)
-            op->bs_op->min_oid = min_oid;
-        else
-        {
-            op->bs_op->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
-            op->bs_op->min_oid.stripe = 0;
-        }
-        op->bs_op->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
-        op->bs_op->max_oid.stripe = UINT64_MAX;
-        op->bs_op->list_stable_limit = scrub_list_limit;
-        op->bs_op->pg_count = pg_counts[pool_id];
-        op->bs_op->pg_number = pg_num-1;
-        op->bs_op->callback = [this, op](blockstore_op_t *bs_op)
-        {
-            scrub_list_op = NULL;
-            if (op->bs_op->retval < 0)
-            {
-                printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
-                force_stop(1);
-                return;
-            }
-            add_bs_subop_stats(op);
-            scrub_cur_list = {
-                .buf = (obj_ver_id*)op->bs_op->buf,
-                .total_count = (uint64_t)op->bs_op->retval,
-                .stable_count = op->bs_op->version,
-            };
-            delete op->bs_op;
-            op->bs_op = NULL;
-            delete op;
-            continue_scrub();
-        };
-        scrub_list_op = op;
-        bs->enqueue_op(op->bs_op);
-    }
-    else
-    {
-        // Peer
-        osd_op_t *op = new osd_op_t();
-        op->op_type = OSD_OP_OUT;
-        op->peer_fd = msgr.osd_peer_fds.at(role_osd);
-        op->req = (osd_any_op_t){
-            .sec_list = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = msgr.next_subop_id++,
-                    .opcode = OSD_OP_SEC_LIST,
-                },
-                .list_pg = pg_num,
-                .pg_count = pg_counts[pool_id],
-                .pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
-                .min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
-                .max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
-                .min_stripe = min_oid.stripe,
-                .stable_limit = scrub_list_limit,
-            },
-        };
-        op->callback = [this, role_osd](osd_op_t *op)
-        {
-            scrub_list_op = NULL;
-            if (op->reply.hdr.retval < 0)
-            {
-                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
-                int fail_fd = op->peer_fd;
-                delete op;
-                msgr.stop_client(fail_fd);
-                return;
-            }
-            scrub_cur_list = {
-                .buf = (obj_ver_id*)op->buf,
-                .total_count = (uint64_t)op->reply.hdr.retval,
-                .stable_count = op->reply.sec_list.stable_count,
-            };
-            // set op->buf to NULL so it doesn't get freed
-            op->buf = NULL;
-            delete op;
-            continue_scrub();
-        };
-        scrub_list_op = op;
-        msgr.outbox_push(op);
-    }
-}
-
-int osd_t::pick_next_scrub(object_id & next_oid)
-{
-    if (!pgs.size())
-    {
-        if (scrub_cur_list.buf)
-        {
-            free(scrub_cur_list.buf);
-            scrub_cur_list = {};
-            scrub_last_pg = {};
-        }
-        return 0;
-    }
-    timespec tv_now;
-    clock_gettime(CLOCK_REALTIME, &tv_now);
-    bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
-    // Restart scanning from the same PG as the last time
-    auto pg_it = pgs.lower_bound(scrub_last_pg);
-    if (pg_it == pgs.end() && rescan)
-    {
-        pg_it = pgs.begin();
-        rescan = false;
-    }
-    while (pg_it != pgs.end())
-    {
-        if ((pg_it->second.state & PG_ACTIVE) && pg_it->second.next_scrub && pg_it->second.next_scrub < tv_now.tv_sec)
-        {
-            // Continue scrubbing from the next object
-            if (scrub_last_pg == pg_it->first)
-            {
-                while (scrub_list_pos < scrub_cur_list.total_count)
-                {
-                    auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
-                    oid.stripe &= ~STRIPE_MASK;
-                    scrub_list_pos++;
-                    if (recovery_ops.find(oid) == recovery_ops.end() &&
-                        scrub_ops.find(oid) == scrub_ops.end() &&
-                        pg_it->second.write_queue.find(oid) == pg_it->second.write_queue.end())
-                    {
-                        next_oid = oid;
-                        if (!(pg_it->second.state & PG_SCRUBBING))
-                        {
-                            // Currently scrubbing this PG
-                            pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
-                            report_pg_state(pg_it->second);
-                        }
-                        return 2;
-                    }
-                }
-            }
-            if (scrub_last_pg == pg_it->first &&
-                scrub_list_pos >= scrub_cur_list.total_count &&
-                scrub_cur_list.stable_count < scrub_list_limit)
-            {
-                // End of the list, mark this PG as scrubbed and go to the next PG
-            }
-            else
-            {
-                // Continue listing
-                object_id scrub_last_oid = {};
-                if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
-                {
-                    scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
-                    scrub_last_oid.stripe++;
-                }
-                osd_num_t scrub_osd = 0;
-                for (osd_num_t pg_osd: pg_it->second.cur_set)
-                {
-                    if (pg_osd == this->osd_num || scrub_osd == 0)
-                        scrub_osd = pg_osd;
-                }
-                if (!(pg_it->second.state & PG_SCRUBBING))
-                {
-                    // Currently scrubbing this PG
-                    pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
-                    report_pg_state(pg_it->second);
-                }
-                if (scrub_cur_list.buf)
-                {
-                    free(scrub_cur_list.buf);
-                    scrub_cur_list = {};
-                    scrub_list_pos = 0;
-                }
-                scrub_last_pg = pg_it->first;
-                scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
-                return 1;
-            }
-            if (pg_it->second.state & PG_SCRUBBING)
-            {
-                scrub_last_pg = {};
-                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
-                pg_it->second.next_scrub = 0;
-                pg_it->second.history_changed = true;
-                report_pg_state(pg_it->second);
-            }
-            // The list is definitely not needed anymore
-            if (scrub_cur_list.buf)
-            {
-                free(scrub_cur_list.buf);
-                scrub_cur_list = {};
-            }
-        }
-        pg_it++;
-        if (pg_it == pgs.end() && rescan)
-        {
-            // Scan one more time to guarantee that there are no PGs to scrub
-            pg_it = pgs.begin();
-            rescan = false;
-        }
-    }
-    // Scanned all PGs - no more scrubs to do
-    return 0;
-}
-
-void osd_t::submit_scrub_op(object_id oid)
-{
-    auto osd_op = new osd_op_t();
-    osd_op->op_type = OSD_OP_OUT;
-    osd_op->peer_fd = -1;
-    osd_op->req = (osd_any_op_t){
-        .rw = {
-            .header = {
-                .magic = SECONDARY_OSD_OP_MAGIC,
-                .id = 1,
-                .opcode = OSD_OP_SCRUB,
-            },
-            .inode = oid.inode,
-            .offset = oid.stripe,
-            .len = 0,
-        },
-    };
-    if (log_level > 2)
-    {
-        printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
-    }
-    osd_op->callback = [this](osd_op_t *osd_op)
-    {
-        object_id oid = { .inode = osd_op->req.rw.inode, .stripe = osd_op->req.rw.offset };
-        if (osd_op->reply.hdr.retval < 0 && osd_op->reply.hdr.retval != -ENOENT)
-        {
-            // Scrub error
-            printf(
-                "Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
-                oid.inode, oid.stripe, INODE_POOL(oid.inode),
-                map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
-                osd_op->reply.hdr.retval
-            );
-        }
-        else if (log_level > 2)
-        {
-            printf("Scrubbed %lx:%lx\n", oid.inode, oid.stripe);
-        }
-        delete osd_op;
-        if (scrub_sleep_ms)
-        {
-            this->tfd->set_timer(scrub_sleep_ms, false, [this, oid](int timer_id)
-            {
-                scrub_ops.erase(oid);
-                continue_scrub();
-            });
-        }
-        else
-        {
-            scrub_ops.erase(oid);
-            continue_scrub();
-        }
-    };
-    scrub_ops[oid] = osd_op;
-    exec_op(osd_op);
-}
-
-// Triggers scrub requests
-// Scrub reads data from all replicas and compares it
-// To scrub first we need to read objects listings
-bool osd_t::continue_scrub()
-{
-    if (scrub_list_op)
-    {
-        return true;
-    }
-    if (no_scrub)
-    {
-        // Return false = no more scrub work to do
-        scrub_cur_list = {};
-        scrub_last_pg = {};
-        scrub_nearest_ts = 0;
-        if (scrub_timer_id >= 0)
-        {
-            tfd->clear_timer(scrub_timer_id);
-            scrub_timer_id = -1;
-        }
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
-        {
-            if (pg_it->second.state & PG_SCRUBBING)
-            {
-                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
-                report_pg_state(pg_it->second);
-            }
-        }
-        return false;
-    }
-    while (scrub_ops.size() < scrub_queue_depth)
-    {
-        object_id oid;
-        int r = pick_next_scrub(oid);
-        if (r == 2)
-            submit_scrub_op(oid);
-        else
-            return r;
-    }
-    return true;
-}
-
-void osd_t::plan_scrub(pg_t & pg, bool report_state)
-{
-    if ((pg.state & PG_ACTIVE) && !pg.next_scrub && auto_scrub)
-    {
-        timespec tv_now;
-        clock_gettime(CLOCK_REALTIME, &tv_now);
-        auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
-        auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
-        if (pg.next_scrub != tv_now.tv_sec + interval)
-        {
-            pool_cfg.pg_config[pg.pg_num].next_scrub = pg.next_scrub = tv_now.tv_sec + interval;
-            pg.history_changed = true;
-            if (report_state)
-                report_pg_state(pg);
-        }
-        schedule_scrub(pg);
-    }
-}
-
-void osd_t::schedule_scrub(pg_t & pg)
-{
-    if (!no_scrub && pg.next_scrub && (!scrub_nearest_ts || scrub_nearest_ts > pg.next_scrub))
-    {
-        scrub_nearest_ts = pg.next_scrub;
-        timespec tv_now;
-        clock_gettime(CLOCK_REALTIME, &tv_now);
-        if (scrub_timer_id >= 0)
-        {
-            tfd->clear_timer(scrub_timer_id);
-            scrub_timer_id = -1;
-        }
-        if (tv_now.tv_sec > scrub_nearest_ts)
-        {
-            scrub_nearest_ts = 0;
-            peering_state = peering_state | OSD_SCRUBBING;
-            ringloop->wakeup();
-        }
-        else
-        {
-            scrub_timer_id = tfd->set_timer((scrub_nearest_ts-tv_now.tv_sec)*1000, false, [this](int timer_id)
-            {
-                scrub_timer_id = -1;
-                scrub_nearest_ts = 0;
-                peering_state = peering_state | OSD_SCRUBBING;
-                ringloop->wakeup();
-            });
-        }
-    }
-}
-
-void osd_t::continue_primary_scrub(osd_op_t *cur_op)
-{
-    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
-        return;
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    if (op_data->st == 1)
-        goto resume_1;
-    else if (op_data->st == 2)
-        goto resume_2;
-    {
-        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-        // Determine version
-        auto vo_it = pg.ver_override.find(op_data->oid);
-        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        // PG may have degraded or misplaced objects
-        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
-        // Read all available chunks
-        int n_copies = 0;
-        op_data->degraded = false;
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            op_data->stripes[role].write_buf = NULL;
-            op_data->stripes[role].read_start = 0;
-            op_data->stripes[role].read_end = bs_block_size;
-            if (op_data->prev_set[role] != 0)
-            {
-                n_copies++;
-            }
-            else
-            {
-                op_data->stripes[role].missing = true;
-                if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
-                {
-                    op_data->degraded = true;
-                }
-            }
-        }
-        if (n_copies <= op_data->pg_data_size)
-        {
-            // Nothing to compare, even if we'd like to
-            finish_op(cur_op, 0);
-            return;
-        }
-        cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size, 0);
-        // Submit reads
-        osd_op_t *subops = new osd_op_t[n_copies];
-        op_data->fact_ver = 0;
-        op_data->done = op_data->errors = op_data->errcode = 0;
-        op_data->n_subops = n_copies;
-        op_data->subops = subops;
-        int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
-            op_data->stripes, op_data->prev_set, cur_op, 0, -1);
-        assert(sent == n_copies);
-        op_data->st = 1;
-    }
-resume_1:
-    return;
-resume_2:
-    if (op_data->errors > 0)
-    {
-        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
-        {
-            // I/O or checksum error
-            int n_copies = 0;
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (op_data->stripes[role].read_error)
-                {
-                    op_data->stripes[role].missing = true;
-                    if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
-                    {
-                        op_data->degraded = true;
-                    }
-                }
-                else if (!op_data->stripes[role].missing)
-                {
-                    n_copies++;
-                }
-            }
-            if (n_copies <= op_data->pg_data_size)
-            {
-                // Nothing to compare, just mark the object as corrupted
-                auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-                // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
-                op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, false);
-                // Operation is treated as unsuccessful only if the object becomes unreadable
-                finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
-                return;
-            }
-            // Proceed, we can still compare chunks that were successfully read
-        }
-        else
-        {
-            finish_op(cur_op, op_data->errcode);
-            return;
-        }
-    }
-    bool inconsistent = false;
-    if (op_data->scheme == POOL_SCHEME_REPLICATED)
-    {
-        // Check that all chunks have returned the same data
-        int total = 0;
-        int eq_to[op_data->pg_size];
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            eq_to[role] = -1;
-            if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].missing &&
-                !op_data->stripes[role].not_exists)
-            {
-                total++;
-                eq_to[role] = role;
-                for (int other = 0; other < role; other++)
-                {
-                    // Only compare with unique chunks (eq_to[other] == other)
-                    if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
-                    {
-                        eq_to[role] = eq_to[other];
-                        break;
-                    }
-                }
-            }
-        }
-        int votes[op_data->pg_size];
-        for (int role = 0; role < op_data->pg_size; role++)
-            votes[role] = 0;
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            if (eq_to[role] != -1)
-                votes[eq_to[role]]++;
-        }
-        int best = -1;
-        for (int role = 0; role < op_data->pg_size; role++)
-        {
-            if (votes[role] > (best >= 0 ? votes[best] : 0))
-                best = role;
-        }
-        if (best >= 0 && votes[best] < total)
-        {
-            bool unknown = false;
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (role != best && votes[role] == votes[best])
-                {
-                    unknown = true;
-                }
-                if (votes[role] > 0 && votes[role] < votes[best])
-                {
-                    printf(
-                        "[PG %u/%u] Object %lx:%lx v%lu copy on OSD %lu doesn't match %d other copies%s\n",
-                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
-                        op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
-                        op_data->stripes[role].osd_num, votes[best],
-                        scrub_find_best ? ", marking it as corrupted" : ""
-                    );
-                    if (scrub_find_best)
-                    {
-                        op_data->stripes[role].read_error = true;
-                    }
-                }
-            }
-            if (!scrub_find_best)
-            {
-                unknown = true;
-            }
-            if (unknown)
-            {
-                // It's unknown which replica is good. There are multiple versions with no majority
-                // Mark all good replicas as ambiguous
-                best = -1;
-                inconsistent = true;
-                printf(
-                    "[PG %u/%u] Object %lx:%lx v%lu is inconsistent: copies don't match. Use vitastor-cli fix to fix it\n",
-                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
-                    op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
-                );
-            }
-        }
-    }
-    else
-    {
-        assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
-        auto good_subset = ec_find_good(
-            op_data->stripes, op_data->pg_size, op_data->pg_data_size, op_data->scheme == POOL_SCHEME_XOR,
-            bs_block_size, clean_entry_bitmap_size, scrub_ec_max_bruteforce
-        );
-        if (!good_subset.size())
-        {
-            inconsistent = true;
-            printf(
-                "[PG %u/%u] Object %lx:%lx v%lu is inconsistent: parity chunks don't match data. Use vitastor-cli fix to fix it\n",
-                INODE_POOL(op_data->oid.inode), op_data->pg_num,
-                op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
-            );
-        }
-        else
-        {
-            int total = 0;
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (!op_data->stripes[role].missing)
-                {
-                    total++;
-                    op_data->stripes[role].read_error = true;
-                }
-            }
-            for (int role: good_subset)
-            {
-                op_data->stripes[role].read_error = false;
-            }
-            for (int role = 0; role < op_data->pg_size; role++)
-            {
-                if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
-                {
-                    printf(
-                        "[PG %u/%u] Object %lx:%lx v%lu chunk %d on OSD %lu doesn't match other chunks%s\n",
-                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
-                        op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver,
-                        role, op_data->stripes[role].osd_num,
-                        scrub_find_best ? ", marking it as corrupted" : ""
-                    );
-                }
-            }
-            if (!scrub_find_best && good_subset.size() < total)
-            {
-                inconsistent = true;
-                printf(
-                    "[PG %u/%u] Object %lx:%lx v%lu is marked as inconsistent because scrub_find_best is turned off. Use vitastor-cli fix to fix it\n",
-                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
-                    op_data->oid.inode, op_data->oid.stripe, op_data->fact_ver
-                );
-                for (int role = 0; role < op_data->pg_size; role++)
-                {
-                    if (!op_data->stripes[role].missing && op_data->stripes[role].read_error)
-                    {
-                        // Undo error locator marking chunk as bad
-                        op_data->stripes[role].read_error = false;
-                    }
-                }
-            }
-        }
-    }
-    for (int role = 0; role < op_data->pg_size; role++)
-    {
-        if (op_data->stripes[role].osd_num != 0 &&
-            (op_data->stripes[role].read_error || op_data->stripes[role].not_exists) ||
-            inconsistent)
-        {
-            // Got at least 1 read error or mismatch, mark the object as corrupted
-            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
-            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false, inconsistent);
-            break;
-        }
-    }
-    finish_op(cur_op, 0);
-}
--- a/src/osd_secondary.cpp
+++ b/src/osd_secondary.cpp
@@ -125,18 +125,11 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
            secondary_op_callback(cur_op);
            return;
        }
-        cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size;
-        cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count;
-        cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1;
-        cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode;
-        cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe;
-        cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode;
-        if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX)
-        {
-            cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe
-                ? cur_op->req.sec_list.max_stripe : UINT64_MAX;
-        }
-        cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit;
+        cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
+        cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
+        cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
+        cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
+        cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
 #ifdef OSD_STUB
        cur_op->bs_op->retval = 0;
        cur_op->bs_op->buf = NULL;
--- a/src/pg_states.cpp
+++ b/src/pg_states.cpp
@@ -3,9 +3,9 @@

 #include "pg_states.h"

-const int pg_state_bit_count = 17;
+const int pg_state_bit_count = 14;

-const int pg_state_bits[17] = {
+const int pg_state_bits[14] = {
    PG_STARTING,
    PG_PEERING,
    PG_INCOMPLETE,
@@ -14,18 +14,15 @@ const int pg_state_bits[17] = {
    PG_STOPPING,
    PG_OFFLINE,
    PG_DEGRADED,
-    PG_HAS_INCONSISTENT,
-    PG_HAS_CORRUPTED,
    PG_HAS_INCOMPLETE,
    PG_HAS_DEGRADED,
    PG_HAS_MISPLACED,
    PG_HAS_UNCLEAN,
    PG_HAS_INVALID,
    PG_LEFT_ON_DEAD,
-    PG_SCRUBBING,
 };

-const char *pg_state_names[17] = {
+const char *pg_state_names[14] = {
    "starting",
    "peering",
    "incomplete",
@@ -34,37 +31,10 @@ const char *pg_state_names[17] = {
    "stopping",
    "offline",
    "degraded",
-    "has_inconsistent",
-    "has_corrupted",
    "has_incomplete",
    "has_degraded",
    "has_misplaced",
    "has_unclean",
    "has_invalid",
    "left_on_dead",
-    "scrubbing",
-};
-
-const int object_state_bit_count = 8;
-
-const int object_state_bits[8] = {
-    OBJ_DEGRADED,
-    OBJ_INCOMPLETE,
-    OBJ_MISPLACED,
-    OBJ_CORRUPTED,
-    OBJ_INCONSISTENT,
-    OBJ_NEEDS_STABLE,
-    OBJ_NEEDS_ROLLBACK,
-    0,
-};
-
-const char *object_state_names[8] = {
-    "degraded",
-    "incomplete",
-    "misplaced",
-    "corrupted",
-    "inconsistent",
-    "needs_stable",
-    "needs_rollback",
-    "clean",
 };
--- a/src/pg_states.h
+++ b/src/pg_states.h
@@ -22,10 +22,7 @@
 #define PG_HAS_MISPLACED (1<<10)
 #define PG_HAS_UNCLEAN (1<<11)
 #define PG_HAS_INVALID (1<<12)
-#define PG_HAS_CORRUPTED (1<<13)
-#define PG_HAS_INCONSISTENT (1<<14)
-#define PG_LEFT_ON_DEAD (1<<15)
-#define PG_SCRUBBING (1<<16)
+#define PG_LEFT_ON_DEAD (1<<13)

 // Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
 // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
@@ -35,18 +32,9 @@
 #define OBJ_DEGRADED 0x02
 #define OBJ_INCOMPLETE 0x04
 #define OBJ_MISPLACED 0x08
-// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
-#define OBJ_CORRUPTED 0x10
-// OBJ_INCONSISTENT is when its replicas don't match, but it's unclear which one is correct
-// OBJ_INCONSISTENT may be set with CORRUPTED, but never with other states
-#define OBJ_INCONSISTENT 0x20
 #define OBJ_NEEDS_STABLE 0x10000
 #define OBJ_NEEDS_ROLLBACK 0x20000

 extern const int pg_state_bits[];
 extern const char *pg_state_names[];
 extern const int pg_state_bit_count;
-
-extern const int object_state_bits[];
-extern const char *object_state_names[];
-extern const int object_state_bit_count;
--- a/src/str_util.cpp
+++ b/src/str_util.cpp
@@ -3,7 +3,6 @@

 #include <assert.h>
 #include <string.h>
-#include <unistd.h>
 #include "str_util.h"

 std::string base64_encode(const std::string &in)
@@ -250,53 +249,3 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
    fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
    exit(0);
 }
-
-uint64_t parse_time(std::string time_str, bool *ok)
-{
-    if (!time_str.length())
-    {
-        if (ok)
-            *ok = false;
-        return 0;
-    }
-    uint64_t mul = 1;
-    char type_char = tolower(time_str[time_str.length()-1]);
-    if (type_char == 's' || type_char == 'm' || type_char == 'h' || type_char == 'd' || type_char == 'y')
-    {
-        if (type_char == 's')
-            mul = 1;
-        else if (time_str[time_str.length()-1] == 'M')
-            mul = 30*86400;
-        else if (type_char == 'm')
-            mul = 60;
-        else if (type_char == 'h')
-            mul = 3600;
-        else if (type_char == 'd')
-            mul = 86400;
-        else /*if (type_char == 'y')*/
-            mul = 86400*365;
-        time_str = time_str.substr(0, time_str.length()-1);
-    }
-    uint64_t ts = stoull_full(time_str, 0) * mul;
-    if (ok)
-        *ok = !(ts == 0 && time_str != "0" && (time_str != "" || mul != 1));
-    return ts;
-}
-
-std::string read_all_fd(int fd)
-{
-    int res_size = 0, res_alloc = 0;
-    std::string res;
-    while (1)
-    {
-        if (res_size >= res_alloc)
-            res.resize((res_alloc = (res_alloc ? res_alloc*2 : 1024)));
-        int r = read(fd, (char*)res.data()+res_size, res_alloc-res_size);
-        if (r > 0)
-            res_size += r;
-        else if (!r || errno != EAGAIN && errno != EINTR)
-            break;
-    }
-    res.resize(res_size);
-    return res;
-}
--- a/src/str_util.h
+++ b/src/str_util.h
@@ -15,5 +15,3 @@ std::string str_replace(const std::string & in, const std::string & needle, cons
 uint64_t stoull_full(const std::string & str, int base = 0);
 std::string format_size(uint64_t size, bool nobytes = false);
 void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
-uint64_t parse_time(std::string time_str, bool *ok = NULL);
-std::string read_all_fd(int fd);
--- a/src/vitastor.pc.in
+++ b/src/vitastor.pc.in
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

 Name: Vitastor
 Description: Vitastor client library
-Version: 0.9.1
+Version: 0.8.9
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}

--- a/tests/common.sh
+++ b/tests/common.sh
@@ -7,7 +7,7 @@ fi

 format_error()
 {
-    echo $(echo -n -e "\033[1;31m")"$0 $1"$(echo -n -e "\033[m")
+    echo $(echo -n -e "\033[1;31m")"$1"$(echo -n -e "\033[m")
    $ETCDCTL get --prefix /vitastor > ./testdata/etcd-dump.txt
    exit 1
 }
--- a/tests/run_3osds.sh
+++ b/tests/run_3osds.sh
@@ -95,29 +95,19 @@ try_reweight()
    sleep 3
 }

-wait_condition()
-{
-    sec=$1
-    check=$2
-    proc=$3
-    i=0
-    while [[ $i -lt $sec ]]; do
-        eval "$check" && break
-        if [ $i -eq $sec ]; then
-            format_error "$proc couldn't finish in $sec seconds"
-        fi
-        sleep 1
-        i=$((i+1))
-    done
-}
-
 wait_finish_rebalance()
 {
    sec=$1
-    check=$2
-    check=${check:-'.state == ["active"] or .state == ["active", "left_on_dead"]'}
-    check="$ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select($check) ] | length) == $PG_COUNT'"
-    wait_condition "$sec" "$check" Rebalance
+    i=0
+    while [[ $i -lt $sec ]]; do
+        ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"] or .state == ["active", "left_on_dead"]) ] | length) == '$PG_COUNT) && \
+            break
+        sleep 1
+        i=$((i+1))
+        if [ $i -eq $sec ]; then
+            format_error "Rebalance couldn't finish in $sec seconds"
+        fi
+    done
 }

 check_qemu()
--- a/tests/run_tests.sh
+++ b/tests/run_tests.sh
@@ -46,10 +46,3 @@ SCHEME=xor ./test_write.sh

 PG_SIZE=2 ./test_heal.sh
 SCHEME=ec ./test_heal.sh
-
-./test_scrub.sh
-ZERO_OSD=2 ./test_scrub.sh
-SCHEME=xor ./test_scrub.sh
-PG_SIZE=3 ./test_scrub.sh
-PG_SIZE=6 PG_MINSIZE=4 OSD_COUNT=6 SCHEME=ec ./test_scrub.sh
-SCHEME=ec ./test_scrub.sh
--- a/tests/test_heal.sh
+++ b/tests/test_heal.sh
@@ -46,8 +46,8 @@ kill_osds()
 kill_osds &

 LD_PRELOAD="build/src/libfio_vitastor.so" \
-    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bsrange=4k-128k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \
-        -randrepeat=0 -refill_buffers=1 -mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
+    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
+        -mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120

 qemu-img convert -S 4096 -p \
    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
--- a/tests/test_scrub.sh
+++ b/tests/test_scrub.sh
@@ -1,65 +0,0 @@
-#!/bin/bash -ex
-# Test for scrub without checksums
-
-ZERO_OSD=${ZERO_OSD:-1}
-
-if [[ ("$SCHEME" = "" || "$SCHEME" = "replicated") && ("$PG_SIZE" = "" || "$PG_SIZE" = 2) ]]; then
-    OSD_COUNT=2
-fi
-
-. `dirname $0`/run_3osds.sh
-
-check_qemu
-
-IMG_SIZE=128
-
-$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"testimg","size":'$((IMG_SIZE*1024*1024))'}'
-
-# Write
-LD_PRELOAD="build/src/libfio_vitastor.so" \
-    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-        -mirror_file=./testdata/mirror.bin -end_fsync=1 -rw=write -etcd=$ETCD_URL -image=testimg
-
-# Intentionally corrupt OSD data and restart it
-zero_osd_pid=OSD${ZERO_OSD}_PID
-kill ${!zero_osd_pid}
-sleep 1
-kill -9 ${!zero_osd_pid} || true
-data_offset=$(build/src/vitastor-disk simple-offsets ./testdata/test_osd$ZERO_OSD.bin $OFFSET_ARGS | grep data_offset | awk '{print $2}')
-truncate -s $data_offset ./testdata/test_osd$ZERO_OSD.bin
-dd if=/dev/zero of=./testdata/test_osd$ZERO_OSD.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
-$ETCDCTL del /vitastor/osd/state/$ZERO_OSD
-start_osd $ZERO_OSD
-
-# Wait until start
-wait_up 10
-
-# Trigger scrub
-$ETCDCTL put /vitastor/pg/history/1/1 `$ETCDCTL get --print-value-only /vitastor/pg/history/1/1 | jq -s -c '(.[0] // {}) + {"next_scrub":1}'`
-
-# Wait for scrub to finish
-wait_condition 60 "$ETCDCTL get --prefix /vitastor/pg/history/ --print-value-only | jq -s -e '([ .[] | select(.next_scrub == 0 or .next_scrub == null) ] | length) == $PG_COUNT'" Scrubbing
-
-if [[ ($SCHEME = replicated && $PG_SIZE < 3) || ($SCHEME != replicated && $((PG_SIZE-PG_DATA_SIZE)) < 2) ]]; then
-    # Check that objects are marked as inconsistent if 2 replicas or EC/XOR 2+1
-    build/src/vitastor-cli describe --etcd_address $ETCD_URL --json | jq -e '[ .[] | select(.inconsistent) ] | length == '$((IMG_SIZE * 8 * PG_SIZE / (SCHEME = replicated ? 1 : PG_DATA_SIZE)))
-
-    # Fix objects using vitastor-cli fix
-    build/src/vitastor-cli describe --etcd_address $ETCD_URL --json | \
-        jq -s '[ .[0][] | select(.inconsistent and .osd_num == '$ZERO_OSD') ]' | \
-        build/src/vitastor-cli fix --etcd_address $ETCD_URL --bad_osds $ZERO_OSD
-elif [[ ($SCHEME = replicated && $PG_SIZE > 2) || ($SCHEME != replicated && $((PG_SIZE-PG_DATA_SIZE)) > 1) ]]; then
-    # Check that everything heals
-    wait_finish_rebalance 60
-
-    build/src/vitastor-cli describe --etcd_address $ETCD_URL --json | jq -e '. | length == 0'
-fi
-
-# Read everything back
-qemu-img convert -S 4096 -p \
-    -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
-    -O raw ./testdata/read.bin
-
-diff ./testdata/read.bin ./testdata/mirror.bin
-
-format_green OK
Author	SHA1	Message	Date
Vitaliy Filippov	72f0cff79d	WIP Use random_hier_combinations	2023-05-18 17:44:00 +03:00
Vitaliy Filippov	c1d470522c	Replace flatten_tree with extract_tree_levels	2023-05-18 17:44:00 +03:00
Vitaliy Filippov	57feb7f390	Implement multi-level tree extractor for hierarchical failure domains	2023-05-18 17:44:00 +03:00
Vitaliy Filippov	431f780347	Implement a PG generator for hierarchical failure domains	2023-05-18 17:44:00 +03:00