Compare commits

...

99 Commits

Author SHA1 Message Date
b4b6407716 WIP Implement RDMA v2 based on IBV_WR_RDMA_WRITE with remote buffer management
One BIG FIXME remaining - handling large operations :))
2023-02-26 00:26:39 +03:00
8139a34e97 Fix json11: allow trailing comma 2023-02-23 01:16:01 +03:00
4ab630b44d Use just sfdisk --json, --dump is not needed 2023-02-23 00:55:47 +03:00
2c8241b7db Remove PG "peered" state 2023-02-21 01:30:42 +03:00
36a7dd3671 Move tests to "make test" 2023-02-21 01:30:42 +03:00
936122bbcf Initialize msgr lazily in client to speedup vitastor-cli with RDMA enabled 2023-02-19 18:59:07 +03:00
1a1ba0d1e7 Add set_immediate to ringloop and use it for bs/osd ops to prevent reenterability issues 2023-02-09 17:37:26 +03:00
3d09c9cec7 Remove unused wait_sqe() from ringloop 2023-02-09 17:37:26 +03:00
3d08a1ad6c Fix cluster_client test after last reenterability fixes 2023-02-05 01:47:32 +03:00
499881d81c Fix typo 2023-01-27 01:52:02 +03:00
aba93b951b Fix incorrect EC free space statistics in vitastor-cli df output 2023-01-26 02:04:29 +03:00
d125fb1f30 Release 0.8.5
- Fix a possible "double free" bug in the client library happening on OSD restart
- Fix a possible write hang on PG history update when only epoch is changed
- Fix incorrect systemd target "local.target" in mon/make-etcd
- Allow "content" option in PVE storage plugin to allow to enable containers
- Build the client library without tcmalloc, which fixes "attempt to free invalid pointer"
  errors when, for example, trying to run QEMU with both Vitastor and Ceph RBD disks
2023-01-25 01:43:49 +03:00
9d3fd72298 Require liburing < 2 in rpm specs 2023-01-25 01:43:49 +03:00
8b552a01f9 Do not retry successful operation parts in client (could lead to "double free" bugs) 2023-01-25 01:30:36 +03:00
0385b2f9e8 Fix write hangs on PG epoch update - always set pg.history_changed to true 2023-01-25 01:30:15 +03:00
749c837045 Replace non-existing local.target with multi-user.target 2023-01-25 01:29:31 +03:00
98001d845b Remove version from vitastor-release.rpm links 2023-01-23 14:03:33 +03:00
c96bcae74b Allow the "content" option in the PVE storage plugin so that containers can be enabled 2023-01-16 18:14:45 +03:00
9f4e34a8cc Build client library without tcmalloc
Fixes "[src/tcmalloc.cc:332] Attempt to free invalid pointer ..." when trying
to run QEMU with both Vitastor and Ceph RBD disks and other possible allocator
collisions.
2023-01-15 00:01:11 +03:00
81fc8bb94c Release 0.8.4
New features:
- Implement QCOW2 image/snapshot export via qemu-img (bdrv_co_block_status in the driver)
- Remove OSDs from PG history during `vitastor-cli rm-osd` to prevent `left_on_dead` PG states after deletion
- Add a new recovery_pg_switch setting to mix all PGs during recovery, which almost
  eliminates the probability of ENOSPC during rebalance
- Introduce partial ENOSPC ("OSD is full") handling - now ENOSPC doesn't turn
  into cascades of crashes
- Add migration support to the Proxmox VE Vitastor driver
- Track last_clean_pgs on a per-pool basis, thus reducing data movement in a cluster
  with pools remaining unclean/degraded for a long time

Bug fixes:
- Fix a bug where monitor could generate degraded PGs if one of the hosts had no OSDs
- Fix a bug where monitor could skip PG redistribution with a lot of OSDs in cluster
- Report PG history synchronously on the first write, which improves both PG
  consistency and availability: history now gets reported correctly and is no
  longer reported when there is no need for it
- Fix possible write and recovery stalls which could happen in a cluster with both EC and replicated pools
- Make OSDs and monitors sanitize & deduplicate PG history items in etcd
- Fix non-working OSD peer config safety check
- Fix a rare journal flush stall where flushing wasn't activated when the journal was full but the flush queue was empty
- Fix builds without ISA-L (jerasure-only) crashing with EC N+K, K>=2 due to the lack of 16-byte buffer alignment
- Fix a possible crash for EC N+K, K>=2 when calculating a parity chunk with previous parity chunk missing
- Fix a bug where vitastor-disk purge with suppressed warnings didn't work
2023-01-13 23:59:54 +03:00
bc465c16de Fix arithmetic on void* for clang 2023-01-13 23:58:42 +03:00
8763e9211c Fix qemu driver compilation warning/error 2023-01-13 23:44:39 +03:00
9e1a80bd17 Replace apt-key with trusted.gpg.d 2023-01-13 19:51:47 +03:00
3e280f2f08 Mark vitastor as shared storage in PVE driver 2023-01-13 01:36:30 +03:00
fe87b4076b Fix backwards compatibility in cluster_client 2023-01-12 02:37:31 +03:00
a38957c1a7 Skip empty hosts in lp-optimizer 2023-01-09 16:26:16 +03:00
137309cf29 Implement bdrv_co_block_status for snapshot export support 2023-01-07 17:06:58 +03:00
373f9d0387 Try to re-peer PGs on history change 2023-01-06 12:46:44 +03:00
c4516ea971 Also remove deleted OSD from PG configuration and last_clean_pgs 2023-01-06 12:46:44 +03:00
91065c80fc Try to prevent left_on_dead when deleting OSDs by removing them from PG history 2023-01-06 12:46:43 +03:00
0f6b946add Time changes with every stat change, do not schedule checks based on it 2023-01-05 13:54:16 +03:00
465cbf0b2f Do not re-schedule recheck indefinitely, run it after mon_change_timeout in any case 2023-01-05 13:48:06 +03:00
41add50e4e Track last_clean_pgs on a per-pool basis 2023-01-03 02:20:50 +03:00
02e7be7dc9 Prevent reenterability side effects during PG history operation resume 2023-01-03 02:20:50 +03:00
73940adf07 Prioritize EC (non-instantly-stable) operations under journal pressure
This reduces the probability of hitting OSD stalls with EC due to "deadlocks"
where two parallel write operations wait for each other to complete
2023-01-03 00:05:45 +03:00
e950c024d3 Do not sync peer OSDs before listing
Sync before listing was added to wait for all PG writes possibly left in the queue
by the previous master to finish before listing it.

But in fact it may block the cluster when EC is used and some unstable writes are
left in the queue: they block journal flushing, and rollback/stabilize is required
to unblock them, but rollback/stabilize may only happen after the PG is peered.
Peering needs listings, listings are requested only after a sync, and the sync
itself waits for the currently blocked writes in the queue.
2023-01-03 00:05:45 +03:00
71d6d9f868 Fix possible crash on ENOSPC during operation cancel in blockstore 2023-01-03 00:05:45 +03:00
a4dfa519af Report PG history synchronously during write
This has two effects:
1) OSD sets aren't added into PG history until actual write attempts anymore,
   which removes unneeded extra osd_sets in PG history
2) New OSD sets are reported synchronously and can't be lost on PG restarts
   happening at the same time as reconfiguration
2023-01-01 23:41:05 +03:00
37a6aff2fa Write OSD numbers always as numbers in mon 2023-01-01 23:17:42 +03:00
67019f5b02 Make OSD sort & sanitize PG history items 2023-01-01 23:17:42 +03:00
0593e5c21c Fix OSD peer config safety check 2022-12-31 02:24:42 +03:00
998e24adf8 Add a new recovery_pg_switch setting to mix all PGs during recovery 2022-12-30 02:03:33 +03:00
d7bd36dc32 Fix another rare journal flush stall 2022-12-30 02:03:33 +03:00
cf5c562800 Log all object locations when peering PGs 2022-12-30 02:03:33 +03:00
629200b0cc Return ENOSPC as the primary OSD 2022-12-30 02:03:33 +03:00
3589ccec22 Do not disconnect peer on ENOSPC during write 2022-12-30 01:54:25 +03:00
8d55a1e780 Build osd_rmw_test both with and without ISA-L 2022-12-29 19:13:57 +03:00
65f6b3a4eb Fix jerasure crashing on bitmap calculation/restoration due to the lack of 16-byte alignment 2022-12-29 19:13:57 +03:00
fd216eac77 Add a test for missing parity chunk calculation 2022-12-29 19:13:57 +03:00
61fca7c426 Fix crash when calculating a parity chunk with previous parity chunk missing (test coming shortly) 2022-12-29 19:13:57 +03:00
1c29ed80b9 Fix quote in docs :) 2022-12-28 18:08:53 +03:00
68f3fb795e Suppress warnings in vitastor-disk purge correctly 2022-12-27 11:09:19 +03:00
fa90f287da Release 0.8.3
- Implement a new "vitastor-disk purge" command to remove OSDs with safety checks
- Implement a new "vitastor-cli rm-osd" command to only remove OSD metadata from etcd
- Fix a bug where the monitor could ignore OSD removal and other /osd/stats key changes
- Fix a bug where garbage could be returned when reading objects being written at the same time
- Fix a rare write stall where journal space could fail to be reclaimed when there
  were no new operations in the flush queue
- Fix a rare peering stall caused by a previous attempt to limit long listing operation queues
- Fix total object count statistic in OSD on object creation
- Add missing offset&len into vitastor-disk dump-journal for big_writes, fix JSON format
- Make vitastor-cli print help on missing command
- Make vitastor-cli translate all '-' to '_' in CLI options
2022-12-27 02:40:55 +03:00
795020674d Loop journal flusher when the queue is empty but there is a trim request 2022-12-27 02:28:20 +03:00
8e12285629 Fix vitastor-disk purge (now it works) 2022-12-27 02:28:20 +03:00
b9b50ab4cc Implement vitastor-disk purge command 2022-12-26 02:48:48 +03:00
0d8625f92d Make vitastor-cli print help on missing command 2022-12-26 02:48:48 +03:00
2f3c2c5140 Implement safety check for OSD removal, translate all '-' to '_' in cli options
'-' to '_' translation fixes a bug with create --image_meta
2022-12-26 02:48:48 +03:00
4ebdd02b0f Remove LIST op limiter
It doesn't prevent OSD slow ops but may itself lead to stalls :)
2022-12-26 02:48:48 +03:00
bf6fdc4141 Check add/rm osd with 2048 PGs 2022-12-26 02:48:48 +03:00
c2244331e6 Add vitastor-cli rm-osd command 2022-12-26 02:48:48 +03:00
3de57e87b1 Recheck OSD tree in monitor on /osd/stats changes 2022-12-26 02:48:48 +03:00
2d4cc688b2 Add a remove-osd test 2022-12-26 02:48:48 +03:00
31bd1ec145 Fix object creation check for statistics 2022-12-21 02:51:11 +03:00
c08d1f2dfe Add missing offset&len into big_writes journal dump, fix commas again 2022-12-21 02:51:11 +03:00
1d80bcc8d0 Fix blockstore returning garbage for unstable reads if there is an in-flight version
"In-flight" versions are added into dirty_db when writes are enqueued. And they
weren't ignored by subsequent reads even though they didn't have data location yet.
This bug was leading to test_heal.sh not passing sometimes with replicated setups.
2022-12-21 02:48:24 +03:00
5ef8bed75f Release 0.8.2
- Fix QEMU driver compatibility with QEMU 7.0 and < 2.9
- Add patches for pve-qemu-kvm 7.1 (PVE 7.3) and pve-qemu-kvm 6.2 (PVE 7.2)
- Fix Proxmox driver location in the pve-storage-vitastor package
- Disable HDD autodetection in non-hybrid mode
- Explicitly warn about buggy kernels on -EAGAIN in io_uring
- Final fix for the lack of zeroing out of old metadata entries
  (do not crash with "big_write journal_entry was allocated over another object"
  in some cases after an unclean OSD shutdown)
- Wait for data writes before fsyncing data if data fsync is enabled
- Never try to wait for free space inside blockstore, which could stall OSDs
- Fix a rare crash in osd_peering due to callback ordering
- Fix a rare duplication of ping & op message IDs
- Fix a rare use-after-free during pings
- Add --force to vitastor-disk read-sb
- Make vitastor-disk dump metadata object IDs in hex, add forgotten commas
- Fix vitastor-disk SCSI disk cache check
2022-12-17 17:54:13 +03:00
8669998e5e Fix discard_list_subop() for local ops 2022-12-17 17:54:13 +03:00
b457327e77 Oops. Fix metadata read after fixes :-) 2022-12-17 17:31:57 +03:00
f7fa9d5e34 Fix SCSI device cache type check 2022-12-17 17:31:57 +03:00
49b88b01f9 Fix clang build 2022-12-17 16:25:26 +03:00
71688bcb59 Disable HDD autodetection in non-hybrid mode 2022-12-17 16:12:15 +03:00
552e207d2b Explicitly print errors about -EAGAIN in io_uring 2022-12-17 15:49:49 +03:00
5464821fa5 Final fix for the lack of zeroing out of old metadata entries
If a crash occurs while flushing a redirect-write, the disk may end up containing
both the old and the new metadata entry. This is OK by itself, but prior to 0.8.0,
OSDs started without problems after this situation and then crashed after some
more overwrites with a "tried to overwrite non-zero metadata entry" error.
0.8.0 introduced a change intended to fix this situation, but rather than fixing
it, that change prevented OSDs from starting at all, now because of a "big_write
journal_entry was allocated over another object" error... :-)

This change finally fixes the original issue.

Followup to 54ef2c389f
2022-12-17 14:50:31 +03:00
6917a32ca8 Add --force to vitastor-disk read-sb 2022-12-17 02:47:15 +03:00
f8722a8bd5 Dump meta in hex 2022-12-17 01:50:38 +03:00
9c2f69c9fa Add forgotten commas to vitastor-disk dump-journal 2022-12-17 01:22:58 +03:00
1a93e3f33a Wait for data writes before fsyncing data if data fsync is enabled 2022-12-16 20:46:55 +03:00
3f35744052 Fix compatibility with QEMU aio_set_fd_handler signatures in 7.0 and < 2.9 2022-12-15 19:17:17 +03:00
66f14ac019 Update notes about Proxmox 7.1-7.3 2022-12-15 18:57:28 +03:00
1364009931 Add patches for pve-qemu-kvm 7.1 (PVE 7.3) and pve-qemu-kvm 6.2 (PVE 7.2) 2022-12-14 19:01:36 +03:00
d7e30b8353 Fix pve-storage-vitastor filename 2022-12-14 16:41:35 +03:00
cb437913d3 Never try to wait for free space inside blockstore 2022-12-12 00:27:05 +03:00
472bce58ab Fix rare crash in osd_peering due to callback ordering 2022-12-12 00:27:05 +03:00
7a71e7ef01 Fix possible duplication of ping & op message IDs 2022-12-04 00:16:47 +03:00
c71e5e7bbd Fix possible use-after-free during pings 2022-12-04 00:16:47 +03:00
8fdf30b21f Release 0.8.1
- Remove an additional data copy operation when flushing journal (should
  slightly increase write performance)
- Fix a bug where new writes in the inmemory_journal=false mode could overwrite
  the data currently read by a parallel read operation
- Fix degraded parity writes for EC N+K when K>1, where the bug could also lead
  to an "assertion failed" error
- Fix missing journal space check for "big" writes which could lead to
  "prefill_single_journal_entry(): assertion failed..." error in OSD
- Fix possible "assertion failed: next->prev_wait >= 0" in client in rare cases
- Fix missing "len" field in vitastor-disk write-journal big_writes
- Fix possible crash of a full OSD (ENOSPC)
- Fix CSI build scripts to include newest packages every time
- Fix CSI endpoint in the liveness probe manifest
2022-11-20 11:44:09 +03:00
238037ae31 Make journal trimmer wait until reads are completed when inmemory_journal is false
Without this, new writes could in theory overwrite journal data that is being read at that time
2022-11-20 01:49:21 +03:00
09a8864686 Fix degraded parity writes for EC N+K when K>1
Fixes possible `calc_rmw_parity_ec(): Assertion `bufs[i][curbuf[i]].buf' failed` error
2022-11-20 00:50:13 +03:00
6e6f6ecbb0 Add missing journal space check for big_writes
Fixes possible `prefill_single_journal_entry(): Assertion `!journal.sector_info[journal.cur_sector].flush_count' failed` error
2022-11-20 00:50:13 +03:00
9491f81419 Add missing documentation for OSD tags 2022-11-20 00:50:13 +03:00
44c2b30167 Take newest packages every time when rebuilding CSI 2022-11-20 00:50:13 +03:00
bf8a0581cd Fix possible "assertion failed: next->prev_wait >= 0" in client 2022-11-20 00:50:13 +03:00
5953942042 Add crc32c test utility 2022-11-20 00:50:13 +03:00
a276a1f737 Do not copy journal data additional time when flushing 2022-11-20 00:50:13 +03:00
cc24e5796e Add a FIXME 2022-11-20 00:50:09 +03:00
6e26732e6a Fix skipped "len" field in vitastor-disk write-journal big_writes 2022-11-12 12:01:40 +03:00
b4edc79449 Fix possible segfault on ENOSPC 2022-11-12 11:59:43 +03:00
5f26887d32 Fix csi endpoint in liveness probe 2022-11-10 18:37:37 +03:00
119 changed files with 3332 additions and 803 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.8.0")
set(VERSION "0.8.5")
add_subdirectory(src)

View File

@@ -48,9 +48,9 @@ Vitastor, составлены для того, чтобы убедиться,
интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
самой программы, так и прокси.
Сетевая Публичная Лицензия Vitastor разработана специально чтобы
Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
гарантировать, что в таких случаях и модифицированная версия программы, и
прокси оставались доступными сообществу. Для этого лицензия требует от
прокси останутся доступными сообществу. Для этого лицензия требует от
операторов сетевых серверов предоставлять исходный код оригинальной программы,
а также всех других программ, взаимодействующих с ней на их серверах,
пользователям этих серверов, на условиях свободных лицензий. Таким образом,

View File

@@ -18,15 +18,19 @@ ENV CSI_ENDPOINT=""
RUN apt-get update && \
apt-get install -y wget && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
(echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
(echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
apt-get update && \
apt-get install -y e2fsprogs xfsprogs vitastor kmod && \
apt-get install -y e2fsprogs xfsprogs kmod && \
apt-get clean && \
(echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
COPY --from=build /app/vitastor-csi /bin/
RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
apt-get update && \
apt-get install -y vitastor-client && \
apt-get clean
ENTRYPOINT ["/bin/vitastor-csi"]

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.8.0
VERSION ?= v0.8.5
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.0
image: vitalif/vitastor-csi:v0.8.5
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"
@@ -102,7 +102,7 @@ spec:
- "--health-port=9898"
env:
- name: CSI_ENDPOINT
value: unix://csi/csi.sock
value: unix:///csi/csi.sock
volumeMounts:
- mountPath: /csi
name: socket-dir

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.0
image: vitalif/vitastor-csi:v0.8.5
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.0"
vitastorCSIDriverVersion = "0.8.5"
)
// Config struct fills the parameters of request or user input

debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.8.0-1) unstable; urgency=medium
vitastor (0.8.5-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.0-1) unstable; urgency=medium
vitastor (0.8.5-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -1 +1 @@
patches/PVE_VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm
patches/VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/

View File

@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.0; \
cd vitastor-0.8.0; \
cp -r /root/vitastor vitastor-0.8.5; \
cd vitastor-0.8.5; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.0.orig.tar.xz vitastor-0.8.0; \
cd vitastor-0.8.0; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
cd vitastor-0.8.5; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart.
- [autosync_interval](#autosync_interval)
- [autosync_writes](#autosync_writes)
- [recovery_queue_depth](#recovery_queue_depth)
- [recovery_pg_switch](#recovery_pg_switch)
- [recovery_sync_batch](#recovery_sync_batch)
- [readonly](#readonly)
- [no_recovery](#no_recovery)
@@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time.
Currently it's the only parameter available to tune the speed of recovery
and rebalancing, but it's planned to implement more.
## recovery_pg_switch
- Type: integer
- Default: 128
Number of recovery operations before switching to recovery of the next PG.
The idea is to mix all PGs during recovery for more even space and load
distribution while still benefiting from a recovery queue depth greater than 1.
Degraded PGs are scanned first in any case.
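For illustration, a sketch of how the option could be changed cluster-wide, assuming
the default `/vitastor` etcd prefix and the `config/global` key (the endpoint address
is a placeholder):

```
# Sketch: raise recovery_pg_switch from the default 128 to 256.
# Note that 'put' overwrites the whole key, so include your other
# existing global settings in the JSON as well.
etcdctl --endpoints=http://10.0.0.1:2379 put /vitastor/config/global \
  '{"recovery_pg_switch":256}'
```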
## recovery_sync_batch
- Type: integer

View File

@@ -18,6 +18,7 @@
- [autosync_interval](#autosync_interval)
- [autosync_writes](#autosync_writes)
- [recovery_queue_depth](#recovery_queue_depth)
- [recovery_pg_switch](#recovery_pg_switch)
- [recovery_sync_batch](#recovery_sync_batch)
- [readonly](#readonly)
- [no_recovery](#no_recovery)
@@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD
для ускорения или замедления восстановления и перебалансировки данных, но
в планах реализация других параметров.
## recovery_pg_switch
- Тип: целое число
- Значение по умолчанию: 128
Число операций восстановления перед переключением на восстановление другой PG.
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
случае сканируются первыми.
## recovery_sync_batch
- Тип: целое число

View File

@@ -82,7 +82,7 @@ Parent node reference is required for intermediate tree nodes.
Separate OSD settings are set in etc keys `/vitastor/config/osd/<number>`
in JSON format `{"<key>":<value>}`.
As of now, there is only one setting:
As of now, two settings are supported:
## reweight
This means an OSD configured with reweight lower than 1 receives fewer PGs than
it normally would. An OSD with reweight = 0 won't store any data. You can set
reweight to 0 to trigger a rebalance and remove all data from an OSD.
## tags
- Type: string or array of strings
Sets one or multiple tags for this OSD. Tags can be used to group OSDs into
subsets and then use a specific subset for a pool instead of all OSDs.
For example, you can mark SSD OSDs with the tag "ssd" and HDD OSDs with "hdd",
and such tags will work as device classes.
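As an example (a sketch following the `{"<key>":<value>}` format described above;
the OSD number and etcd endpoint are placeholders), an SSD OSD could be tagged like this:

```
# Tag OSD 1 as "ssd" and keep its full weight.
etcdctl --endpoints=http://10.0.0.1:2379 put /vitastor/config/osd/1 \
  '{"reweight":1,"tags":["ssd"]}'
```

A pool can then be restricted to such OSDs via its tag filter (`osd_tags` in the
pool parameters).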
# Pool parameters
## name

View File

@@ -81,7 +81,10 @@
Настройки отдельных OSD задаются в ключах etcd `/vitastor/config/osd/<number>`
в JSON-формате `{"<key>":<value>}`.
На данный момент поддерживается одна настройка:
На данный момент поддерживаются две настройки:
- [reweight](#reweight)
- [tags](#tags)
## reweight
@@ -96,6 +99,15 @@
хранении данных вообще. Вы можете установить reweight в 0, чтобы убрать
все данные с OSD.
## tags
- Тип: строка или массив строк
Задаёт тег или набор тегов для данного OSD. Теги можно использовать, чтобы
делить OSD на множества и потом размещать пул только на части OSD, а не на
всех. Можно, например, пометить SSD OSD тегом "ssd", а HDD тегом "hdd", в
этом смысле теги работают аналогично классам устройств.
# Параметры
## name

View File

@@ -102,6 +102,20 @@
момент времени. На данный момент единственный параметр, который можно менять
для ускорения или замедления восстановления и перебалансировки данных, но
в планах реализация других параметров.
- name: recovery_pg_switch
type: int
default: 128
info: |
Number of recovery operations before switching to recovery of the next PG.
The idea is to mix all PGs during recovery for more even space and load
distribution while still benefiting from a recovery queue depth greater than 1.
Degraded PGs are scanned first in any case.
info_ru: |
Число операций восстановления перед переключением на восстановление другой PG.
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
случае сканируются первыми.
- name: recovery_sync_batch
type: int
default: 16

View File

@@ -9,7 +9,7 @@
## Debian
- Trust Vitastor package signing key:
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Add Vitastor package repository to your /etc/apt/sources.list:
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
@@ -20,8 +20,8 @@
## CentOS
- Add Vitastor package repository:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
- Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories:
- CentOS 7: `yum install centos-release-scl`

View File

@@ -9,7 +9,7 @@
## Debian
- Добавьте ключ репозитория Vitastor:
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
@@ -20,8 +20,8 @@
## CentOS
- Добавьте в систему репозиторий Vitastor:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
- Включите EPEL: `yum/dnf install epel-release`
- Включите дополнительные репозитории CentOS:
- CentOS 7: `yum install centos-release-scl`

View File

@@ -6,10 +6,10 @@
# Proxmox VE
To enable Vitastor support in Proxmox Virtual Environment (6.4 and 7.1 are supported):
To enable Vitastor support in Proxmox Virtual Environment (6.4-7.3 are supported):
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts
(buster for 6.4, bullseye for 7.1)
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
buster for 6.4, bullseye for 7.3, pve7.1 for 7.1, pve7.2 for 7.2
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),
@@ -35,5 +35,5 @@ vitastor: vitastor
vitastor_nbd 0
```
\* Note: you can also manually copy [patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) to Proxmox hosts
\* Note: you can also manually copy [patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) to Proxmox hosts
as `/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm` instead of installing pve-storage-vitastor.

View File

@@ -6,10 +6,10 @@
# Proxmox
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4 и 7.1):
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-7.3):
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox
(buster для 6.4, bullseye для 7.1)
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
buster для 6.4, bullseye для 7.3, pve7.1 для 7.1, pve7.2 для 7.2
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
@@ -35,5 +35,5 @@ vitastor: vitastor
```
\* Примечание: вместо установки пакета pve-storage-vitastor вы можете вручную скопировать файл
[patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) на хосты Proxmox как
[patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) на хосты Proxmox как
`/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm`.

View File

@@ -70,7 +70,7 @@ For EC pools the configuration should look like the following:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
```
After you do this, one of the monitors will configure PGs and OSDs will start them.

View File

@@ -71,7 +71,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
```
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.

View File

@@ -14,12 +14,14 @@ It supports the following commands:
- [df](#df)
- [ls](#ls)
- [create](#create)
- [snap-create](#create)
- [modify](#modify)
- [rm](#rm)
- [flatten](#flatten)
- [rm-data](#rm-data)
- [merge-data](#merge-data)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
Global options:
@@ -122,6 +124,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active.
See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
## modify
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
@@ -175,3 +179,14 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
`vitastor-cli alloc-osd`
Allocate a new OSD number and reserve it by creating empty `/osd/stats/<n>` key.
## rm-osd
`vitastor-cli rm-osd [--force] [--allow-data-loss] [--dry-run] <osd_id> [osd_id...]`
Remove metadata and configuration for specified OSD(s) from etcd.
Refuses to remove OSDs with data without `--force` and `--allow-data-loss`.
With `--dry-run`, it only checks whether deletion is possible without data loss and
redundancy degradation.
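A usage sketch with hypothetical OSD numbers:

```
# First check whether OSDs 5 and 6 can be removed safely...
vitastor-cli rm-osd --dry-run 5 6
# ...then actually remove their metadata from etcd.
vitastor-cli rm-osd 5 6
```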

View File

@@ -15,12 +15,14 @@ vitastor-cli - интерфейс командной строки для адм
- [df](#df)
- [ls](#ls)
- [create](#create)
- [snap-create](#create)
- [modify](#modify)
- [rm](#rm)
- [flatten](#flatten)
- [rm-data](#rm-data)
- [merge-data](#merge-data)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
Глобальные опции:
@@ -125,6 +127,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
Создать снимок образа `<name>` (можно использовать любую форму команды). Снимок можно создавать без остановки
клиентов, если пишущий клиент максимум 1.
Смотрите также информацию о том, [как экспортировать снимки](qemu.ru.md#экспорт-снимков).
## modify
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
@@ -186,3 +190,14 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
Атомарно выделить новый номер OSD и зарезервировать его, создав в etcd пустой
ключ `/osd/stats/<n>`.
## rm-osd
`vitastor-cli rm-osd [--force] [--allow-data-loss] [--dry-run] <osd_id> [osd_id...]`
Удалить метаданные и конфигурацию для заданных OSD из etcd.
Отказывается удалять OSD с данными без опций `--force` и `--allow-data-loss`.
С опцией `--dry-run` только проверяет, возможно ли удаление без потери данных и деградации
избыточности.

View File

@@ -14,6 +14,7 @@ It supports the following commands:
- [upgrade-simple](#upgrade-simple)
- [resize](#resize)
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
- [purge](#purge)
- [read-sb](#read-sb)
- [write-sb](#write-sb)
- [udev](#udev)
@@ -155,11 +156,22 @@ Commands are passed to `systemctl` with `vitastor-osd@<num>` units as arguments.
When `--now` is added to enable/disable, OSDs are also immediately started/stopped.
## purge
`vitastor-disk purge [--force] [--allow-data-loss] <device> [device2 device3 ...]`
Purge Vitastor OSD(s) on specified device(s). Uses `vitastor-cli rm-osd` to check
if deletion is possible without data loss and to actually remove metadata from etcd.
`--force` and `--allow-data-loss` options may be used to ignore safety check results.
Requires `vitastor-cli`, `sfdisk` and `partprobe` (from parted) utilities.
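A usage sketch with a hypothetical device path:

```
# Remove the Vitastor OSD(s) on /dev/sdc; vitastor-cli rm-osd is used
# internally for the safety check.
vitastor-disk purge /dev/sdc
# Bypass the safety check only if losing the data is acceptable:
# vitastor-disk purge --force --allow-data-loss /dev/sdc
```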
## read-sb
`vitastor-disk read-sb <device>`
`vitastor-disk read-sb [--force] <device>`
Try to read Vitastor OSD superblock from `<device>` and print it in JSON format.
`--force` allows ignoring validation errors.
## write-sb

View File

@@ -14,6 +14,7 @@ vitastor-disk - инструмент командной строки для уп
- [upgrade-simple](#upgrade-simple)
- [resize](#resize)
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
- [purge](#purge)
- [read-sb](#read-sb)
- [write-sb](#write-sb)
- [udev](#udev)
@@ -158,12 +159,25 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
Когда к командам включения/выключения добавляется параметр `--now`, OSD также сразу
запускаются/останавливаются.
## purge
`vitastor-disk purge [--force] [--allow-data-loss] <device> [device2 device3 ...]`
Удалить OSD на заданном диске/дисках. Использует `vitastor-cli rm-osd` для проверки
возможности удаления без потери данных и для удаления OSD из etcd. Опции `--force`
и `--allow-data-loss` служат для обхода данной защиты в случае необходимости.
Команде требуются утилиты `vitastor-cli`, `sfdisk` и `partprobe` (из состава parted).
## read-sb
`vitastor-disk read-sb <device>`
`vitastor-disk read-sb [--force] <device>`
Прочитать суперблок OSD с диска `<device>` и вывести его в формате JSON.
Опция `--force` позволяет читать суперблок, даже если он считается некорректным
из-за ошибок валидации.
## write-sb
`vitastor-disk write-sb <device>`

View File

@@ -46,3 +46,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
if you don't want to use inode metadata.
### Exporting snapshots
Starting with 0.8.4, you can also export individual layers (snapshot diffs) using `qemu-img`.
Suppose you have an image `testimg` and a snapshot `testimg@0` created with `vitastor-cli snap-create testimg@0`.
Then you can export the `testimg@0` snapshot and the data written to `testimg` after creating
the snapshot separately, using the following commands (the key points are the use of
`skip-parents=1` and the `-B backing_file` option):
```
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
-O qcow2 testimg_0.qcow2
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
```
In fact, with `cluster_size=4k`, any QCOW2 file can be used instead of `-B testimg_0.qcow2`, even an empty one.
The QCOW2 `cluster_size=4k` option is required if you want `testimg.qcow2` to contain only the data
overwritten **exactly** in the child layer. With the default 64 KB QCOW2 cluster size you'll
get a bit of extra data from parent layers, i.e. a 4 KB overwrite will result in `testimg.qcow2`
containing 64 KB of data. This extra data is taken by `qemu-img` from the file passed in the
`-B` option, so you really need a 4 KB cluster size if you use an empty image in `-B`.
After this procedure you'll get two chained QCOW2 images. To detach `testimg.qcow2` from
its parent, run:
```
qemu-img rebase -u -b '' testimg.qcow2
```
This can be used for backups. Just note that exporting an image that is currently being
written to is of course unsafe and doesn't produce a consistent result, so if you do this
on a live VM, only export snapshots.

View File

@@ -50,3 +50,40 @@ qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
### Экспорт снимков
Начиная с 0.8.4 вы можете экспортировать отдельные слои (изменения в снимках) с помощью `qemu-img`.
Допустим, что у вас есть образ `testimg` и его снимок `testimg@0`, созданный с помощью `vitastor-cli snap-create testimg@0`.
Тогда вы можете выгрузить снимок `testimg@0` и данные, изменённые в `testimg` после создания снимка, отдельно,
с помощью следующих команд (ключевые моменты - использование `skip-parents=1` и опции `-B backing_file.qcow2`):
```
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg@0' \
-O qcow2 testimg_0.qcow2
qemu-img convert -f raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=testimg:skip-parents=1' \
-O qcow2 -o 'cluster_size=4k' -B testimg_0.qcow2 testimg.qcow2
```
На самом деле, с `cluster_size=4k` вместо `-B testimg_0.qcow2` можно использовать любой qcow2-файл,
даже пустой.
Опция QCOW2 `cluster_size=4k` нужна, если вы хотите, чтобы `testimg.qcow2` содержал **в точности**
данные, перезаписанные в дочернем слое. С размером кластера QCOW2 по умолчанию, составляющим 64 КБ,
вы получите немного "лишних" данных из родительских слоёв - перезапись 4 КБ будет приводить к тому,
что в `testimg.qcow2` будет появляться 64 КБ данных. Причём "лишние" данные qemu-img будет брать
как раз из файла, указанного в опции `-B`, так что если там указан пустой образ, кластер обязан быть 4 КБ.
После данной процедуры вы получите два QCOW2-образа, связанных в цепочку. Чтобы "отцепить" образ
`testimg.qcow2` от базового, выполните:
```
qemu-img rebase -u -b '' testimg.qcow2
```
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.

json11

Submodule json11 updated: 52a3af664f...fd37016cf8

View File

@@ -21,7 +21,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
{
for (const pg of oh.osd_sets)
{
nh.osd_sets[pg.join(' ')] = pg;
nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num));
}
}
if (oh && oh.all_peers && oh.all_peers.length)

View File

@@ -550,8 +550,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
seed ^= seed << 5;
return seed + 2147483648;
};
const hosts = Object.keys(osd_tree).sort();
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
const r = {};
// Generate random combinations including each OSD at least once
for (let h = 0; h < hosts.length; h++)

View File

@@ -79,7 +79,7 @@ StartLimitInterval=0
RestartSec=10
[Install]
WantedBy=local.target
WantedBy=multi-user.target
`);
await system(`useradd etcd`);
await system(`systemctl daemon-reload`);

View File

@@ -261,7 +261,7 @@ const etcd_tree = {
/* <pool_id>: {
<pg_id>: {
primary: osd_num_t,
state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[],
}
@@ -663,12 +663,15 @@ class Mon
async save_last_clean()
{
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
next_pool:
for (const pool_id in this.state.config.pools)
{
new_clean_pgs.items[pool_id] = (this.state.history.last_clean_pgs.items||{})[pool_id];
const pool_cfg = this.state.config.pools[pool_id];
if (!this.validate_pool_cfg(pool_id, pool_cfg, false))
{
continue;
continue next_pool;
}
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
{
@@ -677,17 +680,18 @@ class Mon
!(this.state.pg.state[pool_id][pg_num].state instanceof Array))
{
// Unclean
return;
continue next_pool;
}
let st = this.state.pg.state[pool_id][pg_num].state.join(',');
if (st != 'active' && st != 'active,left_on_dead' && st != 'left_on_dead,active')
{
// Unclean
return;
continue next_pool;
}
}
new_clean_pgs.items[pool_id] = this.state.config.pgs.items[pool_id];
}
this.state.history.last_clean_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
this.state.history.last_clean_pgs = new_clean_pgs;
await this.etcd_call('/kv/txn', {
success: [ { requestPut: {
key: b64(this.etcd_prefix+'/history/last_clean_pgs'),
@@ -1374,16 +1378,14 @@ class Mon
// This is required for multiple change events to trigger at most 1 recheck in 1s
schedule_recheck()
{
if (this.recheck_timer)
if (!this.recheck_timer)
{
clearTimeout(this.recheck_timer);
this.recheck_timer = null;
this.recheck_timer = setTimeout(() =>
{
this.recheck_timer = null;
this.recheck_pgs().catch(this.die);
}, this.config.mon_change_timeout || 1000);
}
this.recheck_timer = setTimeout(() =>
{
this.recheck_timer = null;
this.recheck_pgs().catch(this.die);
}, this.config.mon_change_timeout || 1000);
}
sum_op_stats(timestamp, prev_stats)
@@ -1693,6 +1695,7 @@ class Mon
// Do not clear these to null
kv.value = kv.value || {};
}
const old = cur[key_parts[key_parts.length-1]];
cur[key_parts[key_parts.length-1]] = kv.value;
if (key === 'config/global')
{
@@ -1717,7 +1720,12 @@ class Mon
}
else if (key_parts[0] === 'osd' && key_parts[1] === 'stats')
{
// Recheck PGs <osd_out_time> later
// Recheck OSD tree on OSD addition/deletion
if ((!old) != (!kv.value) || old && kv.value && old.size != kv.value.size)
{
this.schedule_recheck();
}
// Recheck PGs <osd_out_time> after last OSD statistics report
this.schedule_next_recheck_at(
!this.state.osd.stats[key[2]] ? 0 : this.state.osd.stats[key[2]].time+this.config.osd_out_time
);

View File

@@ -16,6 +16,11 @@ use PVE::Tools qw(run_command);
use base qw(PVE::Storage::Plugin);
if (@PVE::Storage::Plugin::SHARED_STORAGE)
{
push @PVE::Storage::Plugin::SHARED_STORAGE, 'vitastor';
}
sub api
{
# Trick it :)
@@ -133,9 +138,11 @@ sub properties
sub options
{
return {
shared => { optional => 1 },
content => { optional => 1 },
nodes => { optional => 1 },
disable => { optional => 1 },
vitastor_etcd_address => { optional => 1},
vitastor_etcd_address => { optional => 1 },
vitastor_etcd_prefix => { optional => 1 },
vitastor_config_path => { optional => 1 },
vitastor_prefix => { optional => 1 },

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.0'
VERSION = '0.8.5'
LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,169 @@
Index: qemu/block/meson.build
===================================================================
--- qemu.orig/block/meson.build
+++ qemu/block/meson.build
@@ -91,6 +91,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: qemu/meson.build
===================================================================
--- qemu.orig/meson.build
+++ qemu/meson.build
@@ -838,6 +838,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1459,6 +1479,7 @@ config_host_data.set('CONFIG_LINUX_AIO',
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
@@ -3424,6 +3445,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
Index: qemu/meson_options.txt
===================================================================
--- qemu.orig/meson_options.txt
+++ qemu/meson_options.txt
@@ -121,6 +121,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('gtk', type : 'feature', value : 'auto',
description: 'GTK+ user interface')
option('sdl', type : 'feature', value : 'auto',
Index: qemu/qapi/block-core.json
===================================================================
--- qemu.orig/qapi/block-core.json
+++ qemu/qapi/block-core.json
@@ -3179,7 +3179,7 @@
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -4125,6 +4125,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4520,6 +4542,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4910,6 +4933,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5108,6 +5142,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: qemu/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- qemu.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ qemu/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -183,6 +183,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \

View File

@@ -0,0 +1,169 @@
Index: qemu/block/meson.build
===================================================================
--- qemu.orig/block/meson.build
+++ qemu/block/meson.build
@@ -111,6 +111,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: qemu/meson.build
===================================================================
--- qemu.orig/meson.build
+++ qemu/meson.build
@@ -967,6 +967,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1802,6 +1822,7 @@ config_host_data.set('CONFIG_NUMA', numa
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
@@ -3965,6 +3986,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: qemu/meson_options.txt
===================================================================
--- qemu.orig/meson_options.txt
+++ qemu/meson_options.txt
@@ -167,6 +167,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: qemu/qapi/block-core.json
===================================================================
--- qemu.orig/qapi/block-core.json
+++ qemu/qapi/block-core.json
@@ -3209,7 +3209,7 @@
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -4149,6 +4149,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4593,6 +4615,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4985,6 +5008,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5182,6 +5216,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: qemu/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- qemu.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ qemu/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -179,6 +179,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \

View File

@@ -9,7 +9,7 @@ for i in "$DIR"/qemu-*-vitastor.patch "$DIR"/pve-qemu-*-vitastor.patch; do
echo '===================================================================' >> $i
echo '--- /dev/null' >> $i
echo '+++ a/block/vitastor.c' >> $i
echo '@@ -0,0 +1,'$(wc -l "$DIR"/../src/qemu_driver.c)' @@' >> $i
echo '@@ -0,0 +1,'$(wc -l "$DIR"/../src/qemu_driver.c | cut -d ' ' -f 1)' @@' >> $i
cat "$DIR"/../src/qemu_driver.c | sed 's/^/+/' >> $i
fi
done

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.0$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *

View File

@@ -58,7 +58,7 @@
+BuildRequires: gperftools-devel
+BuildRequires: libusbx-devel >= 1.0.21
%if %{have_usbredir}
BuildRequires: usbredir-devel >= 0.8.0
BuildRequires: usbredir-devel >= 0.8.2
%endif
@@ -856,12 +861,13 @@ BuildRequires: virglrenderer-devel
# For smartcard NSS support

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.0
Version: 0.8.5
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.0.el7.tar.gz
Source0: vitastor-0.8.5.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -35,6 +35,7 @@ Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
+Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release}
Requires: util-linux
Requires: parted
@@ -59,6 +60,7 @@ scheduling cluster-level operations.
%package -n vitastor-client
Summary: Vitastor - client
Requires: liburing >= 0.6
+Requires: liburing < 2
%description -n vitastor-client

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
-cp /root/vitastor-0.8.0.el8.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
-Version: 0.8.0
+Version: 0.8.5
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
-Source0: vitastor-0.8.0.el8.tar.gz
+Source0: vitastor-0.8.5.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -34,6 +34,7 @@ Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
+Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release}
Requires: util-linux
Requires: parted
@@ -57,6 +58,7 @@ scheduling cluster-level operations.
%package -n vitastor-client
Summary: Vitastor - client
Requires: liburing >= 0.6
+Requires: liburing < 2
%description -n vitastor-client

View File

@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
include(GNUInstallDirs)
+include(CTest)
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
set(WITH_FIO true CACHE BOOL "Build FIO driver")
@@ -15,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.0")
add_definitions(-DVERSION="0.8.5")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -55,6 +56,14 @@ if (ISAL_LIBRARIES)
add_definitions(-DWITH_ISAL)
endif (ISAL_LIBRARIES)
+add_custom_target(build_tests)
+add_custom_target(test
+COMMAND
+echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
+env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
+)
+add_dependencies(test build_tests)
include_directories(
../
/usr/include/jerasure
@@ -89,7 +98,7 @@ endif (${WITH_FIO})
# libvitastor_common.a
set(MSGR_RDMA "")
if (IBVERBS_LIBRARIES)
set(MSGR_RDMA "msgr_rdma.cpp")
set(MSGR_RDMA msgr_rdma.cpp freelist.cpp allocator.cpp)
endif (IBVERBS_LIBRARIES)
add_library(vitastor_common STATIC
epoll_manager.cpp etcd_state_client.cpp messenger.cpp addr_util.cpp
@@ -140,11 +149,11 @@ add_library(vitastor_client SHARED
cli_merge.cpp
cli_rm_data.cpp
cli_rm.cpp
+cli_rm_osd.cpp
)
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
target_link_libraries(vitastor_client
vitastor_common
-tcmalloc_minimal
${LIBURING_LIBRARIES}
${IBVERBS_LIBRARIES}
)
@@ -234,8 +243,18 @@ add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp)
target_link_libraries(osd_test tcmalloc_minimal)
# osd_rmw_test
-add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
+add_executable(osd_rmw_test EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
+add_dependencies(build_tests osd_rmw_test)
+add_test(NAME osd_rmw_test COMMAND osd_rmw_test)
+if (ISAL_LIBRARIES)
+add_executable(osd_rmw_test_je EXCLUDE_FROM_ALL osd_rmw_test.cpp allocator.cpp)
+target_compile_definitions(osd_rmw_test_je PUBLIC -DNO_ISAL)
+target_link_libraries(osd_rmw_test_je Jerasure tcmalloc_minimal)
+add_dependencies(build_tests osd_rmw_test_je)
+add_test(NAME osd_rmw_test_jerasure COMMAND osd_rmw_test_je)
+endif (ISAL_LIBRARIES)
# stub_uring_osd
add_executable(stub_uring_osd
@@ -249,11 +268,20 @@ target_link_libraries(stub_uring_osd
)
# osd_peering_pg_test
-add_executable(osd_peering_pg_test osd_peering_pg_test.cpp osd_peering_pg.cpp)
+add_executable(osd_peering_pg_test EXCLUDE_FROM_ALL osd_peering_pg_test.cpp osd_peering_pg.cpp)
target_link_libraries(osd_peering_pg_test tcmalloc_minimal)
+add_dependencies(build_tests osd_peering_pg_test)
+add_test(NAME osd_peering_pg_test COMMAND osd_peering_pg_test)
# test_allocator
-add_executable(test_allocator test_allocator.cpp allocator.cpp)
+add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp allocator.cpp)
+add_dependencies(build_tests test_allocator)
+add_test(NAME test_allocator COMMAND test_allocator)
+# test_freelist
+add_executable(test_freelist EXCLUDE_FROM_ALL test_freelist.cpp)
+add_dependencies(build_tests test_freelist)
+add_test(NAME test_freelist COMMAND test_freelist)
# test_cas
add_executable(test_cas
@@ -263,14 +291,25 @@ target_link_libraries(test_cas
vitastor_client
)
# test_crc32
add_executable(test_crc32
test_crc32.cpp
)
target_link_libraries(test_crc32
vitastor_blk
)
# test_cluster_client
add_executable(test_cluster_client
+EXCLUDE_FROM_ALL
test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
)
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
+add_dependencies(build_tests test_cluster_client)
+add_test(NAME test_cluster_client COMMAND test_cluster_client)
## test_blockstore, test_shit
#add_executable(test_blockstore test_blockstore.cpp)

View File

@@ -35,24 +35,14 @@ journal_flusher_co::journal_flusher_co()
{
bs->live = true;
-if (data->res != data->iov.iov_len)
-{
-throw std::runtime_error(
-"data read operation failed during flush ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
-"). can't continue, sorry :-("
-);
-}
+bs->disk_error_abort("read operation during flush", data->res, data->iov.iov_len);
wait_count--;
};
simple_callback_w = [this](ring_data_t* data)
{
bs->live = true;
-if (data->res != data->iov.iov_len)
-{
-throw std::runtime_error(
-"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
-"). state "+std::to_string(wait_state)+". in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
-);
-}
+bs->disk_error_abort("write operation during flush", data->res, data->iov.iov_len);
wait_count--;
};
}
@@ -87,7 +77,7 @@ void journal_flusher_t::loop()
cur_flusher_count--;
}
}
-for (int i = 0; (active_flushers > 0 || dequeuing) && i < cur_flusher_count; i++)
+for (int i = 0; (active_flushers > 0 || dequeuing || trim_wanted > 0) && i < cur_flusher_count; i++)
co[i].loop();
}
@@ -172,7 +162,8 @@ void journal_flusher_t::mark_trim_possible()
if (trim_wanted > 0)
{
dequeuing = true;
-journal_trim_counter++;
+if (!journal_trim_counter)
+journal_trim_counter = journal_trim_interval;
bs->ringloop->wakeup();
}
}
@@ -306,6 +297,8 @@ bool journal_flusher_co::loop()
goto resume_20;
else if (wait_state == 21)
goto resume_21;
+else if (wait_state == 22)
+goto resume_22;
resume_0:
if (flusher->flush_queue.size() < flusher->min_flusher_count && !flusher->trim_wanted ||
!flusher->flush_queue.size() || !flusher->dequeuing)
@@ -511,6 +504,13 @@ resume_1:
);
wait_count++;
}
+// Wait for data writes before fsyncing it
+resume_22:
+if (wait_count > 0)
+{
+wait_state = 22;
+return false;
+}
// Sync data before writing metadata
resume_16:
resume_17:
@@ -521,7 +521,7 @@ resume_1:
return false;
}
resume_5:
-// And metadata writes, but only after data writes complete
+// Submit metadata writes, but only when data is written and fsynced
if (!bs->inmemory_meta && meta_new.it->second.state == 0 || wait_count > 0)
{
// metadata sector is still being read or data is still being written, wait for it
@@ -615,7 +615,12 @@ resume_1:
}
for (it = v.begin(); it != v.end(); it++)
{
-free(it->buf);
+// Free it if it's not taken from the journal
+if (it->buf && (!bs->journal.inmemory || it->buf < bs->journal.buffer ||
+it->buf >= (uint8_t*)bs->journal.buffer + bs->journal.len))
+{
+free(it->buf);
+}
}
v.clear();
// And sync metadata (in batches - not per each operation!)
@@ -760,16 +765,17 @@ bool journal_flusher_co::scan_dirty(int wait_base)
{
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
-it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign_or_die(MEM_ALIGNMENT, submit_len) });
+it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len });
copy_count++;
if (bs->journal.inmemory)
{
-// Take it from memory
-memcpy(it->buf, (uint8_t*)bs->journal.buffer + submit_offset, submit_len);
+// Take it from memory, don't copy it
+it->buf = (uint8_t*)bs->journal.buffer + submit_offset;
}
else
{
// Read it from disk
+it->buf = memalign_or_die(MEM_ALIGNMENT, submit_len);
await_sqe(0);
data->iov = (struct iovec){ it->buf, (size_t)submit_len };
data->callback = simple_callback_r;

View File

@@ -107,7 +107,7 @@ void blockstore_impl_t::loop()
// has_writes == 0 - no writes before the current queue item
// has_writes == 1 - some writes in progress
// has_writes == 2 - tried to submit some writes, but failed
-int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
+int has_writes = 0, op_idx = 0, new_idx = 0;
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
{
auto op = submit_queue[op_idx];
@@ -188,16 +188,12 @@ void blockstore_impl_t::loop()
else if (op->opcode == BS_OP_LIST)
{
// LIST doesn't have to be blocked by previous modifications
-// But don't do a lot of LISTs at once, because they're blocking and potentially slow
-if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
-{
-process_list(op);
-done_lists++;
-wr_st = 2;
-}
+process_list(op);
+wr_st = 2;
}
if (wr_st == 2)
{
+submit_queue[op_idx] = NULL;
new_idx--;
}
if (wr_st == 0)
@@ -306,17 +302,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
}
-else if (PRIV(op)->wait_for == WAIT_FREE)
-{
-if (!data_alloc->get_free_count() && flusher->is_active())
-{
-#ifdef BLOCKSTORE_DEBUG
-printf("Still waiting for free space on the data device\n");
-#endif
-return;
-}
@@ -340,7 +325,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
{
// Basic verification not passed
op->retval = -EINVAL;
-std::function<void (blockstore_op_t*)>(op->callback)(op);
+ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
if (op->opcode == BS_OP_SYNC_STAB_ALL)
@@ -383,7 +368,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
}
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
{
-std::function<void (blockstore_op_t*)>(op->callback)(op);
+ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
// Call constructor without allocating memory. We'll call destructor before returning op back
@@ -598,7 +583,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
}
}
-else if (IS_STABLE(dirty_it->second.state))
+else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
{
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
@@ -687,3 +672,16 @@ void blockstore_impl_t::dump_diagnostics()
journal.dump_diagnostics();
flusher->dump_diagnostics();
}
+void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
+{
+if (retval == -EAGAIN)
+{
+fprintf(stderr, "EAGAIN error received from a disk %s during flush."
+" It must never happen with io_uring and indicates a kernel bug."
+" Please upgrade your kernel. Aborting.\n", op);
+exit(1);
+}
+fprintf(stderr, "Disk %s failed: result is %d, expected %d. Can't continue, sorry :-(\n", op, retval, expected);
+exit(1);
+}

View File

@@ -160,12 +160,11 @@ struct __attribute__((__packed__)) dirty_entry
#define WAIT_JOURNAL 3
// Suspend operation until the next journal sector buffer is free
#define WAIT_JOURNAL_BUFFER 4
-// Suspend operation until there is some free space on the data device
-#define WAIT_FREE 5
struct fulfill_read_t
{
uint64_t offset, len;
+uint64_t journal_sector; // sector+1 if used and !journal.inmemory, otherwise 0
};
#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
@@ -241,8 +240,6 @@ class blockstore_impl_t
int throttle_target_parallelism = 1;
// Minimum difference in microseconds between target and real execution times to throttle the response
int throttle_threshold_us = 50;
-// Maximum number of LIST operations to be processed between
-int single_tick_list_limit = 1;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
@@ -293,6 +290,7 @@ class blockstore_impl_t
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
+void disk_error_abort(const char *op, int retval, int expected);
// Asynchronous init
int initialized;
@@ -305,7 +303,7 @@ class blockstore_impl_t
// Read
int dequeue_read(blockstore_op_t *read_op);
int fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
-uint32_t item_state, uint64_t item_version, uint64_t item_location);
+uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector);
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
uint32_t item_state, uint64_t item_version);
void handle_read_event(ring_data_t *data, blockstore_op_t *op);

View File

@@ -48,14 +48,12 @@ void blockstore_init_meta::handle_event(ring_data_t *data, int buf_num)
int blockstore_init_meta::loop()
{
-if (wait_state == 1)
-goto resume_1;
-else if (wait_state == 2)
-goto resume_2;
-else if (wait_state == 3)
-goto resume_3;
-else if (wait_state == 4)
-goto resume_4;
+if (wait_state == 1) goto resume_1;
+else if (wait_state == 2) goto resume_2;
+else if (wait_state == 3) goto resume_3;
+else if (wait_state == 4) goto resume_4;
+else if (wait_state == 5) goto resume_5;
+else if (wait_state == 6) goto resume_6;
printf("Reading blockstore metadata\n");
if (bs->inmemory_meta)
metadata_buffer = bs->metadata_buffer;
@@ -140,6 +138,7 @@ resume_1:
// Skip superblock
md_offset = bs->dsk.meta_block_size;
next_offset = md_offset;
+entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
// Read the rest of the metadata
resume_2:
if (next_offset < bs->dsk.meta_len && submitted == 0)
@@ -179,17 +178,15 @@ resume_2:
if (bufs[i].state == INIT_META_READ_DONE)
{
// Handle result
-unsigned entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
bool changed = false;
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
{
// handle <count> entries
-changed = changed || handle_entries(
-bufs[i].buf + sector, entries_per_block,
-((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block
-);
+if (handle_meta_block(bufs[i].buf + sector, entries_per_block,
+((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block))
+changed = true;
}
-if (changed && !bs->inmemory_meta)
+if (changed && !bs->inmemory_meta && !bs->readonly)
{
// write the modified buffer back
GET_SQE();
@@ -211,6 +208,43 @@ resume_2:
wait_state = 2;
return 1;
}
+if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
+{
+// we have to zero out additional entries
+for (i = 0; i < entries_to_zero.size(); )
+{
+next_offset = entries_to_zero[i]/entries_per_block;
+for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
+GET_SQE();
+data->iov = { metadata_buffer, bs->dsk.meta_block_size };
+data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
+my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
+submitted++;
+resume_5:
+if (submitted > 0)
+{
+wait_state = 5;
+return 1;
+}
+for (; i < j; i++)
+{
+uint64_t pos = (entries_to_zero[i] % entries_per_block);
+memset((uint8_t*)metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
+}
+GET_SQE();
+data->iov = { metadata_buffer, bs->dsk.meta_block_size };
+data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
+my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
+submitted++;
+resume_6:
+if (submitted > 0)
+{
+wait_state = 6;
+return 1;
+}
+}
+entries_to_zero.clear();
+}
// metadata read finished
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
if (!bs->inmemory_meta)
@@ -236,10 +270,13 @@ resume_2:
return 0;
}
-bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt)
+bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
{
bool updated = false;
-for (uint64_t i = 0; i < count; i++)
+uint64_t max_i = entries_per_block;
+if (max_i > bs->dsk.block_count-done_cnt)
+max_i = bs->dsk.block_count-done_cnt;
+for (uint64_t i = 0; i < max_i; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)(buf + i*bs->dsk.clean_entry_size);
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
@@ -255,17 +292,35 @@ bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t
if (clean_it != clean_db.end())
{
// free the previous block
-// here we have to zero out the entry because otherwise we'll hit
+// here we have to zero out the previous entry because otherwise we'll hit
// "tried to overwrite non-zero metadata entry" later
-updated = true;
-memset(entry, 0, bs->dsk.clean_entry_size);
+uint64_t old_clean_loc = clean_it->second.location >> bs->dsk.block_order;
+if (bs->inmemory_meta)
+{
+uint64_t sector = (old_clean_loc / entries_per_block) * bs->dsk.meta_block_size;
+uint64_t pos = (old_clean_loc % entries_per_block);
+clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)bs->metadata_buffer + sector + pos*bs->dsk.clean_entry_size);
+memset(old_entry, 0, bs->dsk.clean_entry_size);
+}
+else if (old_clean_loc >= done_cnt)
+{
+updated = true;
+uint64_t sector = ((old_clean_loc - done_cnt) / entries_per_block) * bs->dsk.meta_block_size;
+uint64_t pos = (old_clean_loc % entries_per_block);
+clean_disk_entry *old_entry = (clean_disk_entry*)(buf + sector + pos*bs->dsk.clean_entry_size);
+memset(old_entry, 0, bs->dsk.clean_entry_size);
+}
+else
+{
+entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
+}
#ifdef BLOCKSTORE_DEBUG
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
-clean_it->second.location >> bs->dsk.block_order,
+old_clean_loc,
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
done_cnt+i);
#endif
-bs->data_alloc->set(clean_it->second.location >> bs->dsk.block_order, false);
+bs->data_alloc->set(old_clean_loc, false);
}
else
{

View File

@@ -24,7 +24,10 @@ class blockstore_init_meta
uint64_t md_offset = 0;
uint64_t next_offset = 0;
uint64_t entries_loaded = 0;
-bool handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt);
+unsigned entries_per_block = 0;
+int i = 0, j = 0;
+std::vector<uint64_t> entries_to_zero;
+bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);

View File

@@ -198,10 +198,7 @@ void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_i
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
-throw std::runtime_error(
-"journal write failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
-"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
-);
+disk_error_abort("journal write", data->res, data->iov.iov_len);
}
auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)

View File

@@ -16,6 +16,7 @@
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
// writing more than can be stabilized afterwards
#define JOURNAL_STABILIZE_RESERVATION 65536
+#define JOURNAL_INSTANT_RESERVATION 131072
// Journal entries
// Journal entries are linked to each other by their crc32 value

View File

@@ -42,7 +42,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
// FIXME I've seen a bug here so I want some tests
int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
-uint32_t item_state, uint64_t item_version, uint64_t item_location)
+uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector)
{
uint32_t cur_start = item_start;
if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset)
@@ -72,6 +72,7 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
fulfill_read_t el = {
.offset = cur_start,
.len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
+.journal_sector = journal_sector,
};
it = PRIV(read_op)->read_vec.insert(it, el);
if (!fulfill_read_push(read_op,
@@ -138,7 +139,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
while (dirty_it->first.oid == read_op->oid)
{
dirty_entry& dirty = dirty_it->second;
-bool version_ok = read_op->version >= dirty_it->first.version;
+bool version_ok = !IS_IN_FLIGHT(dirty.state) && read_op->version >= dirty_it->first.version;
if (IS_SYNCED(dirty.state))
{
if (!version_ok && read_op->version != 0)
@@ -156,8 +157,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
+// If inmemory_journal is false, journal trim will have to wait until the read is completed
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
-dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
+dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset),
+(IS_JOURNAL(dirty.state) ? dirty.journal_sector+1 : 0)))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -171,7 +174,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
dirty_it--;
}
}
-if (clean_it != clean_db.end())
+if (clean_found)
{
if (!result_version)
{
@@ -186,7 +189,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
if (!dsk.clean_entry_bitmap_size)
{
-if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
+if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size,
+(BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location, 0))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -207,7 +211,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
-bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
+bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
@@ -218,7 +222,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
-clean_it->second.location + bmp_start * dsk.bitmap_granularity))
+clean_it->second.location + bmp_start * dsk.bitmap_granularity, 0))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -233,7 +237,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
else if (fulfilled < read_op->len)
{
// fill remaining parts with zeroes
-assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
+assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
}
assert(fulfilled == read_op->len);
read_op->version = result_version;
@@ -249,6 +253,15 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
FINISH_OP(read_op);
return 2;
}
+if (!journal.inmemory)
+{
+// Journal trim has to wait until the read is completed - record journal sector usage
+for (auto & rv: PRIV(read_op)->read_vec)
+{
+if (rv.journal_sector)
+journal.used_sectors[rv.journal_sector-1]++;
+}
+}
read_op->retval = 0;
return 2;
}
@@ -264,6 +277,22 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
}
if (PRIV(op)->pending_ops == 0)
{
+if (!journal.inmemory)
+{
+// Release journal sector usage
+for (auto & rv: PRIV(op)->read_vec)
+{
+if (rv.journal_sector)
+{
+auto used = --journal.used_sectors[rv.journal_sector-1];
+if (used == 0)
+{
+journal.used_sectors.erase(rv.journal_sector-1);
+flusher->mark_trim_possible();
+}
+}
+}
+}
if (op->retval == 0)
op->retval = op->len;
FINISH_OP(op);

View File

@@ -127,7 +127,6 @@ resume_4:
{
mark_rolled_back(*v);
}
-flusher->mark_trim_possible();
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
@@ -222,7 +221,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
#endif
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
}
-int used = --journal.used_sectors[dirty_it->second.journal_sector];
+auto used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", dirty_it->second.journal_sector,
@@ -232,6 +231,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
if (used == 0)
{
journal.used_sectors.erase(dirty_it->second.journal_sector);
+flusher->mark_trim_possible();
}
if (dsk.clean_entry_bitmap_size > sizeof(void*))
{

View File

@@ -89,6 +89,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
else
{
// Invalid version requested
+#ifdef BLOCKSTORE_DEBUG
+printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
+#endif
op->retval = -EEXIST;
if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
{
@@ -115,8 +118,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
else if (!wait_del)
printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
#endif
-// FIXME No strict need to add it into dirty_db here, it's just left
-// from the previous implementation where reads waited for writes
+// No strict need to add it into dirty_db here except maybe for listings to return
+// correct data when there are inflight operations in the queue
uint32_t state;
if (is_del)
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
@@ -139,7 +142,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
uint8_t *bmp_ptr = (uint8_t*)(dsk.clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
uint32_t bit = op->offset/dsk.bitmap_granularity;
uint32_t bits_left = op->len/dsk.bitmap_granularity;
-while (!(bit % 8) && bits_left > 8)
+while (!(bit % 8) && bits_left >= 8)
{
// Copy bytes
bmp_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8];
@@ -182,8 +185,15 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
bool found = false;
for (auto other_op: submit_queue)
{
-if (!found && other_op == op)
+if (!other_op)
+{
+// freed operations during submitting are zeroed
+}
+else if (other_op == op)
+{
+// <op> may be present in queue multiple times due to moving operations in submit_queue
found = true;
+}
else if (found && other_op->oid == op->oid &&
(other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
{
@@ -251,7 +261,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_write_count + 1,
-sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -260,12 +271,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (loc == UINT64_MAX)
{
// no space
-if (flusher->is_active())
-{
-// hope that some space will be available after flush
-PRIV(op)->wait_for = WAIT_FREE;
-return 0;
-}
cancel_all_writes(op, dirty_it, -ENOSPC);
return 2;
}
@@ -337,7 +342,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
!space_check.check_available(op, unsynced_big_write_count,
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
|| !space_check.check_available(op, 1,
-sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
+sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size,
+op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
{
return 0;
}
@@ -448,12 +454,19 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry
{
-BS_SUBMIT_CHECK_SQES(1);
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
+blockstore_journal_check_t space_check(this);
+if (!space_check.check_available(op, 1,
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
+((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
+{
+return 0;
+}
+BS_SUBMIT_CHECK_SQES(1);
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
@@ -585,10 +598,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
-throw std::runtime_error(
-"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
-"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
-);
+disk_error_abort("data write", data->res, data->iov.iov_len);
}
PRIV(op)->pending_ops--;
assert(PRIV(op)->pending_ops >= 0);
@@ -643,7 +653,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
});
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this);
-if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_STABILIZE_RESERVATION))
+if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
{
return 0;
}

View File

@@ -76,6 +76,12 @@ static const char* help_text =
"vitastor-cli alloc-osd\n"
" Allocate a new OSD number and reserve it by creating empty /osd/stats/<n> key.\n"
"\n"
"vitastor-cli rm-osd [--force] [--allow-data-loss] [--dry-run] <osd_id> [osd_id...]\n"
" Remove metadata and configuration for specified OSD(s) from etcd.\n"
" Refuses to remove OSDs with data without --force and --allow-data-loss.\n"
" With --dry-run only checks if deletion is possible without data loss and\n"
" redundancy degradation.\n"
"\n"
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
"\n"
"GLOBAL OPTIONS:\n"
@@ -95,43 +101,47 @@ static json11::Json::object parse_args(int narg, const char *args[])
cfg["progress"] = "1";
for (int i = 1; i < narg; i++)
{
-if (args[i][0] == '-' && args[i][1] == 'h')
+if (args[i][0] == '-' && args[i][1] == 'h' && args[i][2] == 0)
{
cfg["help"] = "1";
}
-else if (args[i][0] == '-' && args[i][1] == 'l')
+else if (args[i][0] == '-' && args[i][1] == 'l' && args[i][2] == 0)
{
cfg["long"] = "1";
}
-else if (args[i][0] == '-' && args[i][1] == 'n')
+else if (args[i][0] == '-' && args[i][1] == 'n' && args[i][2] == 0)
{
cfg["count"] = args[++i];
}
-else if (args[i][0] == '-' && args[i][1] == 'p')
+else if (args[i][0] == '-' && args[i][1] == 'p' && args[i][2] == 0)
{
cfg["pool"] = args[++i];
}
-else if (args[i][0] == '-' && args[i][1] == 's')
+else if (args[i][0] == '-' && args[i][1] == 's' && args[i][2] == 0)
{
cfg["size"] = args[++i];
}
-else if (args[i][0] == '-' && args[i][1] == 'r')
+else if (args[i][0] == '-' && args[i][1] == 'r' && args[i][2] == 0)
{
cfg["reverse"] = "1";
}
-else if (args[i][0] == '-' && args[i][1] == 'f')
+else if (args[i][0] == '-' && args[i][1] == 'f' && args[i][2] == 0)
{
cfg["force"] = "1";
}
else if (args[i][0] == '-' && args[i][1] == '-')
{
const char *opt = args[i]+2;
-cfg[opt] = i == narg-1 || !strcmp(opt, "json") || !strcmp(opt, "wait-list") ||
-!strcmp(opt, "long") || !strcmp(opt, "del") || !strcmp(opt, "no-color") ||
+cfg[opt] = i == narg-1 || !strcmp(opt, "json") ||
+!strcmp(opt, "wait-list") || !strcmp(opt, "wait_list") ||
+!strcmp(opt, "long") || !strcmp(opt, "del") ||
+!strcmp(opt, "no-color") || !strcmp(opt, "no_color") ||
!strcmp(opt, "readonly") || !strcmp(opt, "readwrite") ||
!strcmp(opt, "force") || !strcmp(opt, "reverse") ||
+!strcmp(opt, "allow-data-loss") || !strcmp(opt, "allow_data_loss") ||
+!strcmp(opt, "dry-run") || !strcmp(opt, "dry_run") ||
+!strcmp(opt, "help") || !strcmp(opt, "all") ||
-!strcmp(opt, "writers-stopped") && strcmp("1", args[i+1]) != 0
+(!strcmp(opt, "writers-stopped") || !strcmp(opt, "writers_stopped")) && strcmp("1", args[i+1]) != 0
? "1" : args[++i];
}
else
@@ -139,10 +149,6 @@ static json11::Json::object parse_args(int narg, const char *args[])
cmd.push_back(std::string(args[i]));
}
}
if (cfg["help"].bool_value())
{
print_help(help_text, "vitastor-cli", cmd.size() ? cmd[0].string_value() : "", cfg["all"].bool_value());
}
if (!cmd.size())
{
std::string exe(exe_name);
@@ -151,6 +157,10 @@ static json11::Json::object parse_args(int narg, const char *args[])
cmd.push_back("rm-data");
}
}
if (!cmd.size() || cfg["help"].bool_value())
{
print_help(help_text, "vitastor-cli", cmd.size() ? cmd[0].string_value() : "", cfg["all"].bool_value());
}
cfg["command"] = cmd;
return cfg;
}
@@ -225,6 +235,16 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
// Delete inode data
action_cb = p->start_rm_data(cfg);
}
else if (cmd[0] == "rm-osd")
{
// Delete OSD metadata from etcd
if (cmd.size() > 1)
{
cmd.erase(cmd.begin(), cmd.begin()+1);
cfg["osd_id"] = cmd;
}
action_cb = p->start_rm_osd(cfg);
}
else if (cmd[0] == "merge-data")
{
// Merge layer data without affecting metadata
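
The hunks above wire the new command into the CLI: the help text documents it and the dispatcher maps "rm-osd" to start_rm_osd() with the trailing arguments as osd_id. A hypothetical session sketch (OSD number made up, output omitted):

    # check whether OSD 5 can be removed without data loss or redundancy degradation
    vitastor-cli rm-osd --dry-run 5
    # actually remove it, accepting possible degradation
    vitastor-cli rm-osd --force 5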

View File

@@ -45,7 +45,7 @@ public:
cli_result_t etcd_err;
json11::Json etcd_result;
-void parse_config(json11::Json cfg);
+void parse_config(json11::Json::object & cfg);
void change_parent(inode_t cur, inode_t new_parent, cli_result_t *result);
inode_config_t* get_inode_cfg(const std::string & name);
@@ -64,6 +64,7 @@ public:
std::function<bool(cli_result_t &)> start_merge(json11::Json);
std::function<bool(cli_result_t &)> start_flatten(json11::Json);
std::function<bool(cli_result_t &)> start_rm(json11::Json);
+std::function<bool(cli_result_t &)> start_rm_osd(json11::Json cfg);
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
// Should be called like loop_and_wait(start_status(), <completion callback>)

View File

@@ -100,9 +100,20 @@ inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
return NULL;
}
-void cli_tool_t::parse_config(json11::Json cfg)
+void cli_tool_t::parse_config(json11::Json::object & cfg)
{
color = !cfg["no-color"].bool_value();
for (auto kv_it = cfg.begin(); kv_it != cfg.end();)
{
// Translate all options with - to _
if (kv_it->first.find("-") != std::string::npos)
{
cfg[str_replace(kv_it->first, "-", "_")] = kv_it->second;
cfg.erase(kv_it++);
}
else
kv_it++;
}
color = !cfg["no_color"].bool_value();
json_output = cfg["json"].bool_value();
iodepth = cfg["iodepth"].uint64_value();
if (!iodepth)
@@ -112,7 +123,7 @@ void cli_tool_t::parse_config(json11::Json cfg)
parallel_osds = 4;
log_level = cfg["log_level"].int64_value();
progress = cfg["progress"].uint64_value() ? true : false;
list_first = cfg["wait-list"].uint64_value() ? true : false;
list_first = cfg["wait_list"].uint64_value() ? true : false;
}
struct cli_result_looper_t
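
Because parse_config() now rewrites every dashed key to its underscore form before reading it, both option spellings reach the same setting; the later hunks in this changeset rely on that by reading only the underscore names. A sketch (image name hypothetical):

    # equivalent after the dash-to-underscore translation
    vitastor-cli flatten --fsync-interval 16 testimg
    vitastor-cli flatten --fsync_interval 16 testimg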

View File

@@ -517,7 +517,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
image_creator->force_size = cfg["force_size"].bool_value();
if (cfg["image_meta"].is_object())
{
image_creator->new_meta = cfg["image-meta"];
image_creator->new_meta = cfg["image_meta"];
}
if (cfg["snapshot"].string_value() != "")
{

View File

@@ -121,8 +121,7 @@ resume_1:
}
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{
-uint64_t pg_real_size = pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
-pool_avail = pg_real_size > 0 ? pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pg_real_size : 0;
+pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
pool_stats[pool_cfg.id] = json11::Json::object {
{ "name", pool_cfg.name },

View File

@@ -133,7 +133,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_flatten(json11::Json cfg)
auto flattener = new snap_flattener_t();
flattener->parent = this;
flattener->target_name = cfg["image"].string_value();
flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
flattener->fsync_interval = cfg["fsync_interval"].uint64_value();
if (!flattener->fsync_interval)
flattener->fsync_interval = 128;
if (!cfg["cas"].is_null())

View File

@@ -403,7 +403,7 @@ struct snap_merger_t
op->opcode = OSD_OP_READ_BITMAP;
op->inode = target;
op->offset = offset;
-op->len = 0;
+op->len = target_block_size;
op->callback = [this](cluster_op_t *op)
{
if (op->retval < 0)
@@ -631,8 +631,8 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
merger->from_name = cfg["from"].string_value();
merger->to_name = cfg["to"].string_value();
merger->target_name = cfg["target"].string_value();
merger->delete_source = cfg["delete-source"].string_value() != "";
merger->fsync_interval = cfg["fsync-interval"].uint64_value();
merger->delete_source = cfg["delete_source"].string_value() != "";
merger->fsync_interval = cfg["fsync_interval"].uint64_value();
if (!merger->fsync_interval)
merger->fsync_interval = 128;
if (!cfg["cas"].is_null())

View File

@@ -236,7 +236,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
changer->force = cfg["force"].bool_value();
changer->set_readonly = cfg["readonly"].bool_value();
changer->set_readwrite = cfg["readwrite"].bool_value();
changer->fsync_interval = cfg["fsync-interval"].uint64_value();
changer->fsync_interval = cfg["fsync_interval"].uint64_value();
if (!changer->fsync_interval)
changer->fsync_interval = 128;
// FIXME Check that the image doesn't have children when shrinking

View File

@@ -639,7 +639,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
snap_remover->parent = this;
snap_remover->from_name = cfg["from"].string_value();
snap_remover->to_name = cfg["to"].string_value();
snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
snap_remover->fsync_interval = cfg["fsync_interval"].uint64_value();
if (!snap_remover->fsync_interval)
snap_remover->fsync_interval = 128;
if (!cfg["cas"].is_null())

View File

@@ -92,6 +92,7 @@ struct rm_inode_t
void send_ops(rm_pg_t *cur_list)
{
+parent->cli->init_msgr();
if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
parent->cli->msgr.osd_peer_fds.end())
{
@@ -218,7 +219,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
}
remover->pool_id = INODE_POOL(remover->inode);
remover->min_offset = cfg["min-offset"].uint64_value();
remover->min_offset = cfg["min_offset"].uint64_value();
return [remover](cli_result_t & result)
{
remover->loop();

src/cli_rm_osd.cpp (new file, 491 lines)
View File

@@ -0,0 +1,491 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <ctype.h>
#include "cli.h"
#include "cluster_client.h"
#include "str_util.h"
#include "epoll_manager.h"
#include <algorithm>
// Delete OSD metadata from etcd
struct rm_osd_t
{
cli_tool_t *parent;
bool dry_run, force_warning, force_dataloss;
uint64_t etcd_tx_retry_ms = 500;
uint64_t etcd_tx_retries = 10000;
std::vector<uint64_t> osd_ids;
int state = 0;
cli_result_t result;
std::set<uint64_t> to_remove;
std::set<uint64_t> to_restart;
json11::Json::array pool_effects;
json11::Json::array history_updates, history_checks;
json11::Json new_pgs, new_clean_pgs;
uint64_t new_pgs_mod_rev, new_clean_pgs_mod_rev;
uint64_t cur_retry = 0;
uint64_t retry_wait = 0;
bool is_warning, is_dataloss;
bool is_done()
{
return state == 100;
}
void loop()
{
if (state == 1)
goto resume_1;
else if (state == 2)
goto resume_2;
else if (state == 3)
goto resume_3;
else if (state == 4)
goto resume_4;
if (!osd_ids.size())
{
result = (cli_result_t){ .err = EINVAL, .text = "OSD numbers are not specified" };
state = 100;
return;
}
for (auto osd_id: osd_ids)
{
if (!osd_id)
{
result = (cli_result_t){ .err = EINVAL, .text = "OSD number can't be zero" };
state = 100;
return;
}
to_remove.insert(osd_id);
}
// Check if OSDs are still used in data distribution
is_warning = is_dataloss = false;
for (auto & pp: parent->cli->st_cli.pool_config)
{
// Will OSD deletion make pool incomplete / down / degraded?
bool pool_incomplete = false, pool_down = false, pool_degraded = false;
bool hist_incomplete = false, hist_degraded = false;
auto & pool_cfg = pp.second;
uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
for (auto & pgp: pool_cfg.pg_config)
{
auto & pg_cfg = pgp.second;
int pg_cursize = 0, pg_rm = 0;
for (auto pg_osd: pg_cfg.target_set)
{
if (pg_osd != 0)
{
pg_cursize++;
if (to_remove.find(pg_osd) != to_remove.end())
pg_rm++;
}
}
for (auto & hist_item: pg_cfg.target_history)
{
int hist_size = 0, hist_rm = 0;
for (auto & old_osd: hist_item)
{
if (old_osd != 0)
{
hist_size++;
if (to_remove.find(old_osd) != to_remove.end())
hist_rm++;
}
}
if (hist_rm > 0)
{
hist_degraded = true;
if (hist_size-hist_rm == 0)
pool_incomplete = true;
else if (hist_size-hist_rm < pg_data_size)
hist_incomplete = true;
}
}
if (pg_rm > 0)
{
pool_degraded = true;
if (pg_cursize-pg_rm < pg_data_size)
pool_incomplete = true;
else if (pg_cursize-pg_rm < pool_cfg.pg_minsize)
pool_down = true;
}
}
if (pool_incomplete || pool_down || pool_degraded || hist_incomplete || hist_degraded)
{
pool_effects.push_back(json11::Json::object {
{ "pool_id", (uint64_t)pool_cfg.id },
{ "pool_name", pool_cfg.name },
{ "effect", (pool_incomplete
? "incomplete"
: (hist_incomplete
? "has_incomplete"
: (pool_down
? "offline"
: (pool_degraded
? "degraded"
: (hist_degraded ? "has_degraded" : "?")
)
)
)
) },
});
is_warning = true;
if (pool_incomplete || hist_incomplete)
is_dataloss = true;
}
}
result.data = json11::Json::object {
{ "osd_ids", osd_ids },
{ "pool_errors", pool_effects },
};
if (is_dataloss || is_warning || dry_run)
{
std::string error;
for (auto & e: pool_effects)
{
error += "Pool "+e["pool_name"].string_value()+" (ID "+e["pool_id"].as_string()+") will have "+(
e["effect"] == "has_incomplete"
? std::string("INCOMPLETE objects (DATA LOSS)")
: (e["effect"] == "incomplete"
? std::string("INCOMPLETE PGs (DATA LOSS)")
: (e["effect"] == "has_degraded"
? std::string("DEGRADED objects")
: strtoupper(e["effect"].string_value())+" PGs"))
)+" after deleting OSD(s).\n";
}
if (is_dataloss && !force_dataloss && !dry_run)
error += "OSDs not deleted. Please move data to other OSDs or bypass this check with --allow-data-loss if you know what you are doing.\n";
else if (is_warning && !force_warning && !dry_run)
error += "OSDs not deleted. Please move data to other OSDs or bypass this check with --force if you know what you are doing.\n";
else if (!is_dataloss && !is_warning && dry_run)
error += "OSDs can be deleted without data loss.\n";
result.text = error;
if (dry_run || is_dataloss && !force_dataloss || is_warning && !force_warning)
{
result.err = is_dataloss && !force_dataloss || is_warning && !force_warning ? EBUSY : 0;
state = 100;
return;
}
}
parent->etcd_txn(json11::Json::object { { "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/config/pgs"
) },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs"
) },
} },
},
} } });
resume_4:
state = 4;
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
{
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
new_pgs = remove_osds_from_pgs(kv);
new_pgs_mod_rev = kv.mod_revision;
kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]);
new_clean_pgs = remove_osds_from_pgs(kv);
new_clean_pgs_mod_rev = kv.mod_revision;
}
// Remove keys from etcd
{
json11::Json::array rm_items, rm_checks;
for (auto osd_id: osd_ids)
{
rm_items.push_back("/config/osd/"+std::to_string(osd_id));
rm_items.push_back("/osd/stats/"+std::to_string(osd_id));
rm_items.push_back("/osd/state/"+std::to_string(osd_id));
rm_items.push_back("/osd/inodestats/"+std::to_string(osd_id));
rm_items.push_back("/osd/space/"+std::to_string(osd_id));
}
for (int i = 0; i < rm_items.size(); i++)
{
rm_items[i] = json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+rm_items[i].string_value()
) },
} },
};
}
if (!new_pgs.is_null())
{
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pgs");
rm_items.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", pgs_key },
{ "value", base64_encode(new_pgs.dump()) },
} },
});
rm_checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", pgs_key },
{ "result", "LESS" },
{ "mod_revision", new_pgs_mod_rev+1 },
});
}
if (!new_clean_pgs.is_null())
{
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/history/last_clean_pgs");
rm_items.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", pgs_key },
{ "value", base64_encode(new_clean_pgs.dump()) },
} },
});
rm_checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", pgs_key },
{ "result", "LESS" },
{ "mod_revision", new_clean_pgs_mod_rev+1 },
});
}
parent->etcd_txn(json11::Json::object { { "success", rm_items }, { "checks", rm_checks } });
}
resume_1:
state = 1;
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
// Remove old OSD from PG all_peers to prevent left_on_dead and from
// target_history to prevent INCOMPLETE if --allow-data-loss is specified
for (auto & rsp: parent->etcd_result["responses"].array_items())
{
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
{
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
if (!retry_wait)
retry_wait = 1000;
retry_wait += etcd_tx_retry_ms;
}
}
while (1)
{
resume_2:
if (!remove_osds_from_history(2))
return;
resume_3:
state = 3;
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
if (parent->etcd_result["succeeded"].bool_value())
break;
if ((++cur_retry) >= etcd_tx_retries)
{
result.err = EAGAIN;
result.text += "Failed to remove OSDs from PG history due to update conflicts."
" Some PGs may remain left_on_dead or incomplete. Please retry later\n";
state = 100;
return;
}
retry_wait = etcd_tx_retry_ms;
}
std::string ids = "";
for (auto osd_id: osd_ids)
{
ids += (ids.size() ? ", " : "")+std::to_string(osd_id);
}
ids = (osd_ids.size() > 1 ? "OSDs " : "OSD ")+ids+(osd_ids.size() > 1 ? " are" : " is")+" removed from etcd";
state = 100;
result.text = (result.text != "" ? ids+"\n"+result.text : ids);
result.err = 0;
}
json11::Json remove_osds_from_pgs(const etcd_kv_t & kv)
{
if (kv.value.is_null())
{
return kv.value;
}
json11::Json::object new_pgs;
for (auto & pp: kv.value["items"].object_items())
{
if (pp.second.is_object())
{
json11::Json::object new_pool;
for (auto & pgp: pp.second.object_items())
{
json11::Json::array osd_set;
for (auto & osd_json: pgp.second["osd_set"].array_items())
{
uint64_t osd_num = osd_json.uint64_value();
osd_set.push_back(osd_num == 0 || to_remove.find(osd_num) != to_remove.end() ? 0 : osd_num);
}
json11::Json::object new_pg = pgp.second.object_items();
new_pg["osd_set"] = osd_set;
new_pool[pgp.first] = new_pg;
}
new_pgs[pp.first] = new_pool;
}
else
new_pgs[pp.first] = pp.second;
}
auto res = kv.value.object_items();
res["items"] = new_pgs;
return res;
}
bool remove_osds_from_history(int base_state)
{
if (state == base_state+0)
goto resume_0;
history_updates.clear();
history_checks.clear();
for (auto & pp: parent->cli->st_cli.pool_config)
{
bool update_pg_history = false;
auto & pool_cfg = pp.second;
for (auto & pgp: pool_cfg.pg_config)
{
auto pg_num = pgp.first;
auto & pg_cfg = pgp.second;
for (int i = 0; i < pg_cfg.all_peers.size(); i++)
{
if (to_remove.find(pg_cfg.all_peers[i]) != to_remove.end())
{
update_pg_history = true;
pg_cfg.all_peers.erase(pg_cfg.all_peers.begin()+i, pg_cfg.all_peers.begin()+i+1);
i--;
}
}
for (int i = 0; i < pg_cfg.target_history.size(); i++)
{
int hist_size = 0, hist_rm = 0;
for (auto & old_osd: pg_cfg.target_history[i])
{
if (old_osd != 0)
{
hist_size++;
if (to_remove.find(old_osd) != to_remove.end())
{
hist_rm++;
old_osd = 0;
}
}
}
if (hist_rm > 0)
{
if (hist_size-hist_rm == 0)
{
pg_cfg.target_history.erase(pg_cfg.target_history.begin()+i, pg_cfg.target_history.begin()+i+1);
i--;
}
update_pg_history = true;
}
}
if (update_pg_history)
{
std::string history_key = base64_encode(
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
);
history_updates.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", history_key },
{ "value", base64_encode(json11::Json(json11::Json::object {
{ "epoch", pg_cfg.epoch },
{ "all_peers", pg_cfg.all_peers },
{ "osd_sets", pg_cfg.target_history },
}).dump()) },
} },
});
history_checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", history_key },
{ "result", "LESS" },
{ "mod_revision", parent->cli->st_cli.etcd_watch_revision+1 },
});
}
}
}
if (history_updates.size())
{
if (retry_wait)
{
parent->waiting++;
parent->epmgr->tfd->set_timer(retry_wait, false, [this](int timer_id)
{
parent->waiting--;
parent->ringloop->wakeup();
});
resume_0:
state = base_state+0;
if (parent->waiting > 0)
return false;
}
parent->etcd_txn(json11::Json::object {
{ "success", history_updates },
{ "compare", history_checks },
});
}
else
parent->etcd_result = json11::Json::object{ { "succeeded", true } };
return true;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_rm_osd(json11::Json cfg)
{
auto rm_osd = new rm_osd_t();
rm_osd->parent = this;
rm_osd->dry_run = cfg["dry_run"].bool_value();
rm_osd->force_dataloss = cfg["allow_data_loss"].bool_value();
rm_osd->force_warning = rm_osd->force_dataloss || cfg["force"].bool_value();
if (!cfg["etcd_tx_retries"].is_null())
rm_osd->etcd_tx_retries = cfg["etcd_tx_retries"].uint64_value();
if (!cfg["etcd_tx_retry_ms"].is_null())
{
rm_osd->etcd_tx_retry_ms = cfg["etcd_tx_retry_ms"].uint64_value();
if (rm_osd->etcd_tx_retry_ms < 100)
rm_osd->etcd_tx_retry_ms = 100;
}
if (cfg["osd_id"].is_number() || cfg["osd_id"].is_string())
rm_osd->osd_ids.push_back(cfg["osd_id"].uint64_value());
else
{
for (auto & id: cfg["osd_id"].array_items())
rm_osd->osd_ids.push_back(id.uint64_value());
}
return [rm_osd](cli_result_t & result)
{
rm_osd->loop();
if (rm_osd->is_done())
{
result = rm_osd->result;
delete rm_osd;
return true;
}
return false;
};
}
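
For reference, the per-OSD keys that rm_osd_t deletes can be checked by hand after a removal; a sketch assuming the default /vitastor etcd prefix and an etcdctl built for the v3 API (OSD number hypothetical):

    # all of these should return nothing once rm-osd has succeeded
    etcdctl get /vitastor/config/osd/5
    etcdctl get /vitastor/osd/stats/5
    etcdctl get /vitastor/osd/state/5
    etcdctl get /vitastor/osd/inodestats/5
    etcdctl get /vitastor/osd/space/5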

View File

@@ -59,7 +59,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
delete op;
};
msgr.parse_config(this->config);
-msgr.init();
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@@ -73,17 +72,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
scrap_buffer_size = SCRAP_BUFFER_SIZE;
scrap_buffer = malloc_or_die(scrap_buffer_size);
-if (ringloop)
-{
-consumer.loop = [this]()
-{
-msgr.read_requests();
-msgr.send_replies();
-this->ringloop->submit();
-};
-ringloop->register_consumer(&consumer);
-}
}
cluster_client_t::~cluster_client_t()
@@ -115,6 +103,24 @@ cluster_op_t::~cluster_op_t()
}
}
+void cluster_client_t::init_msgr()
+{
+if (msgr_initialized)
+return;
+msgr.init();
+msgr_initialized = true;
+if (ringloop)
+{
+consumer.loop = [this]()
+{
+msgr.read_requests();
+msgr.send_replies();
+this->ringloop->submit();
+};
+ringloop->register_consumer(&consumer);
+}
+}
void cluster_client_t::calc_wait(cluster_op_t *op)
{
op->prev_wait = 0;
@@ -143,7 +149,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
if (!op->prev_wait)
continue_sync(op);
}
-else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
+else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) */
{
for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
{
@@ -151,7 +157,8 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
{
op->prev_wait++;
}
-else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
+else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ ||
+prev->opcode == OSD_OP_READ_BITMAP || prev->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
// Flushes are always in the beginning (we're scanning from the beginning of the queue)
break;
@@ -171,7 +178,8 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC && !(flags & OP_IMMEDIATE_COMMIT) ||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
-(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
+(next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP ||
+next->opcode == OSD_OP_READ_CHAIN_BITMAP) && (flags & OP_FLUSH_BUFFER))
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
@@ -221,9 +229,14 @@ void cluster_client_t::erase_op(cluster_op_t *op)
if (op_queue_tail == op)
op_queue_tail = op->prev;
op->next = op->prev = NULL;
-std::function<void(cluster_op_t*)>(op->callback)(op);
+if (flags & OP_FLUSH_BUFFER)
+std::function<void(cluster_op_t*)>(op->callback)(op);
if (!(flags & OP_IMMEDIATE_COMMIT))
inc_wait(opcode, flags, next, -1);
+// Call callback at the end to avoid inconsistencies in prev_wait
+// if the callback adds more operations itself
+if (!(flags & OP_FLUSH_BUFFER))
+std::function<void(cluster_op_t*)>(op->callback)(op);
}
void cluster_client_t::continue_ops(bool up_retry)
@@ -335,7 +348,8 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
// And now they have to be resliced!
for (auto op = op_queue_head; op; op = op->next)
{
-if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) &&
+if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ ||
+op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP) &&
INODE_POOL(op->cur_inode) == pool_item.first)
{
op->needs_reslice = true;
@@ -407,7 +421,7 @@ void cluster_client_t::on_ready(std::function<void(void)> fn)
void cluster_client_t::execute(cluster_op_t *op)
{
if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ &&
-op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_WRITE)
+op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_WRITE)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
@@ -439,7 +453,7 @@ void cluster_client_t::execute(cluster_op_t *op)
return;
}
// Check alignment
if ((op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && !op->len ||
if (!op->len && (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP || op->opcode == OSD_OP_WRITE) ||
op->offset % pool_it->second.bitmap_granularity || op->len % pool_it->second.bitmap_granularity)
{
op->retval = -EINVAL;
@@ -700,8 +714,7 @@ resume_3:
// Finished successfully
// Even if the PG count has changed in meanwhile we treat it as success
// because if some operations were invalid for the new PG count we'd get errors
bool is_read = op->opcode == OSD_OP_READ;
if (is_read)
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
// Check parent inode
auto ino_it = st_cli.inode_config.find(op->cur_inode);
@@ -725,6 +738,11 @@ resume_3:
}
}
op->retval = op->len;
if (op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
op->retval = op->len / pool_cfg.bitmap_granularity;
}
erase_op(op);
return 1;
}
@@ -748,7 +766,10 @@ resume_3:
{
for (int i = 0; i < op->parts.size(); i++)
{
op->parts[i].flags = PART_RETRY;
if (!(op->parts[i].flags & PART_DONE))
{
op->parts[i].flags = PART_RETRY;
}
}
goto resume_2;
}
@@ -807,23 +828,19 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
uint64_t last_stripe = op->len > 0 ? ((op->offset + op->len - 1) / pg_block_size) * pg_block_size : first_stripe;
op->retval = 0;
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
// Allocate memory for the bitmap
unsigned object_bitmap_size = (((op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : op->len) / pool_cfg.bitmap_granularity + 7) / 8);
unsigned object_bitmap_size = ((op->len / pool_cfg.bitmap_granularity + 7) / 8);
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
unsigned bitmap_mem = object_bitmap_size + (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8 * pg_data_size) * op->parts.size();
if (op->bitmap_buf_size < bitmap_mem)
if (!op->bitmap_buf || op->bitmap_buf_size < bitmap_mem)
{
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
if (!op->bitmap_buf_size)
{
// First allocation
memset(op->bitmap_buf, 0, object_bitmap_size);
}
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size;
op->bitmap_buf_size = bitmap_mem;
}
memset(op->bitmap_buf, 0, bitmap_mem);
}
int iov_idx = 0;
size_t iov_pos = 0;
@@ -874,13 +891,14 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
if (end == begin)
op->done_count++;
}
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
else if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
{
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
}
op->parts[i].parent = op;
op->parts[i].offset = begin;
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
op->parts[i].len = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ||
op->opcode == OSD_OP_DELETE ? 0 : (uint32_t)(end - begin);
op->parts[i].pg_num = pg_num;
op->parts[i].osd_num = 0;
op->parts[i].flags = 0;
@@ -909,6 +927,10 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
bool cluster_client_t::try_send(cluster_op_t *op, int i)
{
if (!msgr_initialized)
{
init_msgr();
}
auto part = &op->parts[i];
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
@@ -927,7 +949,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
);
uint64_t meta_rev = 0;
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_DELETE)
if (op->opcode != OSD_OP_READ_BITMAP && op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->opcode != OSD_OP_DELETE)
{
auto ino_it = st_cli.inode_config.find(op->inode);
if (ino_it != st_cli.inode_config.end())
@@ -939,8 +961,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.req = { .rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
.id = next_op_id(),
.opcode = op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP ? OSD_OP_READ : op->opcode,
},
.inode = op->cur_inode,
.offset = part->offset,
@@ -948,8 +970,10 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.meta_revision = meta_rev,
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
} },
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL),
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP
? pg_bitmap_size : 0),
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
@@ -1067,7 +1091,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
.req = {
.hdr = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.id = next_op_id(),
.opcode = OSD_OP_SYNC,
},
},
@@ -1128,11 +1152,11 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
else
{
// OK
if (!(op->flags & OP_IMMEDIATE_COMMIT))
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE) && !(op->flags & OP_IMMEDIATE_COMMIT))
dirty_osds.insert(part->osd_num);
part->flags |= PART_DONE;
op->done_count++;
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP)
if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP || op->opcode == OSD_OP_READ_CHAIN_BITMAP)
{
copy_part_bitmap(op, part);
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
@@ -1156,7 +1180,12 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
);
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / pool_cfg.bitmap_granularity;
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / pool_cfg.bitmap_granularity;
uint32_t part_len = (op->opcode == OSD_OP_READ_BITMAP ? pg_block_size : part->op.req.rw.len) / pool_cfg.bitmap_granularity;
uint32_t op_len = op->len / pool_cfg.bitmap_granularity;
uint32_t part_len = pg_block_size/pool_cfg.bitmap_granularity - part_offset;
if (part_len > op_len-object_offset)
{
part_len = op_len-object_offset;
}
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
{
// Copy bytes
@@ -1179,5 +1208,5 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
uint64_t cluster_client_t::next_op_id()
{
return op_id++;
return msgr.next_subop_id++;
}
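A worked trace of the new part_len clamping in copy_part_bitmap(), with made-up sizes (128 KB PG block, 4 KB bitmap granularity, an op covering 40 bitmap bits that enters this part 8 bits into its PG block):

#include <cstdint>
#include <cassert>

int main()
{
    uint64_t pg_block_size = 128*1024, bitmap_granularity = 4096;
    uint64_t op_len = 160*1024;            // 40 bitmap bits in total
    uint32_t object_offset = 24;           // this part begins 24 bits into the op
    uint32_t part_offset = 8;              // and 8 bits into its PG block
    uint32_t op_len_bits = op_len / bitmap_granularity;                 // 40
    uint32_t part_len = pg_block_size/bitmap_granularity - part_offset; // 24 bits left in the block
    if (part_len > op_len_bits-object_offset)
        part_len = op_len_bits-object_offset;                           // clamped to the 16 bits left in the op
    assert(part_len == 16);
    return 0;
}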

View File

@@ -11,6 +11,7 @@
#define INODE_LIST_DONE 1
#define INODE_LIST_HAS_UNSTABLE 2
#define OSD_OP_READ_BITMAP OSD_OP_SEC_READ_BMP
#define OSD_OP_READ_CHAIN_BITMAP 0x102
#define OSD_OP_IGNORE_READONLY 0x08
@@ -30,7 +31,7 @@ struct cluster_op_part_t
struct cluster_op_t
{
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP
uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC, OSD_OP_DELETE, OSD_OP_READ_BITMAP, OSD_OP_READ_CHAIN_BITMAP
uint64_t inode;
uint64_t offset;
uint64_t len;
@@ -39,9 +40,13 @@ struct cluster_op_t
uint64_t version = 0;
// now only OSD_OP_IGNORE_READONLY is supported
uint64_t flags = 0;
// negative retval is an error number
// write and read return len on success
// sync and delete return 0 on success
// read_bitmap and read_chain_bitmap return the length of the bitmap in bits(!)
int retval;
osd_op_buf_list_t iov;
// READ and READ_BITMAP return the bitmap here
// READ, READ_BITMAP, READ_CHAIN_BITMAP return the bitmap here
void *bitmap_buf = NULL;
std::function<void(cluster_op_t*)> callback;
~cluster_op_t();
@@ -85,7 +90,6 @@ class cluster_client_t
int up_wait_retry_interval = 500; // ms
int retry_timeout_id = 0;
uint64_t op_id = 1;
std::vector<cluster_op_t*> offline_ops;
cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
std::map<object_id, cluster_buffer_t> dirty_buffers;
@@ -100,10 +104,14 @@ class cluster_client_t
std::vector<std::function<void(void)>> on_ready_hooks;
std::vector<inode_list_t*> lists;
int continuing_ops = 0;
bool msgr_initialized = false;
public:
etcd_state_client_t st_cli;
osd_messenger_t msgr;
void init_msgr();
json11::Json config;
json11::Json::object merged_config;
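A minimal usage sketch of the extended client API; my_inode, cli and the callback body are illustrative, not repository code:

// Query the chain bitmap of a 1 MB region. On success retval is the bitmap
// length in bits, i.e. len / bitmap_granularity (see the comment above).
cluster_op_t *op = new cluster_op_t;
op->opcode = OSD_OP_READ_CHAIN_BITMAP;
op->inode = my_inode;   // illustrative inode number (pool ID in the high bits)
op->offset = 0;
op->len = 1048576;
op->callback = [](cluster_op_t *op)
{
    if (op->retval >= 0)
    {
        // op->bitmap_buf holds op->retval bits, one per granularity unit
        uint8_t *bmp = (uint8_t*)op->bitmap_buf;
        // ... inspect bits, then dispose of the op
    }
    delete op;
};
cli->execute(op);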

View File

@@ -196,7 +196,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
.sec_list = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.id = next_op_id(),
.opcode = OSD_OP_SEC_LIST,
},
.list_pg = cur_list->pg->pg_num,

View File

@@ -52,11 +52,12 @@ static const char *help_text =
" --disable_data_fsync 0 Disable data device cache and fsync (default off)\n"
" --disable_meta_fsync 0 Disable metadata device cache and fsync (default off)\n"
" --disable_journal_fsync 0 Disable journal device cache and fsync (default off)\n"
" --hdd Enable HDD defaults (1M block, 1G journal, throttling)\n"
" --force Bypass partition safety checks (for emptiness and so on)\n"
" \n"
" Options (both modes):\n"
" --journal_size 1G/32M Set journal size (area or partition size)\n"
" --block_size 1M/128k Set blockstore object size\n"
" --journal_size 32M/1G Set journal size (area or partition size)\n"
" --block_size 128k/1M Set blockstore object size\n"
" --bitmap_granularity 4k Set bitmap granularity\n"
" --data_device_block 4k Override data device block size\n"
" --meta_device_block 4k Override metadata device block size\n"
@@ -109,8 +110,16 @@ static const char *help_text =
" Commands are passed to systemctl with vitastor-osd@<num> units as arguments.\n"
" When --now is added to enable/disable, OSDs are also immediately started/stopped.\n"
"\n"
"vitastor-disk read-sb <device>\n"
"vitastor-disk purge [--force] [--allow-data-loss] <device> [device2 device3 ...]\n"
" Purge Vitastor OSD(s) on specified device(s). Uses vitastor-cli rm-osd to check\n"
" if deletion is possible without data loss and to actually remove metadata from etcd.\n"
" --force and --allow-data-loss options may be used to ignore safety check results.\n"
" \n"
" Requires `vitastor-cli`, `sfdisk` and `partprobe` (from parted) utilities.\n"
"\n"
"vitastor-disk read-sb [--force] <device>\n"
" Try to read Vitastor OSD superblock from <device> and print it in JSON format.\n"
" --force allows to ignore validation errors.\n"
"\n"
"vitastor-disk write-sb <device>\n"
" Read JSON from STDIN and write it into Vitastor OSD superblock on <device>.\n"
@@ -195,6 +204,10 @@ int main(int argc, char *argv[])
{
self.options["hybrid"] = "1";
}
else if (!strcmp(argv[i], "--hdd"))
{
self.options["hdd"] = "1";
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
cmd.insert(cmd.begin(), (char*)"help");
@@ -207,6 +220,10 @@ int main(int argc, char *argv[])
{
self.options["force"] = "1";
}
else if (!strcmp(argv[i], "--allow-data-loss"))
{
self.options["allow_data_loss"] = "1";
}
else if (argv[i][0] == '-' && argv[i][1] == '-')
{
char *key = argv[i]+2;
@@ -339,6 +356,10 @@ int main(int argc, char *argv[])
}
return self.systemd_start_stop_osds(systemd_cmd, std::vector<std::string>(cmd.begin()+1, cmd.end()));
}
else if (!strcmp(cmd[0], "purge"))
{
return self.purge_devices(std::vector<std::string>(cmd.begin()+1, cmd.end()));
}
else if (!strcmp(cmd[0], "exec-osd"))
{
if (cmd.size() != 2)

View File

@@ -56,7 +56,7 @@ struct disk_tool_t
uint64_t meta_pos;
uint64_t journal_pos, journal_calc_data_pos;
bool first, first2;
bool first_block, first_entry;
allocator *data_alloc;
std::map<uint64_t, uint64_t> data_remap;
@@ -108,10 +108,11 @@ struct disk_tool_t
int read_sb(std::string device);
int write_sb(std::string device);
int exec_osd(std::string device);
int systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices);
int systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices);
int pre_exec_osd(std::string device);
int purge_devices(const std::vector<std::string> & devices);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true, bool ignore_nonref = false);
uint32_t write_osd_superblock(std::string device, json11::Json params);
int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);
@@ -139,3 +140,4 @@ int write_zero(int fd, uint64_t offset, uint64_t size);
json11::Json read_parttable(std::string dev);
uint64_t dev_size_from_parttable(json11::Json pt);
uint64_t free_from_parttable(json11::Json pt);
int fix_partition_type(std::string dev_by_uuid);

View File

@@ -13,7 +13,7 @@ int disk_tool_t::dump_journal()
fprintf(stderr, "Invalid journal block size\n");
return 1;
}
first = true;
first_block = true;
if (json)
printf("[\n");
if (all)
@@ -38,8 +38,8 @@ int disk_tool_t::dump_journal()
}
if (json)
{
printf("%s{\"offset\":\"0x%lx\"", first ? "" : ",\n", journal_pos);
first = false;
printf("%s{\"offset\":\"0x%lx\"", first_block ? "" : ",\n", journal_pos);
first_block = false;
}
if (s == dsk.journal_block_size)
{
@@ -55,10 +55,10 @@ int disk_tool_t::dump_journal()
printf("offset %08lx:\n", journal_pos);
else
printf(",\"entries\":[\n");
first2 = true;
first_entry = true;
process_journal_block(journal_buf, [this](int num, journal_entry *je) { dump_journal_entry(num, je, json); });
if (json)
printf(first2 ? "]}" : "\n]}");
printf(first_entry ? "]}" : "\n]}");
}
else
{
@@ -75,34 +75,30 @@ int disk_tool_t::dump_journal()
}
else
{
first_entry = true;
process_journal([this](void *data)
{
first2 = true;
if (json && dump_with_blocks)
first_entry = true;
if (!json)
printf("offset %08lx:\n", journal_pos);
auto pos = journal_pos;
int r = process_journal_block(data, [this, pos](int num, journal_entry *je)
{
if (json && first2)
{
if (dump_with_blocks)
printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first ? "" : ",\n", pos);
first = false;
}
if (json && dump_with_blocks && first_entry)
printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first_block ? "" : ",\n", pos);
dump_journal_entry(num, je, json);
first_block = false;
});
if (json)
{
if (dump_with_blocks && !first2)
printf("\n]}");
}
else if (r <= 0)
if (json && dump_with_blocks && !first_entry)
printf("\n]}");
else if (!json && r <= 0)
printf("end of the journal\n");
return r;
});
}
if (json)
printf(first ? "]\n" : "\n]\n");
printf(first_block ? "]\n" : "\n]\n");
return 0;
}
@@ -209,9 +205,9 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
{
if (json)
{
if (!first2)
if (!first_entry)
printf(",\n");
first2 = false;
first_entry = false;
printf(
"{\"crc32\":\"%08x\",\"valid\":%s,\"crc32_prev\":\"%08x\"",
je->crc32, (je_crc32(je) == je->crc32 ? "true" : "false"), je->crc32_prev
@@ -275,10 +271,12 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
{
printf(
json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"loc\":\"0x%lx\""
: "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx",
json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
: "je_big_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
je->big_write.oid.inode, je->big_write.oid.stripe,
je->big_write.version, je->big_write.offset, je->big_write.len,
je->big_write.location
);
if (je->big_write.size > sizeof(journal_entry_big_write))
{
@@ -424,6 +422,8 @@ int disk_tool_t::write_json_journal(json11::Json entries)
.stripe = sscanf_json(NULL, rec["stripe"]),
},
.version = rec["ver"].uint64_value(),
.offset = (uint32_t)rec["offset"].uint64_value(),
.len = (uint32_t)rec["len"].uint64_value(),
.location = sscanf_json(NULL, rec["loc"]),
};
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_big_write));

View File

@@ -124,14 +124,14 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v1_t *hdr)
{
printf("{\"version\":\"0.5\",\"meta_block_size\":%lu,\"entries\":[\n", dsk.meta_block_size);
}
first = true;
first_entry = true;
}
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu"
(first ? ENTRY_FMT : (",\n" ENTRY_FMT)),
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"version\":%lu"
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
entry->oid.stripe, entry->version
@@ -154,7 +154,7 @@ void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, u
{
printf("}");
}
first = false;
first_entry = false;
}
int disk_tool_t::write_json_meta(json11::Json meta)

View File

@@ -61,6 +61,11 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
return 1;
}
if (fix_partition_type(dev) != 0)
{
fprintf(stderr, "%s has incorrect type and we failed to change it to Vitastor type\n", dev.c_str());
return 1;
}
}
}
for (auto dev: std::vector<std::string>{"data", "meta", "journal"})
@@ -317,7 +322,8 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
{
script += "+ "+size+" "+std::string(VITASTOR_PART_TYPE)+"\n";
}
if (shell_exec({ "sfdisk", "--force", devinfo.path }, script, NULL, NULL) != 0)
std::string out;
if (shell_exec({ "sfdisk", "--no-reread", "--force", devinfo.path }, script, &out, NULL) != 0)
{
fprintf(stderr, "Failed to add %lu partition(s) with sfdisk\n", sizes.size());
return {};
@@ -351,7 +357,8 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
{
iter++;
// Run partprobe
if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", NULL, NULL)) != 0)
std::string out;
if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL)) != 0)
{
fprintf(
stderr, iter == 1 && r == 255
@@ -539,7 +546,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
return 1;
}
return prepare_one(options);
return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0);
}
if (!devices.size())
{
@@ -549,12 +556,12 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
options.erase("data_device");
options.erase("meta_device");
options.erase("journal_device");
bool hybrid = options.find("hybrid") != options.end();
auto devinfo = collect_devices(devices);
if (!devinfo.size())
{
return 1;
}
bool hybrid = options.find("hybrid") != options.end();
uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
if (!osd_per_disk)
osd_per_disk = 1;
@@ -612,7 +619,8 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
return 1;
}
}
prepare_one(options, dev.is_hdd ? 1 : 0);
// Treat all disks as SSDs when not in hybrid mode
prepare_one(options, hybrid && dev.is_hdd ? 1 : 0);
}
}
}

View File

@@ -5,6 +5,7 @@
#include "disk_tool.h"
#include "rw_blocking.h"
#include "str_util.h"
struct __attribute__((__packed__)) vitastor_disk_superblock_t
{
@@ -54,7 +55,7 @@ int disk_tool_t::udev_import(std::string device)
int disk_tool_t::read_sb(std::string device)
{
json11::Json sb = read_osd_superblock(device);
json11::Json sb = read_osd_superblock(device, true, options.find("force") != options.end());
if (sb.is_null())
{
return 1;
@@ -123,7 +124,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
return sb_size;
}
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist)
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist, bool ignore_errors)
{
vitastor_disk_superblock_t *sb = NULL;
uint8_t *buf = NULL;
@@ -144,7 +145,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
goto ex;
}
sb = (vitastor_disk_superblock_t*)buf;
if (sb->magic != VITASTOR_DISK_MAGIC)
if (sb->magic != VITASTOR_DISK_MAGIC && !ignore_errors)
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: magic number mismatch\n", device.c_str());
@@ -172,7 +173,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
}
sb = (vitastor_disk_superblock_t*)buf;
}
if (sb->crc32c != crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)))
if (sb->crc32c != crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)) && !ignore_errors)
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: crc32 mismatch\n", device.c_str());
@@ -186,14 +187,14 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
goto ex;
}
// Validate superblock
if (!osd_params["osd_num"].uint64_value())
if (!osd_params["osd_num"].uint64_value() && !ignore_errors)
{
if (expect_exist)
fprintf(stderr, "OSD superblock on %s lacks osd_num\n", device.c_str());
osd_params = json11::Json();
goto ex;
}
if (osd_params["data_device"].string_value() == "")
if (osd_params["data_device"].string_value() == "" && !ignore_errors)
{
if (expect_exist)
fprintf(stderr, "OSD superblock on %s lacks data_device\n", device.c_str());
@@ -226,7 +227,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
{
device_type = "journal";
}
else
else if (!ignore_errors)
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: does not refer to the device itself\n", device.c_str());
@@ -246,7 +247,7 @@ ex:
return osd_params;
}
int disk_tool_t::systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices)
int disk_tool_t::systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices)
{
if (!devices.size())
{
@@ -306,8 +307,7 @@ int disk_tool_t::exec_osd(std::string device)
argv[i] = (char*)argstr[i].c_str();
}
argv[argstr.size()] = NULL;
execvpe(osd_binary.c_str(), argv, environ);
return 0;
return execvpe(osd_binary.c_str(), argv, environ);
}
static int check_disabled_cache(std::string dev)
@@ -362,3 +362,140 @@ int disk_tool_t::pre_exec_osd(std::string device)
}
return 0;
}
int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
{
std::vector<uint64_t> osd_numbers;
json11::Json::array superblocks;
for (auto & device: devices)
{
json11::Json sb = read_osd_superblock(device);
if (!sb.is_null())
{
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
osd_numbers.push_back(osd_num);
superblocks.push_back(sb);
}
}
if (!osd_numbers.size())
{
return 0;
}
std::vector<std::string> rm_osd_cli = { "vitastor-cli", "rm-osd" };
for (auto osd_num: osd_numbers)
{
rm_osd_cli.push_back(std::to_string(osd_num));
}
// Check for data loss
if (options["force"] != "")
{
rm_osd_cli.push_back("--force");
}
else if (options["allow_data_loss"] != "")
{
rm_osd_cli.push_back("--allow-data-loss");
}
rm_osd_cli.push_back("--dry-run");
std::string dry_run_ignore_stdout;
if (shell_exec(rm_osd_cli, "", &dry_run_ignore_stdout, NULL) != 0)
{
return 1;
}
// Disable & stop OSDs
std::vector<std::string> systemctl_cli = { "systemctl", "disable", "--now" };
for (auto osd_num: osd_numbers)
{
systemctl_cli.push_back("vitastor-osd@"+std::to_string(osd_num));
}
if (shell_exec(systemctl_cli, "", NULL, NULL) != 0)
{
return 1;
}
// Remove OSD metadata
rm_osd_cli.pop_back();
if (shell_exec(rm_osd_cli, "", NULL, NULL) != 0)
{
return 1;
}
// Destroy OSD superblocks
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
for (auto & sb: superblocks)
{
for (auto dev_type: std::vector<std::string>{ "data", "meta", "journal" })
{
auto dev = sb["real_"+dev_type+"_device"].string_value();
if (dev != "")
{
int fd = -1, r = open(dev.c_str(), O_DIRECT|O_RDWR);
if (r >= 0)
{
fd = r;
r = read_blocking(fd, buf, 4096);
if (r == 4096)
{
// Clear magic and CRC
memset(buf, 0, 12);
r = lseek64(fd, 0, 0);
if (r == 0)
{
r = write_blocking(fd, buf, 4096);
if (r == 4096)
r = 0;
}
}
}
if (fd >= 0)
close(fd);
if (r != 0)
{
fprintf(stderr, "Failed to clear OSD %lu %s device %s superblock: %s\n",
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str(), strerror(errno));
}
else
{
fprintf(stderr, "OSD %lu %s device %s superblock cleared\n",
sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str());
}
if (sb["params"][dev_type+"_device"].string_value().substr(0, 22) == "/dev/disk/by-partuuid/")
{
// Delete the partition itself
auto uuid_to_del = strtolower(sb["params"][dev_type+"_device"].string_value().substr(22));
auto parent_dev = get_parent_device(dev);
if (parent_dev == "" || parent_dev == dev)
{
fprintf(stderr, "Failed to delete partition %s: failed to find parent device\n", dev.c_str());
continue;
}
auto pt = read_parttable("/dev/"+parent_dev);
if (!pt.is_object())
continue;
json11::Json::array newpt = pt["partitions"].array_items();
for (int i = 0; i < newpt.size(); i++)
{
if (strtolower(newpt[i]["uuid"].string_value()) == uuid_to_del)
{
auto old_part = newpt[i];
newpt.erase(newpt.begin()+i, newpt.begin()+i+1);
vitastor_dev_info_t devinfo = {
.path = "/dev/"+parent_dev,
.pt = json11::Json::object{ { "partitions", newpt } },
};
add_partitions(devinfo, {});
struct stat st;
if (stat(old_part["node"].string_value().c_str(), &st) == 0 ||
errno != ENOENT)
{
std::string out;
shell_exec({ "partprobe", "/dev/"+parent_dev }, "", &out, NULL);
}
break;
}
}
}
}
}
}
free(buf);
buf = NULL;
return 0;
}

View File

@@ -38,42 +38,6 @@ static std::map<std::string, std::string> read_vitastor_unit(std::string unit)
return r;
}
static int fix_partition_type(std::string dev_by_uuid)
{
auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
if (parent_dev == "")
return 1;
auto pt = read_parttable("/dev/"+parent_dev);
if (pt.is_null())
return 1;
std::string script = "label: gpt\n\n";
for (const auto & part: pt["partitions"].array_items())
{
bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
{
// Already correct type
return 0;
}
script += part["node"].string_value()+": ";
bool first = true;
for (const auto & kv: part.object_items())
{
if (kv.first != "node")
{
script += (first ? "" : ", ")+kv.first+"="+
(kv.first == "type" && this_part
? "e7009fac-a5a1-4d72-af72-53de13059903"
: (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
first = false;
}
}
script += "\n";
}
return shell_exec({ "sfdisk", "--no-reread", "--force", "/dev/"+parent_dev }, script, NULL, NULL);
}
int disk_tool_t::upgrade_simple_unit(std::string unit)
{
if (stoull_full(unit) != 0)

View File

@@ -145,10 +145,10 @@ int disable_cache(std::string dev)
closedir(dir);
// Check cache_type
scsi_disk += "/cache_type";
std::string cache_type = read_file(scsi_disk);
std::string cache_type = trim(read_file(scsi_disk));
if (cache_type == "")
return 1;
if (cache_type == "write back")
if (cache_type != "write through")
{
int fd = open(scsi_disk.c_str(), O_WRONLY);
if (fd < 0 || write_blocking(fd, (void*)"write through", strlen("write through")) != strlen("write through"))
@@ -239,7 +239,8 @@ int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std
{
// Child
dup2(child_stdin[0], 0);
dup2(child_stdout[1], 1);
if (out)
dup2(child_stdout[1], 1);
if (err)
dup2(child_stderr[1], 2);
close(child_stdin[0]);
@@ -250,9 +251,7 @@ int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std
close(child_stderr[1]);
char *argv[cmd.size()+1];
for (int i = 0; i < cmd.size(); i++)
{
argv[i] = (char*)cmd[i].c_str();
}
argv[cmd.size()] = NULL;
execvp(argv[0], argv);
std::string full_cmd;
@@ -306,10 +305,10 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
json11::Json read_parttable(std::string dev)
{
std::string part_dump;
int r = shell_exec({ "sfdisk", "--dump", dev, "--json" }, "", &part_dump, NULL);
int r = shell_exec({ "sfdisk", "--json", dev }, "", &part_dump, NULL);
if (r == 255)
{
fprintf(stderr, "Error running sfdisk --dump %s --json\n", dev.c_str());
fprintf(stderr, "Error running sfdisk --json %s\n", dev.c_str());
return json11::Json(false);
}
// Decode partition table
@@ -320,7 +319,7 @@ json11::Json read_parttable(std::string dev)
pt = json11::Json::parse(part_dump, err);
if (err != "")
{
fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
fprintf(stderr, "sfdisk --json %s returned bad JSON: %s\n", dev.c_str(), part_dump.c_str());
return json11::Json(false);
}
pt = pt["partitiontable"];
@@ -354,3 +353,40 @@ uint64_t free_from_parttable(json11::Json pt)
free *= pt["sectorsize"].uint64_value();
return free;
}
int fix_partition_type(std::string dev_by_uuid)
{
auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
if (parent_dev == "")
return 1;
auto pt = read_parttable("/dev/"+parent_dev);
if (pt.is_null() || pt.is_bool())
return 1;
std::string script = "label: gpt\n\n";
for (const auto & part: pt["partitions"].array_items())
{
bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
{
// Already correct type
return 0;
}
script += part["node"].string_value()+": ";
bool first = true;
for (const auto & kv: part.object_items())
{
if (kv.first != "node")
{
script += (first ? "" : ", ")+kv.first+"="+
(kv.first == "type" && this_part
? "e7009fac-a5a1-4d72-af72-53de13059903"
: (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
first = false;
}
}
script += "\n";
}
std::string out;
return shell_exec({ "sfdisk", "--no-reread", "--force", "/dev/"+parent_dev }, script, &out, NULL);
}
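For reference, the script assembled above is plain sfdisk dump syntax. For a two-partition disk where the second partition is being retyped it would look roughly like this (all values made up):

// Illustrative sfdisk input produced by fix_partition_type():
const char *example_script =
    "label: gpt\n\n"
    "/dev/sda1: start=2048, size=204800, type=0FC63DAF-8483-4772-8E79-3D69D8477DE4, uuid=0e3e98c9-...\n"
    "/dev/sda2: start=206848, size=409600, type=e7009fac-a5a1-4d72-af72-53de13059903, uuid=5c40d1f9-...\n";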

View File

@@ -871,19 +871,33 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
pg_cfg.target_history.clear();
pg_cfg.all_peers.clear();
// Refuse to start PG if any set of the <osd_sets> has no live OSDs
for (auto hist_item: value["osd_sets"].array_items())
for (auto & hist_item: value["osd_sets"].array_items())
{
std::vector<osd_num_t> history_set;
for (auto pg_osd: hist_item.array_items())
for (auto & pg_osd: hist_item.array_items())
{
history_set.push_back(pg_osd.uint64_value());
osd_num_t pg_osd_num = pg_osd.uint64_value();
if (pg_osd_num != 0)
{
auto it = std::lower_bound(history_set.begin(), history_set.end(), pg_osd_num);
if (it == history_set.end() || *it != pg_osd_num)
history_set.insert(it, pg_osd_num);
}
}
pg_cfg.target_history.push_back(history_set);
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
if (it == pg_cfg.target_history.end() || *it != history_set)
pg_cfg.target_history.insert(it, history_set);
}
// Include these additional OSDs when peering the PG
for (auto pg_osd: value["all_peers"].array_items())
{
pg_cfg.all_peers.push_back(pg_osd.uint64_value());
osd_num_t pg_osd_num = pg_osd.uint64_value();
if (pg_osd_num != 0)
{
auto it = std::lower_bound(pg_cfg.all_peers.begin(), pg_cfg.all_peers.end(), pg_osd_num);
if (it == pg_cfg.all_peers.end() || *it != pg_osd_num)
pg_cfg.all_peers.insert(it, pg_osd_num);
}
}
// Read epoch
pg_cfg.epoch = value["epoch"].uint64_value();
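The hunk above replaces plain push_back() with sorted, deduplicated insertion of OSD numbers; the pattern in isolation:

#include <algorithm>
#include <cstdint>
#include <vector>

// Insert a value into a sorted vector, skipping zeroes and duplicates -
// the same std::lower_bound pattern parse_state() now applies to
// all_peers and to every history set in osd_sets.
static void sorted_insert(std::vector<uint64_t> & v, uint64_t value)
{
    if (!value)
        return;
    auto it = std::lower_bound(v.begin(), v.end(), value);
    if (it == v.end() || *it != value)
        v.insert(it, value);
}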

src/freelist.cpp Normal file
View File

@@ -0,0 +1,63 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <assert.h>
#include "freelist.h"
uint64_t freelist_allocator_t::alloc(uint64_t data_size)
{
for (int i = 0; i < freelist.size(); i++)
{
if (freelist[i].size >= data_size)
{
uint64_t r = freelist[i].start;
freelist[i].start += data_size;
freelist[i].size -= data_size;
return r;
}
}
return UINT64_MAX;
}
void freelist_allocator_t::free(uint64_t start, uint64_t size)
{
int min = 0, max = freelist.size();
if (max && freelist[freelist.size()-1].start < start)
{
min = max;
}
if (max && freelist[0].start >= start)
{
max = 0;
}
while (max-min > 1)
{
int mid = (min+max)/2;
if (freelist[mid].start >= start)
max = mid;
else
min = mid;
}
// max = the first item where freelist[max].start >= start
if (max > 0 && freelist[max-1].start+freelist[max-1].size >= start)
{
assert(freelist[max-1].start+freelist[max-1].size == start);
freelist[max-1].size += size;
}
else if (max < freelist.size() && freelist[max].start <= size+start)
{
assert(freelist[max].start == size+start);
freelist[max].start -= size;
freelist[max].size += size;
}
else
{
// Insert at max - the first position whose start is >= the freed start
freelist.insert(freelist.begin()+max, (freelist_item_t){ .start = start, .size = size });
max = min; // to skip the if below
}
if (min != max && max < freelist.size() && freelist[max].start == freelist[min].start+freelist[min].size)
{
freelist[min].size += freelist[max].size;
freelist.erase(freelist.begin()+max, freelist.begin()+max+1);
}
}

src/freelist.h Normal file
View File

@@ -0,0 +1,23 @@
// Copyright (c) Vitaliy Filippov, 2023+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <stdint.h>
#include <vector>
struct freelist_item_t
{
uint64_t start, size;
};
// Really trivial freelist allocator
// Should be fine for remote RDMA memory management because
// most of the time fragmentation shouldn't be an issue as all
// memory regions are short-lived
struct freelist_allocator_t
{
std::vector<freelist_item_t> freelist;
uint64_t alloc(uint64_t data_size);
void free(uint64_t start, uint64_t size);
};
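A usage sketch for the new allocator; the asserted values follow directly from the implementation above:

#include <cassert>
#include "freelist.h"

int main()
{
    freelist_allocator_t a;
    a.free(0, 1024);                     // seed with one 1 KB region
    uint64_t x = a.alloc(100);           // takes [0, 100)
    uint64_t y = a.alloc(100);           // takes [100, 200)
    assert(x == 0 && y == 100);
    assert(a.alloc(2048) == UINT64_MAX); // larger than any free extent
    a.free(x, 100);
    a.free(y, 100);                      // adjacent extents coalesce
    assert(a.alloc(1024) == 0);          // the whole region is free again
    return 0;
}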

View File

@@ -80,12 +80,20 @@ void osd_messenger_t::init()
};
op->callback = [this, cl](osd_op_t *op)
{
auto cl_it = clients.find(op->peer_fd);
if (cl_it == clients.end() || cl_it->second != cl)
{
// client is already dropped
delete op;
return;
}
int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
auto fail_osd_num = cl->osd_num;
cl->ping_time_remaining = 0;
delete op;
if (fail_fd >= 0)
{
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", fail_osd_num, fail_fd);
stop_client(fail_fd, true);
}
};
@@ -149,13 +157,16 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_sge = 128;
this->rdma_max_send = config["rdma_max_send"].uint64_value();
if (!this->rdma_max_send)
this->rdma_max_send = 1;
this->rdma_max_send = 128;
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
if (!this->rdma_max_recv)
this->rdma_max_recv = 128;
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_op_slots = config["rdma_op_slots"].uint64_value();
if (!this->rdma_op_slots || this->rdma_op_slots >= 1024*1024)
this->rdma_op_slots = 4096;
this->rdma_op_memory = config["rdma_op_memory"].uint64_value();
if (!this->rdma_op_memory || this->rdma_op_memory >= 1024*1024*1024)
this->rdma_op_memory = 16*1024*1024;
#endif
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -380,12 +391,16 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
#ifdef WITH_RDMA
if (rdma_context)
{
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_op_slots, rdma_op_memory);
if (cl->rdma_conn)
{
clients_by_qp[cl->rdma_conn->qp->qp_num] = cl->peer_fd;
json11::Json payload = json11::Json::object {
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
{ "rdma_max_msg", cl->rdma_conn->max_msg },
{ "rdma_data_rkey", (uint64_t)cl->rdma_conn->in_data_mr->rkey },
{ "rdma_op_rkey", (uint64_t)cl->rdma_conn->in_op_mr->rkey },
{ "rdma_op_slots", cl->rdma_conn->op_slots },
{ "rdma_op_memory", cl->rdma_conn->op_memory },
};
std::string payload_str = payload.dump();
op->req.show_conf.json_len = payload_str.size();
@@ -445,12 +460,14 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
{
msgr_rdma_address_t addr;
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
config["rdma_op_memory"].uint64_value() == 0 ||
cl->rdma_conn->connect(&addr) != 0)
{
fprintf(
stderr, "Failed to connect to OSD %lu (address %s) using RDMA\n",
cl->osd_num, config["rdma_address"].string_value().c_str()
);
clients_by_qp.erase(cl->rdma_conn->qp->qp_num);
delete cl->rdma_conn;
cl->rdma_conn = NULL;
// FIXME: Keep TCP connection in this case
@@ -462,11 +479,12 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
}
else
{
uint64_t server_max_msg = config["rdma_max_msg"].uint64_value();
if (cl->rdma_conn->max_msg > server_max_msg)
{
cl->rdma_conn->max_msg = server_max_msg;
}
cl->rdma_conn->set_out_capacity(
config["rdma_data_rkey"].uint64_value(),
config["rdma_op_rkey"].uint64_value(),
config["rdma_op_slots"].uint64_value(),
config["rdma_op_memory"].uint64_value()
);
if (log_level > 0)
{
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);

View File

@@ -37,6 +37,7 @@
#define MSGR_SENDP_HDR 1
#define MSGR_SENDP_FREE 2
#define MSGR_SENDP_LAST 4
struct msgr_sendp_t
{
@@ -131,13 +132,15 @@ protected:
bool use_rdma = true;
std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;
uint64_t rdma_op_slots = 0, rdma_op_memory = 0;
msgr_rdma_context_t *rdma_context = NULL;
std::map<uint32_t, int> clients_by_qp;
#endif
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<std::function<void()>> set_immediate;
public:
@@ -169,7 +172,8 @@ public:
#ifdef WITH_RDMA
bool is_rdma_enabled();
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
bool connect_rdma(int peer_fd, std::string rdma_address,
uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory);
#endif
protected:
@@ -190,12 +194,13 @@ protected:
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
bool handle_finished_read(osd_client_t *cl);
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
bool handle_reply_hdr(void *reply_hdr, osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
#ifdef WITH_RDMA
bool try_send_rdma(osd_client_t *cl);
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
bool rdma_handle_op(osd_client_t *cl, uint32_t op_slot);
#endif
};

View File

@@ -46,9 +46,20 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
ctx->used_max_cqe -= max_send+max_recv;
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
for (auto b: recv_buffers)
free(b);
if (in_data_mr)
ibv_dereg_mr(in_data_mr);
if (in_op_mr)
ibv_dereg_mr(in_op_mr);
if (in_data_buf)
free(in_data_buf);
if (in_ops)
free(in_ops);
if (out_op_alloc)
delete out_op_alloc;
if (out_slot_data)
free(out_slot_data);
if (out_slot_ops)
free(out_slot_ops);
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
@@ -149,7 +160,7 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
fprintf(stderr, "Couldn't register global RDMA memory region: %s\n", strerror(errno));
goto cleanup;
}
@@ -180,7 +191,7 @@ cleanup:
}
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send,
uint32_t max_recv, uint32_t max_sge, uint32_t max_msg)
uint32_t max_recv, uint32_t max_sge, uint64_t op_slots, uint64_t op_memory)
{
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
@@ -190,7 +201,6 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
conn->max_send = max_send;
conn->max_recv = max_recv;
conn->max_sge = max_sge;
conn->max_msg = max_msg;
ctx->used_max_cqe += max_send+max_recv;
if (ctx->used_max_cqe > ctx->max_cqe)
@@ -211,6 +221,30 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
ctx->max_cqe = new_max_cqe;
}
conn->op_memory = op_memory;
conn->in_data_buf = memalign_or_die(MEM_ALIGNMENT, op_memory);
conn->in_data_mr = ibv_reg_mr(ctx->pd, conn->in_data_buf, op_memory,
IBV_ACCESS_ZERO_BASED | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
if (!conn->in_data_mr)
{
fprintf(stderr, "Couldn't register %lu MB RDMA memory region for incoming data: %s\n",
(op_memory+1024*1024-1)/1024/1024, strerror(errno));
delete conn;
return NULL;
}
conn->op_slots = op_slots;
conn->in_op_cap = op_slots; // rdma_handle_op() validates incoming slot numbers against in_op_cap
conn->in_ops = (msgr_rdma_cmd_t *)malloc_or_die(sizeof(msgr_rdma_cmd_t) * op_slots);
conn->in_op_mr = ibv_reg_mr(ctx->pd, conn->in_ops, sizeof(msgr_rdma_cmd_t) * op_slots,
IBV_ACCESS_ZERO_BASED | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
if (!conn->in_op_mr)
{
fprintf(stderr, "Couldn't register %lu KB RDMA memory region for incoming operation headers: %s\n",
(sizeof(msgr_rdma_cmd_t) * op_slots + 1023)/1024, strerror(errno));
delete conn;
return NULL;
}
ibv_qp_init_attr init_attr = {
.send_cq = ctx->cq,
.recv_cq = ctx->cq,
@@ -237,7 +271,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
ibv_qp_attr attr = {
.qp_state = IBV_QPS_INIT,
.qp_access_flags = 0,
.qp_access_flags = IBV_ACCESS_REMOTE_WRITE,
.pkey_index = 0,
.port_num = ctx->ib_port,
};
@@ -265,6 +299,19 @@ static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
return IBV_MTU_4096;
}
void msgr_rdma_connection_t::set_out_capacity(uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory)
{
assert(!out_op_alloc);
this->out_data_rkey = out_data_rkey;
this->out_op_rkey = out_op_rkey;
this->out_op_slots = out_op_slots;
this->out_op_memory = out_op_memory;
out_op_alloc = new allocator(out_op_slots);
out_data_alloc.free(0, out_op_memory);
out_slot_data = (msgr_rdma_out_pos_t *)malloc_or_die(sizeof(msgr_rdma_out_pos_t) * out_op_slots);
out_slot_ops = (osd_op_t **)malloc_or_die(sizeof(osd_op_t *) * out_op_slots);
}
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
{
auto conn = this;
@@ -311,17 +358,14 @@ int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
return 0;
}
bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg)
bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address,
uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory)
{
// Try to connect to the peer using RDMA
msgr_rdma_address_t addr;
if (msgr_rdma_address_t::from_string(rdma_address.c_str(), &addr))
{
if (client_max_msg > rdma_max_msg)
{
client_max_msg = rdma_max_msg;
}
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, client_max_msg);
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_op_slots, rdma_op_memory);
if (rdma_conn)
{
int r = rdma_conn->connect(&addr);
@@ -336,6 +380,8 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
else
{
// Remember connection, but switch to RDMA only after sending the configuration response
clients_by_qp[rdma_conn->qp->qp_num] = peer_fd;
rdma_conn->set_out_capacity(out_data_rkey, out_op_rkey, out_op_slots, out_op_memory);
auto cl = clients.at(peer_fd);
cl->rdma_conn = rdma_conn;
cl->peer_state = PEER_RDMA_CONNECTING;
@@ -346,66 +392,161 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
return false;
}
static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
{
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = (uint64_t)(cl->peer_fd*2+1),
.sg_list = sge,
.num_sge = op_sge,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
cl->rdma_conn->cur_send++;
}
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!cl->send_list.size() || rc->cur_send > 0)
if (!cl->send_list.size() && !rc->in_slots_freed.size() || rc->cur_send >= rc->max_send)
{
// Only send one batch at a time
return true;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
while (rc->send_pos < cl->send_list.size())
int i = 0;
while (i < rc->in_slots_freed.size())
{
iovec & iov = cl->send_list[rc->send_pos];
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
{
try_send_rdma_wr(cl, sge, op_sge);
op_sge = 0;
op_size = 0;
if (rc->cur_send >= rc->max_send)
{
break;
}
}
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg
? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size);
sge[op_sge++] = {
.addr = (uintptr_t)((uint8_t*)iov.iov_base+rc->send_buf_pos),
.length = len,
.lkey = rc->ctx->mr->lkey,
auto op_slot = rc->in_slots_freed[i++];
assert(op_slot < 0x80000000);
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = 0,
.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
.imm_data = 0x80000000 | op_slot,
};
op_size += len;
rc->send_buf_pos += len;
if (rc->send_buf_pos >= iov.iov_len)
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
rc->send_pos++;
rc->send_buf_pos = 0;
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
if (rc->cur_send >= rc->max_send)
{
break;
}
}
if (op_sge > 0)
rc->in_slots_freed.erase(rc->in_slots_freed.begin(), rc->in_slots_freed.begin()+i);
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
try_send_rdma_wr(cl, sge, op_sge);
return true;
}
ibv_sge sge[rc->max_sge];
int op_start = 0;
while (op_start < cl->send_list.size())
{
uint64_t op_data_size = 0;
int op_end = op_start;
while (!(cl->outbox[op_end].flags & MSGR_SENDP_LAST))
{
op_data_size += cl->send_list[op_end].iov_len;
op_end++;
}
op_data_size += cl->send_list[op_end].iov_len;
op_end++;
op_data_size -= cl->send_list[op_start].iov_len;
// Operation boundaries in send_list: op_start..op_end, first iovec is the header
uint64_t op_slot = rc->out_op_alloc->find_free();
if (op_slot == UINT64_MAX)
{
// op queue is full
return true;
}
uint64_t data_pos = UINT64_MAX;
// op_data_size is unsigned, so ">= 0" was always true; only reserve remote memory when there is data
if (op_data_size > 0)
{
if (rc->cur_send > rc->max_send-1-(op_end-op_start-1+rc->max_sge)/rc->max_sge)
{
// RDMA queue is full
return true;
}
// FIXME: Oops, and what if op data is larger than the whole buffer... :)
data_pos = rc->out_data_alloc.alloc(op_data_size);
if (data_pos == UINT64_MAX)
{
// data buffers are full
return true;
}
int cur_sge = 0;
// The remote write offset must advance when the data is split into multiple WRs
uint64_t data_wr_pos = data_pos, cur_wr_size = 0;
// Data iovecs of this operation start right after its header at op_start
for (int data_sent = op_start+1; data_sent < op_end; data_sent++)
{
sge[cur_sge++] = {
.addr = (uintptr_t)cl->send_list[data_sent].iov_base,
.length = (uint32_t)cl->send_list[data_sent].iov_len,
.lkey = rc->ctx->mr->lkey,
};
cur_wr_size += cl->send_list[data_sent].iov_len;
if (data_sent == op_end-1 || cur_sge >= rc->max_sge)
{
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = op_slot,
.next = NULL,
.sg_list = sge,
.num_sge = cur_sge,
.opcode = IBV_WR_RDMA_WRITE,
.send_flags = 0,
.wr = {
.rdma = {
.remote_addr = data_wr_pos,
.rkey = rc->out_data_rkey,
},
},
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
data_wr_pos += cur_wr_size;
cur_wr_size = 0;
cur_sge = 0;
}
}
}
if (rc->cur_send > rc->max_send-1)
{
// RDMA queue is full
return true;
}
rc->out_op_alloc->set(op_slot, true);
assert(cl->send_list[op_start].iov_len == OSD_PACKET_SIZE);
sge[0] = {
.addr = (uintptr_t)cl->send_list[op_start].iov_base,
.length = (uint32_t)cl->send_list[op_start].iov_len,
.lkey = rc->ctx->mr->lkey,
};
rc->out_slot_data[op_slot] = { .data_pos = data_pos, .data_size = op_data_size };
rc->out_slot_ops[op_slot] = (cl->outbox[op_end-1].flags & MSGR_SENDP_FREE)
? cl->outbox[op_end-1].op : NULL;
sge[1] = {
.addr = (uintptr_t)(rc->out_slot_data+op_slot),
.length = sizeof(rc->out_slot_data[op_slot]),
.lkey = rc->ctx->mr->lkey,
};
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = op_slot,
.next = NULL,
.sg_list = sge,
.num_sge = 2,
.opcode = IBV_WR_RDMA_WRITE_WITH_IMM,
.send_flags = IBV_SEND_SIGNALED,
.imm_data = (uint32_t)op_slot,
.wr = {
.rdma = {
.remote_addr = op_slot*sizeof(msgr_rdma_cmd_t),
.rkey = rc->out_op_rkey,
},
},
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
{
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
op_start = op_end;
}
if (op_start > 0)
{
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+op_start);
// outbox must stay aligned with send_list, op pointers are already saved in out_slot_ops
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+op_start);
}
return true;
}
@@ -427,23 +568,87 @@ static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
cl->rdma_conn->cur_recv++;
}
static void copy_data_to_recv_list(uint8_t *data_buf, uint64_t data_size, osd_client_t *cl)
{
uint64_t pos = 0;
while (cl->recv_list.done < cl->recv_list.count)
{
uint64_t cur = cl->recv_list.buf[cl->recv_list.done].iov_len;
assert(cur <= data_size-pos);
memcpy(cl->recv_list.buf[cl->recv_list.done].iov_base, data_buf+pos, cur);
pos += cur;
cl->recv_list.done++;
}
cl->recv_list.reset();
}
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf);
ibv_sge sge = {
.addr = (uintptr_t)buf,
.length = (uint32_t)rc->max_msg,
.lkey = rc->ctx->mr->lkey,
};
try_recv_rdma_wr(cl, &sge, 1);
try_recv_rdma_wr(cl, NULL, 0);
}
return true;
}
bool osd_messenger_t::rdma_handle_op(osd_client_t *cl, uint32_t op_slot)
{
auto rc = cl->rdma_conn;
if (op_slot >= rc->in_op_cap)
{
// Invalid incoming index
fprintf(stderr, "Client %d invalid incoming RDMA op slot: %u, dropping connection\n", cl->peer_fd, op_slot);
stop_client(cl->peer_fd);
return false;
}
osd_op_header_t *hdr = (osd_op_header_t *)rc->in_ops[op_slot].header;
uint8_t *data_buf = (uint8_t*)rc->in_data_buf + rc->in_ops[op_slot].pos.data_pos;
uint64_t data_size = rc->in_ops[op_slot].pos.data_size;
if (hdr->magic == SECONDARY_OSD_REPLY_MAGIC)
{
// Reply
if (cl->read_op)
{
delete cl->read_op;
cl->read_op = NULL;
}
if (!handle_reply_hdr(rc->in_ops[op_slot].header, cl))
return false;
if (cl->read_state == CL_READ_REPLY_DATA)
{
// copy reply data to cl->recv_list
copy_data_to_recv_list(data_buf, data_size, cl);
// and handle reply with data
handle_reply_ready(cl->read_op);
cl->read_op = NULL;
cl->read_state = 0;
cl->read_remaining = 0;
}
}
else
{
// Operation
cl->read_op = new osd_op_t;
cl->read_op->peer_fd = cl->peer_fd;
cl->read_op->op_type = OSD_OP_IN;
memcpy(&cl->read_op->req, hdr, OSD_PACKET_SIZE);
handle_op_hdr(cl);
if (cl->read_state == CL_READ_DATA)
{
copy_data_to_recv_list(data_buf, data_size, cl);
// And handle the incoming op with data
cl->received_ops.push_back(cl->read_op);
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
cl->read_op = NULL;
cl->read_state = 0;
}
}
// We don't need the incoming data buffer anymore, notify peer about it
// FIXME: Allow to pass memory to the internal layer without copying and notify after handling it
rc->in_slots_freed.push_back(op_slot);
return true;
}
#define RDMA_EVENTS_AT_ONCE 32
void osd_messenger_t::handle_rdma_events()
@@ -468,9 +673,9 @@ void osd_messenger_t::handle_rdma_events()
event_count = ibv_poll_cq(rdma_context->cq, RDMA_EVENTS_AT_ONCE, wc);
for (int i = 0; i < event_count; i++)
{
int client_id = wc[i].wr_id >> 1;
bool is_send = wc[i].wr_id & 1;
auto cl_it = clients.find(client_id);
auto cqp_it = clients_by_qp.find(wc[i].qp_num);
int peer_fd = cqp_it != clients_by_qp.end() ? cqp_it->second : -1;
auto cl_it = clients.find(peer_fd);
if (cl_it == clients.end())
{
continue;
@@ -478,55 +683,51 @@ void osd_messenger_t::handle_rdma_events()
osd_client_t *cl = cl_it->second;
if (wc[i].status != IBV_WC_SUCCESS)
{
fprintf(stderr, "RDMA work request failed for client %d", client_id);
fprintf(stderr, "RDMA work request failed for client %d", peer_fd);
if (cl->osd_num)
{
fprintf(stderr, " (OSD %lu)", cl->osd_num);
}
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id);
if (peer_fd >= 0)
stop_client(peer_fd);
continue;
}
if (!is_send)
auto rc = cl->rdma_conn;
if (wc[i].opcode == IBV_WC_RDMA_WRITE)
{
cl->rdma_conn->cur_recv--;
if (!handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len))
// Operation or reply is sent, we can free it
auto & op = rc->out_slot_ops[wc[i].wr_id];
if (op)
{
// handle_read_buffer may stop the client
continue;
delete op;
op = NULL;
}
free(cl->rdma_conn->recv_buffers[0]);
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
try_recv_rdma(cl);
rc->cur_send--;
try_send_rdma(cl);
}
else
else if (wc[i].opcode == IBV_WC_RECV)
{
cl->rdma_conn->cur_send--;
if (!cl->rdma_conn->cur_send)
if (!(wc[i].imm_data & 0x80000000))
{
// Wait for the whole batch
for (int i = 0; i < cl->rdma_conn->send_pos; i++)
// Operation or reply received. Handle it
if (!rdma_handle_op(cl, wc[i].imm_data))
{
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
delete cl->outbox[i].op;
}
// false means that the client is stopped due to invalid operation
continue;
}
if (cl->rdma_conn->send_pos > 0)
{
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
cl->rdma_conn->send_pos = 0;
}
if (cl->rdma_conn->send_buf_pos > 0)
{
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos;
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
cl->rdma_conn->send_buf_pos = 0;
}
try_send_rdma(cl);
rc->cur_recv--;
try_recv_rdma(cl);
}
else
{
// Outbox slot is marked as free (the remote side doesn't need it anymore)
uint32_t op_slot = wc[i].imm_data & 0x7FFFFFFF;
auto & pos = rc->in_ops[op_slot].pos;
if (pos.data_size > 0)
rc->out_data_alloc.free(pos.data_pos, pos.data_size);
rc->out_op_alloc->set(op_slot, false);
}
// Try to continue sending
try_send_rdma(cl);
}
}
} while (event_count > 0);

View File

@@ -5,6 +5,11 @@
#include <infiniband/verbs.h>
#include <string>
#include <vector>
#include "allocator.h"
#include "freelist.h"
#include "osd_ops.h"
struct osd_op_t;
struct msgr_rdma_address_t
{
@@ -39,6 +44,17 @@ struct msgr_rdma_context_t
~msgr_rdma_context_t();
};
struct msgr_rdma_out_pos_t
{
uint64_t data_pos, data_size;
};
struct msgr_rdma_cmd_t
{
uint8_t header[OSD_PACKET_SIZE];
msgr_rdma_out_pos_t pos;
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -46,13 +62,24 @@ struct msgr_rdma_connection_t
msgr_rdma_address_t addr;
int max_send = 0, max_recv = 0, max_sge = 0;
int cur_send = 0, cur_recv = 0;
uint64_t max_msg = 0;
uint64_t op_slots = 0, op_memory = 0;
int send_pos = 0, send_buf_pos = 0;
int recv_pos = 0, recv_buf_pos = 0;
std::vector<void*> recv_buffers;
ibv_mr *in_data_mr = NULL, *in_op_mr = NULL;
msgr_rdma_cmd_t *in_ops = NULL;
int in_op_cap = 0;
void *in_data_buf = NULL;
std::vector<uint32_t> in_slots_freed;
uint32_t out_data_rkey = 0, out_op_rkey = 0;
uint64_t out_op_slots = 0, out_op_memory = 0;
allocator *out_op_alloc = NULL;
freelist_allocator_t out_data_alloc;
msgr_rdma_out_pos_t *out_slot_data = NULL;
osd_op_t **out_slot_ops = NULL;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send,
uint32_t max_recv, uint32_t max_sge, uint64_t op_slots, uint64_t op_memory);
int connect(msgr_rdma_address_t *dest);
void set_out_capacity(uint32_t out_data_rkey, uint32_t out_op_rkey, uint64_t out_op_slots, uint64_t out_op_memory);
};
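
Note: the new out_* fields implement sender-side management of the receiver's registered memory: out_op_alloc tracks remote op slots, out_data_alloc tracks remote data bytes, and out_slot_ops remembers which osd_op_t to free on send completion. A hedged sketch of the send path these structures imply (find_free()/set() follow the existing allocator class; freelist_allocator_t::alloc() is assumed from the new freelist.h; op and data_len are illustrative):

    // Sketch, not the actual patch code: reserve a remote op slot plus remote
    // data space before issuing IBV_WR_RDMA_WRITE, and remember the osd_op_t
    // so it can be freed when the IBV_WC_RDMA_WRITE completion arrives.
    uint64_t op_slot = rc->out_op_alloc->find_free();        // free remote header slot
    rc->out_op_alloc->set(op_slot, true);
    uint64_t data_pos = rc->out_data_alloc.alloc(data_len);  // remote data range (assumed API)
    rc->out_slot_data[op_slot] = { .data_pos = data_pos, .data_size = data_len };
    rc->out_slot_ops[op_slot] = op;                          // deleted on send completion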

View File

@@ -172,7 +172,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
if (cl->read_state == CL_READ_HDR)
{
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
-        return handle_reply_hdr(cl);
+        return handle_reply_hdr(cl->read_op->req.buf, cl);
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
handle_op_hdr(cl);
else
@@ -286,7 +286,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
}
}
-bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
+bool osd_messenger_t::handle_reply_hdr(void *reply_hdr, osd_client_t *cl)
{
auto req_it = cl->sent_ops.find(cl->read_op->req.hdr.id);
if (req_it == cl->sent_ops.end())
@@ -297,7 +297,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
return false;
}
osd_op_t *op = req_it->second;
-    memcpy(op->reply.buf, cl->read_op->req.buf, OSD_PACKET_SIZE);
+    memcpy(op->reply.buf, reply_hdr, OSD_PACKET_SIZE);
cl->sent_ops.erase(req_it);
if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
{
@@ -328,14 +328,16 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
{
goto reuse;
}
-        delete cl->read_op;
+        if (cl->read_op)
+            delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
}
else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0)
{
assert(!op->iov.count);
-        delete cl->read_op;
+        if (cl->read_op)
+            delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
@@ -345,7 +347,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
else if (op->reply.hdr.opcode == OSD_OP_SEC_READ_BMP && op->reply.hdr.retval > 0)
{
assert(!op->iov.count);
-        delete cl->read_op;
+        if (cl->read_op)
+            delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval;
@@ -355,7 +358,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
}
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
{
-        delete cl->read_op;
+        if (cl->read_op)
+            delete cl->read_op;
cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval;
@@ -368,7 +372,8 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
reuse:
// It's fine to reuse cl->read_op for the next reply
handle_reply_ready(op);
-    cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
+    if (cl->read_op)
+        cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
cl->read_remaining = OSD_PACKET_SIZE;
cl->read_state = CL_READ_HDR;
}
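
Note: passing the header pointer into handle_reply_hdr() decouples it from cl->read_op, which is presumably what lets the RDMA v2 receive path parse a header in place, straight from the registered in_ops region, without copying it into a read_op first. A sketch under that assumption (op_slot and the ok flag are illustrative; field names follow msgr_rdma_cmd_t above):

    uint8_t *in_hdr = rc->in_ops[op_slot].header;
    if (((osd_any_reply_t*)in_hdr)->hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
        ok = handle_reply_hdr(in_hdr, cl); // no copy into cl->read_op needed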

View File

@@ -96,6 +96,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
}
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_LAST;
if (cur_op->op_type == OSD_OP_IN)
{
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_FREE;

View File

@@ -129,6 +129,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
#ifdef WITH_RDMA
if (cl->rdma_conn)
{
clients_by_qp.erase(cl->rdma_conn->qp->qp_num);
delete cl->rdma_conn;
}
#endif

View File

@@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
if (recovery_pg_switch < 1)
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;

View File

@@ -34,6 +34,7 @@
#define DEFAULT_AUTOSYNC_WRITES 128
#define MAX_RECOVERY_QUEUE 2048
#define DEFAULT_RECOVERY_QUEUE 4
#define DEFAULT_RECOVERY_PG_SWITCH 128
#define DEFAULT_RECOVERY_BATCH 16
//#define OSD_STUB
@@ -108,6 +109,7 @@ class osd_t
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int inode_vanish_time = 60;
int log_level = 0;
@@ -135,7 +137,10 @@ class osd_t
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
int peering_state = 0;
std::map<object_id, osd_recovery_op_t> recovery_ops;
int recovery_done = 0;
bool recovery_last_degraded = true;
pool_pg_num_t recovery_last_pg;
object_id recovery_last_oid;
int recovery_pg_done = 0, recovery_done = 0;
osd_op_t *autosync_op = NULL;
// Unstable writes
@@ -200,7 +205,6 @@ class osd_t
bool check_peer_config(osd_client_t *cl, json11::Json conf);
void repeer_pgs(osd_num_t osd_num);
void start_pg_peering(pg_t & pg);
void submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void discard_list_subop(osd_op_t *list_op);
bool stop_pg(pg_t & pg);

View File

@@ -132,7 +132,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
this->osd_num, immediate_commit == IMMEDIATE_ALL ? "all" : "small",
cl->osd_num, conf["immediate_commit"].string_value().c_str()
);
-        return true;
+        return false;
}
else if (conf["block_size"].uint64_value() != (uint64_t)this->bs_block_size)
{
@@ -140,7 +140,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
"[OSD %lu] My block_size is %u, but peer OSD %lu has %lu. We can't work together\n",
this->osd_num, this->bs_block_size, cl->osd_num, conf["block_size"].uint64_value()
);
-        return true;
+        return false;
}
else if (conf["bitmap_granularity"].uint64_value() != (uint64_t)this->bs_bitmap_granularity)
{
@@ -148,7 +148,7 @@ bool osd_t::check_peer_config(osd_client_t *cl, json11::Json conf)
"[OSD %lu] My bitmap_granularity is %u, but peer OSD %lu has %lu. We can't work together\n",
this->osd_num, this->bs_bitmap_granularity, cl->osd_num, conf["bitmap_granularity"].uint64_value()
);
-        return true;
+        return false;
}
}
return true;
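
Note: returning true on a mismatch meant the messenger kept talking to an incompatible peer; with the fix, false signals incompatibility. The expected call-site behavior would be (hypothetical sketch — the hook is invoked during the messenger handshake):

    if (!check_peer_config(cl, conf))
    {
        stop_client(cl->peer_fd); // incompatible block_size etc.: drop the peer
        return;
    }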
@@ -382,30 +382,6 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
}
}
-void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
-{
-    auto pg_it = pgs.find({
-        .pool_id = pool_id,
-        .pg_num = pg_num,
-    });
-    if (pg_it != pgs.end() && pg_it->second.epoch > pg_it->second.reported_epoch &&
-        st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg_it->second.epoch)
-    {
-        pg_it->second.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
-        object_id oid = { 0 };
-        bool first = true;
-        for (auto op: pg_it->second.write_queue)
-        {
-            if (first || oid != op.first)
-            {
-                oid = op.first;
-                first = false;
-                continue_primary_write(op.second);
-            }
-        }
-    }
-}
void osd_t::on_load_config_hook(json11::Json::object & global_config)
{
json11::Json::object osd_config = this->config;
@@ -704,13 +680,16 @@ void osd_t::apply_pg_config()
}
}
}
+    auto vec_all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end());
     if (currently_taken)
     {
-        if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
+        if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
         {
-            if (pg_it->second.target_set == pg_cfg.target_set)
+            if (pg_it->second.target_set == pg_cfg.target_set &&
+                pg_it->second.target_history == pg_cfg.target_history &&
+                pg_it->second.all_peers == vec_all_peers)
             {
-                // No change in osd_set; history changes are ignored
+                // No change in osd_set and history
                 continue;
}
else
@@ -761,7 +740,7 @@ void osd_t::apply_pg_config()
.pg_num = pg_num,
.reported_epoch = pg_cfg.epoch,
.target_history = pg_cfg.target_history,
-        .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
+        .all_peers = vec_all_peers,
.target_set = pg_cfg.target_set,
};
if (pg.scheme == POOL_SCHEME_EC)
@@ -984,13 +963,6 @@ void osd_t::report_pg_states()
}
this->pgs.erase(pg_it);
}
-    else if (pg_it->second.state & PG_PEERED)
-    {
-        // Activate PG after PG PEERED state is reported along with history
-        // (if the state wasn't changed again)
-        pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
-        report_pg_state(pg_it->second);
-    }
}
}
// Push other PG state updates, if any

View File

@@ -226,42 +226,51 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
{
-    if (!no_recovery)
+    if (!pgs.size())
     {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
-        {
-            if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
-            {
-                for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
-                {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
-                    {
-                        op.degraded = true;
-                        op.oid = obj_it->first;
-                        return true;
-                    }
-                }
-            }
-        }
+        return false;
     }
-    if (!no_rebalance)
+    // Restart scanning from the same degraded/misplaced status as the last time
+    for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++)
     {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
+        if (recovery_last_degraded ? !no_recovery : !no_rebalance)
         {
-            // Don't try to "recover" misplaced objects if "recovery" would make them degraded
-            if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
+            auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
+            auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
+            // Restart scanning from the same PG as the last time
+            for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
             {
-                for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
+                if ((pg_it->second.state & mask) == check)
                 {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                    auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
+                    assert(src.size() > 0);
+                    // Restart scanning from the next object
+                    for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
                     {
-                        op.degraded = false;
-                        op.oid = obj_it->first;
-                        return true;
+                        if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                        {
+                            op.degraded = recovery_last_degraded;
+                            recovery_last_oid = op.oid = obj_it->first;
+                            recovery_pg_done++;
+                            // Switch to another PG after recovery_pg_switch operations
+                            // to always mix all PGs during recovery but still benefit
+                            // from recovery queue depth greater than 1
+                            if (recovery_pg_done >= recovery_pg_switch)
+                            {
+                                recovery_pg_done = 0;
+                                recovery_last_pg.pg_num++;
+                                recovery_last_oid = {};
+                            }
+                            return true;
+                        }
                     }
                 }
             }
         }
+        recovery_last_degraded = !recovery_last_degraded;
+        recovery_last_pg = {};
+        recovery_last_oid = {};
}
return false;
}
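
Note: the rewrite replaces two full rescans of all PGs with a persistent (degraded-flag, PG, object) cursor, so successive picks continue where the last one stopped. A self-contained illustration (not from the patch) of the same lower_bound()/upper_bound() resume pattern on nested ordered maps:

    #include <cstdio>
    #include <map>

    int main()
    {
        std::map<int, std::map<int, const char*>> pgs = {
            { 1, { { 10, "a" }, { 20, "b" } } },
            { 2, { { 30, "c" } } },
        };
        int last_pg = 0, last_oid = 0;
        for (int step = 0; step < 4; step++)
        {
            // lower_bound() restarts at the saved PG, upper_bound() skips past
            // the last processed object, so repeated calls walk everything once.
            for (auto pg_it = pgs.lower_bound(last_pg); pg_it != pgs.end(); pg_it++)
            {
                if (pg_it->first != last_pg)
                    last_oid = 0; // entered a new PG: scan it from the start
                auto obj_it = pg_it->second.upper_bound(last_oid);
                if (obj_it == pg_it->second.end())
                    continue;
                last_pg = pg_it->first;
                last_oid = obj_it->first;
                printf("picked pg=%d oid=%d (%s)\n", last_pg, last_oid, obj_it->second);
                break;
            }
        }
        return 0;
    }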

View File

@@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
{
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
}
inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b)
{
return a.pool_id == b.pool_id && a.pg_num == b.pg_num;
}
inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
{
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
}

View File

@@ -9,6 +9,8 @@
#include "str_util.h"
#include "osd.h"
#define SELF_FD -1
// Peering loop
void osd_t::handle_peers()
{
@@ -30,7 +32,16 @@ void osd_t::handle_peers()
if (p.second.state & PG_HAS_UNCLEAN)
peering_state = peering_state | OSD_FLUSHING_PGS;
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
{
peering_state = peering_state | OSD_RECOVERING;
if (p.second.state & PG_HAS_DEGRADED)
{
// Restart recovery from degraded objects
recovery_last_degraded = true;
recovery_last_pg = {};
recovery_last_oid = {};
}
}
ringloop->wakeup();
return;
}
@@ -39,10 +50,6 @@ void osd_t::handle_peers()
still = true;
}
}
-    else if (p.second.state & PG_PEERED)
-    {
-        still = true;
-    }
}
if (!still)
{
@@ -63,10 +70,6 @@ void osd_t::handle_peers()
}
still = true;
}
-    else if (p.second.state & PG_PEERED)
-    {
-        still = true;
-    }
}
if (!still)
{
@@ -89,7 +92,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
{
auto & pg = p.second;
bool repeer = false;
-        if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
+        if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
{
for (osd_num_t pg_osd: pg.all_peers)
{
@@ -300,82 +303,11 @@ void osd_t::start_pg_peering(pg_t & pg)
{
continue;
}
-        submit_sync_and_list_subop(peer_osd, pg.peering_state);
+        submit_list_subop(peer_osd, pg.peering_state);
}
ringloop->wakeup();
}
-void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
-{
-    // Sync before listing, if not readonly
-    if (readonly)
-    {
-        submit_list_subop(role_osd, ps);
-    }
-    else if (role_osd == this->osd_num)
-    {
-        // Self
-        osd_op_t *op = new osd_op_t();
-        op->op_type = 0;
-        op->peer_fd = -1;
-        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t();
-        op->bs_op->opcode = BS_OP_SYNC;
-        op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
-        {
-            if (bs_op->retval < 0)
-            {
-                printf("Local OP_SYNC failed: %d (%s)\n", bs_op->retval, strerror(-bs_op->retval));
-                force_stop(1);
-                return;
-            }
-            add_bs_subop_stats(op);
-            delete op->bs_op;
-            op->bs_op = NULL;
-            delete op;
-            ps->list_ops.erase(role_osd);
-            submit_list_subop(role_osd, ps);
-        };
-        bs->enqueue_op(op->bs_op);
-        ps->list_ops[role_osd] = op;
-    }
-    else
-    {
-        // Peer
-        auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
-        osd_op_t *op = new osd_op_t();
-        op->op_type = OSD_OP_OUT;
-        op->peer_fd = cl->peer_fd;
-        op->req = (osd_any_op_t){
-            .sec_sync = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = msgr.next_subop_id++,
-                    .opcode = OSD_OP_SEC_SYNC,
-                },
-            },
-        };
-        op->callback = [this, ps, role_osd](osd_op_t *op)
-        {
-            if (op->reply.hdr.retval < 0)
-            {
-                // FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
-                printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
-                int fail_fd = op->peer_fd;
-                ps->list_ops.erase(role_osd);
-                delete op;
-                msgr.stop_client(fail_fd);
-                return;
-            }
-            delete op;
-            ps->list_ops.erase(role_osd);
-            submit_list_subop(role_osd, ps);
-        };
-        msgr.outbox_push(op);
-        ps->list_ops[role_osd] = op;
-    }
-}
void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
{
if (role_osd == this->osd_num)
@@ -383,7 +315,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
// Self
osd_op_t *op = new osd_op_t();
op->op_type = 0;
-        op->peer_fd = -1;
+        op->peer_fd = SELF_FD;
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST;
@@ -415,8 +347,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
op->bs_op = NULL;
delete op;
};
-        bs->enqueue_op(op->bs_op);
         ps->list_ops[role_osd] = op;
+        bs->enqueue_op(op->bs_op);
}
else
{
@@ -463,14 +395,14 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
ps->list_ops.erase(role_osd);
delete op;
};
-        msgr.outbox_push(op);
         ps->list_ops[role_osd] = op;
+        msgr.outbox_push(op);
}
}
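
Note: the two reorderings above look cosmetic but are presumably reentrancy fixes: the op must be registered in ps->list_ops before submission, because the completion callback, which may run synchronously, erases that entry. The invariant, as a sketch:

    // Register first, submit second — the callback may fire before
    // enqueue_op()/outbox_push() returns and will erase list_ops[role_osd].
    ps->list_ops[role_osd] = op;
    bs->enqueue_op(op->bs_op);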
void osd_t::discard_list_subop(osd_op_t *list_op)
{
-    if (list_op->peer_fd == 0)
+    if (list_op->peer_fd == SELF_FD)
{
// Self
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
@@ -549,13 +481,17 @@ void osd_t::report_pg_state(pg_t & pg)
pg.history_changed = true;
pg.target_history.clear();
pg.all_peers = pg.target_set;
std::sort(pg.all_peers.begin(), pg.all_peers.end());
pg.cur_peers = pg.target_set;
}
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
-    pg.history_changed = true;
-    pg.target_history.clear();
+    if (pg.target_history.size())
+    {
+        pg.history_changed = true;
+        pg.target_history.clear();
+    }
std::set<osd_num_t> dead_peers;
for (auto pg_osd: pg.all_peers)
{
@@ -572,8 +508,12 @@ void osd_t::report_pg_state(pg_t & pg)
dead_peers.insert(pg_osd);
}
}
-    pg.all_peers.clear();
-    pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
+    auto new_all_peers = std::vector<osd_num_t>(dead_peers.begin(), dead_peers.end());
+    if (pg.all_peers != new_all_peers)
+    {
+        pg.history_changed = true;
+        pg.all_peers = new_all_peers;
+    }
pg.cur_peers.clear();
for (auto pg_osd: pg.target_set)
{

View File

@@ -86,24 +86,11 @@ void pg_obj_state_check_t::walk()
}
if (pg->pg_cursize < pg->pg_size)
{
-        // Report PG history and activate
-        pg->state |= PG_DEGRADED | PG_PEERED;
-        std::vector<osd_num_t> history_set;
-        for (auto peer_osd: pg->cur_set)
-        {
-            if (peer_osd != 0)
-            {
-                history_set.push_back(peer_osd);
-            }
-        }
-        pg->target_history.push_back(history_set);
-        pg->history_changed = true;
-    }
-    else
-    {
-        // Just activate
-        pg->state |= PG_ACTIVE;
+        // Activate as degraded
+        // Current OSD set will be added into target_history on first write
+        pg->state |= PG_DEGRADED;
     }
+    pg->state |= PG_ACTIVE;
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
{
pg->state |= PG_LEFT_ON_DEAD;
@@ -435,21 +422,44 @@ void pg_t::calc_object_states(int log_level)
std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states
st.walk();
-    if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
+    if (this->state != PG_ACTIVE)
{
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
epoch++;
}
if (log_level > 0)
{
std::string osd_set_desc;
for (auto & osd_num: target_set)
{
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+std::to_string(osd_num);
}
printf(
"[PG %u/%u] %lu clean objects on target OSD set %s\n",
pool_id, pg_num, clean_count, osd_set_desc.c_str()
);
for (auto & stp: state_dict)
{
osd_set_desc = "";
for (auto & loc: stp.first)
{
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
std::to_string(loc.osd_num)+
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
(loc.outdated ? "(old)" : "");
}
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
}
}
}
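
Note: with illustrative values, the new per-OSD-set accounting would print one line for the clean count plus one per distinct object location set, with role numbers for EC and "(old)" for outdated copies (the values below are invented; the format strings are the ones above):

    [PG 1/42] 1000 clean objects on target OSD set 1, 2, 3
    [PG 1/42] 25 objects on OSD set 1(0), 2(1), 5(2)(old)
    [PG 1/42] 3 objects on OSD set 1, 5(old)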
void pg_t::print_state()
{
printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "",
(state & PG_PEERED) ? "peered" : "",
(state & PG_INCOMPLETE) ? "incomplete" : "",
(state & PG_ACTIVE) ? "active" : "",
(state & PG_REPEERING) ? "repeering" : "",

View File

@@ -54,5 +54,6 @@ int main(int argc, char *argv[])
{
printf("dev: state=%lx\n", it.second.state);
}
delete pg.peering_state;
return 0;
}

View File

@@ -228,7 +228,7 @@ resume_1:
resume_2:
if (op_data->errors > 0)
{
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+        finish_op(cur_op, op_data->errcode);
return;
}
cur_op->reply.rw.version = op_data->fact_ver;
@@ -350,7 +350,7 @@ resume_2:
resume_3:
if (op_data->errors > 0)
{
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
// Check CAS version
@@ -371,7 +371,7 @@ resume_4:
resume_5:
if (op_data->errors > 0)
{
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
// Remove version override

View File

@@ -22,9 +22,9 @@ struct osd_primary_op_data_t
pg_num_t pg_num;
object_id oid;
uint64_t target_ver;
-    uint64_t fact_ver = 0;
+    uint64_t orig_ver = 0, fact_ver = 0;
     uint64_t scheme = 0;
-    int n_subops = 0, done = 0, errors = 0, epipe = 0;
+    int n_subops = 0, done = 0, errors = 0, errcode = 0;
int degraded = 0, pg_size, pg_data_size;
osd_rmw_stripe_t *stripes;
osd_op_t *subops = NULL;

View File

@@ -42,7 +42,7 @@ resume_4:
{
free(op_data->chain_reads);
op_data->chain_reads = NULL;
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+        finish_op(cur_op, op_data->errcode);
return;
}
send_chained_read_results(pg, cur_op);
@@ -297,7 +297,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
// Fail it immediately
subop->peer_fd = -1;
subop->reply.hdr.retval = -EPIPE;
-        subop->callback(subop);
+        ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
}
subop_idx++;
}
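
Note: replacing the direct callback invocation with ringloop->set_immediate() defers the failure callback until the current call stack unwinds, so an -EPIPE completion can no longer re-enter the submission code that is still running. A minimal sketch of the deferral idea (not the actual ringloop implementation):

    #include <functional>
    #include <vector>

    struct mini_loop
    {
        std::vector<std::function<void()>> immediates;
        void set_immediate(std::function<void()> cb) { immediates.push_back(std::move(cb)); }
        void loop()
        {
            auto to_run = std::move(immediates); // run a snapshot; callbacks may queue more
            for (auto & cb: to_run)
                cb();
        }
    };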

View File

@@ -122,7 +122,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
zero_read = -1;
osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0;
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_subops;
op_data->subops = subops;
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
@@ -235,7 +235,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
// Fail it immediately
subop->peer_fd = -1;
subop->reply.hdr.retval = -EPIPE;
-        subop->callback(subop);
+        ringloop->set_immediate([subop]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
}
}
i++;
@@ -263,9 +263,11 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
blockstore_op_t *bs_op = subop->bs_op;
int expected = bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE
|| bs_op->opcode == BS_OP_WRITE_STABLE ? bs_op->len : 0;
-    if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ)
+    if (bs_op->retval != expected && bs_op->opcode != BS_OP_READ &&
+        (bs_op->opcode != BS_OP_WRITE && bs_op->opcode != BS_OP_WRITE_STABLE ||
+        bs_op->retval != -ENOSPC))
     {
-        // die
+        // die on any error except ENOSPC
throw std::runtime_error(
"local blockstore modification failed (opcode = "+std::to_string(bs_op->opcode)+
" retval = "+std::to_string(bs_op->retval)+")"
@@ -276,6 +278,8 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
subop->reply.hdr.retval = bs_op->retval;
if (bs_op->opcode == BS_OP_READ || bs_op->opcode == BS_OP_WRITE || bs_op->opcode == BS_OP_WRITE_STABLE)
{
subop->req.sec_rw.oid = bs_op->oid;
subop->req.sec_rw.version = bs_op->version;
subop->req.sec_rw.len = bs_op->len;
subop->reply.sec_rw.version = bs_op->version;
}
@@ -337,14 +341,17 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
osd_op_names[opcode], subop->peer_fd, retval, expected
);
}
-    if (retval == -EPIPE)
+    // Error priority: EIO > ENOSPC > EPIPE
+    if (op_data->errcode == 0 || retval == -EIO ||
+        retval == -ENOSPC && op_data->errcode == -EPIPE)
     {
-        op_data->epipe++;
+        op_data->errcode = retval;
}
op_data->errors++;
-    if (subop->peer_fd >= 0)
+    if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
+        retval != -ENOSPC))
     {
-        // Drop connection on any error
+        // Drop connection on any error except ENOSPC
msgr.stop_client(subop->peer_fd);
}
}
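
Note: the same priority rule, factored into a helper for clarity — equivalent to the condition above, but not code from the patch:

    #include <errno.h>
    // Keep the most severe error seen across subops: EIO > ENOSPC > EPIPE.
    static inline int merge_errcode(int cur, int retval)
    {
        if (cur == 0 || retval == -EIO || (retval == -ENOSPC && cur == -EPIPE))
            return retval;
        return cur;
    }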
@@ -408,7 +415,8 @@ void osd_t::cancel_primary_write(osd_op_t *cur_op)
// are sent to peer OSDs, so we can't just throw them away.
// Mark them with an extra EPIPE.
cur_op->op_data->errors++;
-    cur_op->op_data->epipe++;
+    if (cur_op->op_data->errcode == 0)
+        cur_op->op_data->errcode = -EPIPE;
cur_op->op_data->done--; // Caution: `done` must be signed because may become -1 here
}
else
@@ -460,7 +468,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
{
osd_primary_op_data_t *op_data = cur_op->op_data;
op_data->n_subops = chunks_to_delete_count;
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
if (!op_data->n_subops)
{
return;
@@ -512,7 +520,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
// Fail it immediately
subops[i].peer_fd = -1;
subops[i].reply.hdr.retval = -EPIPE;
-        subops[i].callback(&subops[i]);
+        ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
}
}
}
@@ -523,7 +531,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
osd_primary_op_data_t *op_data = cur_op->op_data;
int n_osds = op_data->dirty_osd_count;
osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
std::map<uint64_t, int>::iterator peer_it;
@@ -579,7 +587,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
osd_primary_op_data_t *op_data = cur_op->op_data;
int n_osds = op_data->unstable_write_osds->size();
osd_op_t *subops = new osd_op_t[n_osds];
-    op_data->done = op_data->errors = 0;
+    op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_osds;
op_data->subops = subops;
for (int i = 0; i < n_osds; i++)
@@ -627,7 +635,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
// Fail it immediately
subops[i].peer_fd = -1;
subops[i].reply.hdr.retval = -EPIPE;
-        subops[i].callback(&subops[i]);
+        ringloop->set_immediate([subop = &subops[i]]() { std::function<void(osd_op_t*)>(subop->callback)(subop); });
}
}
}

View File

@@ -240,7 +240,7 @@ resume_8:
}
if (op_data->errors > 0)
{
-        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
+        finish_op(cur_op, op_data->errcode);
}
else
{

View File

@@ -93,7 +93,7 @@ resume_2:
resume_3:
if (op_data->errors > 0)
{
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
// Check CAS version
@@ -138,6 +138,7 @@ resume_3:
}
}
// Send writes
op_data->orig_ver = op_data->fact_ver;
if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
{
op_data->target_ver = ((uint64_t)pg.epoch << (64-PG_EPOCH_BITS)) | 1;
@@ -154,17 +155,36 @@ resume_3:
if (pg.epoch > pg.reported_epoch)
{
// Report newer epoch before writing
-        // FIXME: We may report only one PG state here...
+        // FIXME: We don't have to report all changed PG states here
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (pg.state != PG_ACTIVE)
{
// Check that current OSD set is in history and/or add it there
std::vector<osd_num_t> history_set;
for (auto peer_osd: pg.cur_set)
if (peer_osd != 0)
history_set.push_back(peer_osd);
std::sort(history_set.begin(), history_set.end());
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
if (it == pg.target_history.end() || *it != history_set)
pg.target_history.insert(it, history_set);
}
pg.history_changed = true;
report_pg_states();
resume_10:
if (pg.epoch > pg.reported_epoch)
{
-            op_data->st = 10;
+#define PG_EPOCH_WAIT_STATE 10
+            op_data->st = PG_EPOCH_WAIT_STATE;
return;
}
}
// Recheck PG state after reporting history - maybe it's already stopping/restarting
if (pg.state & (PG_STOPPING|PG_REPEERING))
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
return;
}
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.cur_set.data(), cur_op);
resume_4:
op_data->st = 4;
@@ -177,7 +197,7 @@ resume_5:
}
if (op_data->errors > 0)
{
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
if (op_data->object_state)
@@ -194,7 +214,7 @@ resume_7:
{
return;
}
-    if (op_data->fact_ver == 1)
+    if (op_data->orig_ver == 0)
{
// Object is created
pg.clean_count++;
@@ -254,7 +274,7 @@ resume_8:
resume_9:
if (op_data->errors > 0)
{
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
}
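
Note: the target_ver arithmetic in this hunk packs the PG epoch into the top PG_EPOCH_BITS bits of the 64-bit object version, with the remaining bits as a per-epoch counter starting at 1; that is also why the new orig_ver == 0 check cleanly detects object creation where fact_ver == 1 did not (fact_ver can already carry an epoch). A sketch with an illustrative width — PG_EPOCH_BITS itself is defined elsewhere in the tree:

    static const int EPOCH_BITS = 16; // example value only
    static inline uint64_t ver_epoch(uint64_t ver) { return ver >> (64 - EPOCH_BITS); }
    static inline uint64_t epoch_first_ver(uint64_t epoch) { return (epoch << (64 - EPOCH_BITS)) | 1; }
    // A write into a newer epoch starts at epoch_first_ver(pg.epoch).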
@@ -286,6 +306,50 @@ continue_others:
}
}
void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
{
auto pg_it = pgs.find({
.pool_id = pool_id,
.pg_num = pg_num,
});
if (pg_it == pgs.end())
{
return;
}
auto & pg = pg_it->second;
if (pg.epoch > pg.reported_epoch &&
st_cli.pool_config[pool_id].pg_config[pg_num].epoch >= pg.epoch)
{
pg.reported_epoch = st_cli.pool_config[pool_id].pg_config[pg_num].epoch;
std::vector<object_id> resume_oids;
for (auto & op: pg.write_queue)
{
if (op.second->op_data->st == PG_EPOCH_WAIT_STATE)
{
// Run separately to prevent side effects
resume_oids.push_back(op.first);
}
}
for (auto & oid: resume_oids)
{
auto pg_it = pgs.find({
.pool_id = pool_id,
.pg_num = pg_num,
});
if (pg_it != pgs.end())
{
auto & pg = pg_it->second;
auto op_it = pg.write_queue.find(oid);
if (op_it != pg.write_queue.end() &&
op_it->second->op_data->st == PG_EPOCH_WAIT_STATE)
{
continue_primary_write(op_it->second);
}
}
}
}
}
bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
@@ -336,7 +400,7 @@ resume_7:
op_data->unstable_write_osds = NULL;
if (op_data->errors > 0)
{
-        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
+        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return false;
}
}

Some files were not shown because too many files have changed in this diff.