Compare commits
2 commits: test-etcd-...cow-meta

Commits compared:
2f3b1f37a2
abd5cbfbe4
@@ -684,24 +684,6 @@ jobs:
echo ""
done

test_write_iothreads:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done

test_write_no_same:
runs-on: ubuntu-latest
needs: build

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

project(vitastor)

set(VITASTOR_VERSION "2.2.2")
set(VITASTOR_VERSION "2.2.0")

add_subdirectory(src)

Submodule cpp-btree updated: 8de8b467ac...a21350e484

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.2
VITASTOR_VERSION ?= v2.2.0

all: build push

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v2.2.2
image: vitalif/vitastor-csi:v2.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v2.2.2
image: vitalif/vitastor-csi:v2.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

@@ -5,7 +5,7 @@ package vitastor

const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "2.2.2"
vitastorCSIDriverVersion = "2.2.0"
)

// Config struct fills the parameters of request or user input
debian/changelog (vendored)
@@ -1,4 +1,4 @@
vitastor (2.2.2-1) unstable; urgency=medium
vitastor (2.2.0-1) unstable; urgency=medium

* Bugfixes

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.2
VITASTOR_VERSION ?= v2.2.0

all: build push

@@ -4,7 +4,7 @@
#

# Desired Vitastor version
VITASTOR_VERSION=v2.2.2
VITASTOR_VERSION=v2.2.0

# Additional arguments for all containers
# For example, you may want to specify a custom logging driver here
@@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
The instruction is very simple.

1. Download a Docker image of the desired version: \
`docker pull vitastor:v2.2.2`
`docker pull vitastor:v2.2.0`
2. Install scripts to the host system: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`
@@ -25,9 +25,9 @@ Vitastor can be installed in Docker/Podman. In this case etcd,
The installation instructions are as simple as possible.

1. Download a Docker image of the desired version: \
`docker pull vitastor:v2.2.2`
`docker pull vitastor:v2.2.0`
2. Install the scripts into the host system with: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`
@@ -6,10 +6,10 @@

# Proxmox VE

To enable Vitastor support in Proxmox Virtual Environment (6.4-8.x are supported):
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):

- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),
@@ -6,10 +6,10 @@

# Proxmox VE

To connect Vitastor to Proxmox Virtual Environment (versions 6.4-8.x are supported):
To connect Vitastor to Proxmox Virtual Environment (versions 6.4-8.1 are supported):

- Add the corresponding Vitastor Debian repository into sources.list on the Proxmox hosts:
bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install the vitastor-client, pve-qemu-kvm and pve-storage-vitastor (* or see the note) packages from the Vitastor repository
- Define the storage type in `/etc/pve/storage.cfg` (see below)
- Make sure to block access from virtual machines to the Vitastor network (OSDs and etcd), because Vitastor does not (yet) support authentication
@@ -10,17 +10,8 @@ Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+

Join Vitastor Telegram Chat: https://t.me/vitastor

License: VNPL 1.1 for server-side code and dual VNPL 1.1 + GPL 2.0+ for client tools.

Server-side code is licensed only under the terms of VNPL.

Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

## VNPL

Vitastor Network Public License 1.1 (VNPL 1.1) is a copyleft license based on
All server-side code (OSD, Monitor and so on) is licensed under the terms of
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
@@ -29,83 +20,18 @@ the terms of the same license, but also under the terms of any GPL-Compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.

The idea of VNPL is, in addition to modules linked to Vitastor code in a single
binary file, to extend copyleft action to micro-service modules only interacting
with it over the network.
Please note that VNPL doesn't require you to open the code of proprietary
software running inside a VM if it's not specially designed to be used with
Vitastor.

Basically, you can't use the software in a proprietary environment to provide
its functionality to users without opensourcing all intermediary components
standing between the user and Vitastor or purchasing a commercial license
from the author 😀.

At the same time, VNPL doesn't impose any restrictions on software *not specially designed*
to be used with Vitastor, for example, on Windows running inside a VM with a Vitastor disk.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.

## Explanation

Network copyleft is governed by the clause **13. Remote Network Interaction** of VNPL.

A program is considered to be a "Proxy Program" if it meets both conditions:
- It is specially designed to be used with Vitastor. Basically, it means that the program
has any functionality specific to Vitastor and thus "knows" that it works with Vitastor,
not with something random.
- It interacts with Vitastor directly or indirectly through any programming interface,
including API, CLI, network or any wrapper (also considered a Proxy Program itself).

If, in addition to that:
- You give any user an opportunity to interact with Vitastor directly or indirectly through
any computer interface including the network or any number of wrappers (Proxy Programs).

Then VNPL requires you to publish the code of all above Proxy Programs to all above users
under the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2,
because "GPL compatibility" is treated as an ability to legally include licensed code in
a GPL application.

So, if you have a "Proxy Program", but it's not open to the user who directly or indirectly
interacts with Vitastor - you are forbidden to use Vitastor under the terms of VNPL and you
need a commercial license which doesn't contain open-source requirements.

## Examples

- Vitastor Kubernetes CSI driver which creates PersistentVolumes by calling `vitastor-cli create`.
- Yes, it interacts with Vitastor through vitastor-cli.
- Yes, it is designed specially for use with Vitastor (it has no sense otherwise).
- So, CSI driver **definitely IS** a Proxy Program and must be published under the terms of
a free software license.
- Windows, installed in a VM with the system disk on Vitastor storage.
- Yes, it interacts with Vitastor indirectly - it reads and writes data through the block
device interface, emulated by QEMU.
- No, it definitely isn't designed specially for use with Vitastor - Windows was created long
ago before Vitastor and doesn't know anything about it.
- So, Windows **definitely IS NOT** a Proxy Program and VNPL doesn't require to open it.
- Cloud control panel which makes requests to Vitastor Kubernetes CSI driver.
- Yes, it interacts with Vitastor indirectly through the CSI driver, which is a Proxy Program.
- May or may not be designed specially for use with Vitastor. How to determine exactly?
Imagine that Vitastor is replaced with any other storage (for example, with a proprietary one).
Do control panel functions change in any way? If they do (for example, if snapshots stop working),
then the panel contains specific functionality and thus is designed specially for use with Vitastor.
Otherwise, the panel is universal and isn't designed specially for Vitastor.
- So, whether you are required to open-source the panel also **depends** on whether it
contains specific functionality or not.

## Why?

Because I believe in the spirit of copyleft (Linux wouldn't become so popular without GPL!)
and, at the same time, I want to have a way to monetize the product.

Existing licenses including AGPL are useless for it with an SDS - SDS is a very deeply
internal software which is almost definitely invisible to the user and thus AGPL doesn't
require anyone to open the code even if they make a proprietary fork.

And, in fact, the current situation in the world where GPL is thought to only restrict direct
linking of programs into a single executable file, isn't much correct. Nowadays, programs
are more often linked with network API calls, not with /usr/bin/ld, and a software product
may consist of dozens of microservices interacting with each other over the network.

That's why we need VNPL to keep the license sufficiently copyleft.

## License Texts

- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](../../VNPL-1.1.txt).
GPL 2.0 is also included in this repository as [GPL-2.0.txt](../../GPL-2.0.txt).
@@ -12,14 +12,6 @@

License: VNPL 1.1 for the server-side code and dual VNPL 1.1 + GPL 2.0+ for the client code.

Server-side components are distributed only under the terms of VNPL.

Client libraries are distributed under a dual license: VNPL 1.0
and also GNU GPL 2.0 or any later version. This is done for
compatibility with software such as QEMU and fio.

## VNPL

VNPL is "network copyleft", our own free copyleft license -
the Vitastor Network Public License 1.1, based on GNU GPL 3.0 with the additional
"Network Interaction" clause which requires distributing all programs,
@@ -37,70 +29,9 @@ Vitastor Network Public License 1.1, based on GNU GPL 3.0 with the additional
No restrictions are imposed on Windows or any other software that was not
developed *specifically* for use together with Vitastor.

## Explanation
Client libraries are distributed under a dual license: VNPL 1.0
and also GNU GPL 2.0 or any later version. This is done for
compatibility with software such as QEMU and fio.

Network copyleft is governed by the license clause **13. Remote Network Interaction**.

A program is considered a "proxy program" if both conditions hold:
- It was created specifically to work together with Vitastor. In essence this means that the program
must have Vitastor-specific functionality, i.e. it must "know" that it interacts
specifically with Vitastor.
- It interacts with Vitastor directly or indirectly through absolutely any programming
interface, including any method of invocation: API, CLI, network or some wrapper (which is,
in turn, also a proxy program).

If, in addition to that:
- You give any user an opportunity to interact with Vitastor over the network,
again, through any interface or any chain of "wrappers" (proxy programs)

Then, according to VNPL, you must open the code of the "proxy programs" **to those users** under
the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2 - where "GPL compatibility"
is understood as the ability to include the licensed code in a GPL application.

Accordingly, if you have a "proxy program" but its code is not open to the user
who directly or indirectly interacts with Vitastor, you are forbidden to use Vitastor
under the terms of VNPL and you need a commercial license which contains no code-opening requirements.

## Examples

- The Vitastor Kubernetes CSI driver, which creates PersistentVolumes by calling `vitastor-cli create`.
- Yes, it interacts with Vitastor through vitastor-cli.
- Yes, it was created specifically to work with Vitastor (what other purpose would it have).
- So the CSI driver **definitely counts** as a "proxy program" and must be published under a free
license.
- Windows, installed in a virtual machine on a Vitastor disk.
- Yes, it interacts with Vitastor "directly or indirectly" - it writes and reads data through the block
device interface emulated by QEMU.
- No, it was definitely not created *specifically to work with Vitastor* - when it was created,
Vitastor did not exist at all.
- So Windows **definitely does not count** as a "proxy program" and the VNPL requirements do not apply to it.
- A cloud control panel which makes requests to the Vitastor Kubernetes CSI driver.
- Yes, it interacts with Vitastor indirectly through the CSI driver, which is a "proxy program".
- It is not immediately clear whether it was created specifically to work with Vitastor. How to tell?
Imagine that Vitastor is replaced with any other storage system (for example, a proprietary one).
Does the control panel's behaviour change? If yes (for example, snapshots stop working), then
the panel contains specific functionality and "was created specifically to work with Vitastor".
If not, the panel contains no specific functionality and is essentially universal.
- Whether the panel has to be opened therefore **depends** on whether it contains specific functionality or not.

## Why so?

Because I simultaneously believe in the spirit of copyleft licenses (Linux wouldn't have become
so popular without GPL!) and want to have a way to monetize the product.

At the same time, even AGPL is pointless for a software-defined storage system: it is deeply internal
software which the user will almost certainly never see at all, so nobody would ever have
to open the code, even when creating a derived product.

And in general, the situation that has developed in the world, where the effect of GPL is limited
to direct linking into a single executable file, is not quite correct. Nowadays programs are
integrated much more often through network calls than with /usr/bin/ld, and a combined software
product may consist of several dozen microservices interacting over the network.

That's why VNPL was devised - to keep the license sufficiently "copyleft".

## License Texts

- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)
You can find the full text of VNPL 1.1 in English in the file [VNPL-1.1.txt](../../VNPL-1.1.txt),
VNPL 1.1 in Russian in the file [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt), and GPL 2.0 in the file [GPL-2.0.txt](../../GPL-2.0.txt).
@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "2.2.2",
"version": "2.2.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

@@ -1,6 +1,6 @@
{
"name": "vitastor",
"version": "2.2.2",
"version": "2.2.0",
"description": "Low-level native bindings to Vitastor client library",
"main": "index.js",
"keywords": [

@@ -410,8 +410,8 @@ sub volume_size_info
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
my $info = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls', $prefix.$name ]))->[0];
# (size, format, used, parent, ctime)
return wantarray ? ($info->{size}, $info->{format}, $info->{size}, $info->{parent}, 0) : $info->{size};
#return wantarray ? ($size, $format, $used, $parent, $st->ctime) : $size;
return $info->{size};
}

sub volume_resize

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils

VITASTOR_VERSION = '2.2.2'
VITASTOR_VERSION = '2.2.0'

LOG = logging.getLogger(__name__)

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.2
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el7.tar.gz
Source0: vitastor-2.2.0.el7.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.2
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el8.tar.gz
Source0: vitastor-2.2.0.el8.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.2
Version: 2.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage

License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el9.tar.gz
Source0: vitastor-2.2.0.el9.tar.gz

BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

@@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()

add_definitions(-DVITASTOR_VERSION="2.2.2")
add_definitions(-DVITASTOR_VERSION="2.2.0")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

@@ -31,6 +31,7 @@
#define DEFAULT_DATA_BLOCK_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
#define MAX_META_BLOCK_SIZE 64*1024
#define DEFAULT_BITMAP_GRANULARITY 4096

#define BS_OP_MIN 1

@@ -127,9 +127,9 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
else if (meta_block_size > MAX_META_BLOCK_SIZE)
{
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_META_BLOCK_SIZE));
}
if (data_offset % disk_alignment)
{
@@ -427,13 +427,6 @@ stop_flusher:
printf("Flushing %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
flusher->active_flushers++;
// Find it in clean_db
{
auto & clean_db = bs->clean_db_shard(cur.oid);
auto clean_it = clean_db.find(cur.oid);
old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
}
// Scan dirty versions of the object to determine what we need to read
scan_dirty();
// Writes and deletes shouldn't happen at the same time

@@ -538,7 +531,7 @@ resume_2:
{
// zero out old metadata entry
{
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos);
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %ju (%jx:%jx v%ju) as old location of %jx:%jx\n",

@@ -547,7 +540,7 @@ resume_2:
exit(1);
}
}
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
memset((uint8_t*)meta_old.buf + meta_old.pos, 0, bs->dsk.clean_entry_size);
resume_20:
if (meta_old.sector != meta_new.sector && !write_meta_block(meta_old, 20))
return false;

@@ -608,7 +601,7 @@ resume_2:

void journal_flusher_co::update_metadata_entry()
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf(

@@ -623,7 +616,7 @@ void journal_flusher_co::update_metadata_entry()
if (has_delete)
{
// Zero out the new metadata entry
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
memset((uint8_t*)meta_new.buf + meta_new.pos, 0, bs->dsk.clean_entry_size);
}
else
{

@@ -805,7 +798,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
}
}
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
if (new_entry->oid != cur.oid)
{
printf(

@@ -912,6 +905,12 @@ void journal_flusher_co::calc_block_checksums(uint32_t *new_data_csums, bool ski

void journal_flusher_co::scan_dirty()
{
// Find it in clean_db
auto & clean_db = bs->clean_db_shard(cur.oid);
auto clean_it = clean_db.find(cur.oid);
old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
auto old_clean_bitmap = (clean_it != clean_db.end() ? bs->get_clean_entry_bitmap(clean_it, 0) : NULL);
dirty_it = dirty_start = dirty_end;
v.clear();
copy_count = 0;

@@ -1037,13 +1036,12 @@ void journal_flusher_co::scan_dirty()
read_to_fill_incomplete = 0;
return;
}
uint8_t *bmp_ptr = bs->get_clean_entry_bitmap(old_clean_loc, 0);
uint64_t fulfilled = 0;
int last = v.size()-1;
while (last >= 0 && (v[last].copy_flags & COPY_BUF_CSUM_FILL))
last--;
read_to_fill_incomplete = bs->fill_partial_checksum_blocks(
v, fulfilled, bmp_ptr, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
v, fulfilled, old_clean_bitmap, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
((v[last].offset+v[last].len-1) / bs->dsk.csum_block_size + 1) * bs->dsk.csum_block_size
);
}

@@ -1139,7 +1137,7 @@ bool journal_flusher_co::modify_meta_do_reads(int wait_base)
resume_0:
if (!modify_meta_read(clean_loc, meta_new, wait_base+0))
return false;
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry);
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos + sizeof(clean_disk_entry);
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
{
resume_1:

@@ -1193,7 +1191,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
// so I'll avoid it as long as I can.
wr.submitted = false;
wr.sector = ((meta_loc >> bs->dsk.block_order) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size));
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.clean_entry_size;
if (bs->inmemory_meta)
{
wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector;
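As a side note (not part of the diff), the repeated `meta_old.pos*bs->dsk.clean_entry_size` to `meta_old.pos` replacements above reflect a change of meaning: `pos` used to be an entry index and is now a byte offset, pre-multiplied once in `modify_meta_read()`. A minimal sketch of the two conventions, assuming illustrative 4096-byte metadata blocks, 32-byte clean entries and block_order 17 (the real values come from `dsk`):

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Illustrative values only; the blockstore reads these from its disk config (dsk.*).
    const uint64_t meta_block_size = 4096, clean_entry_size = 32, block_order = 17;
    uint64_t meta_loc = 300ull << block_order;                          // location of data block #300
    uint64_t entries_per_block = meta_block_size / clean_entry_size;    // 128
    uint64_t old_pos = (meta_loc >> block_order) % entries_per_block;   // old meaning: entry index
    uint64_t new_pos = old_pos * clean_entry_size;                      // new meaning: byte offset
    printf("entry index = %llu, byte offset = %llu\n",
        (unsigned long long)old_pos, (unsigned long long)new_pos);      // 44 and 1408
    return 0;
}
```

Either convention addresses the same entry; the new one simply saves every caller a multiplication.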
@@ -42,6 +42,8 @@ blockstore_impl_t::~blockstore_impl_t()
free(metadata_buffer);
if (clean_bitmaps)
free(clean_bitmaps);
if (heap_meta.blocks)
delete[] heap_meta.blocks;
}

bool blockstore_impl_t::is_started()

@@ -431,13 +433,29 @@ blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
uint64_t pg_num = 0;
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it != clean_db_settings.end())
auto sett_it = clean_db_settings.find(pool_id);
if (sett_it != clean_db_settings.end())
{
// like map_to_pg()
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
pg_num = (oid.stripe / sett_it->second.pg_stripe_size) % sett_it->second.pg_count + 1;
}
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
auto shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
auto sh_it = clean_db_shards.find(shard_id);
if (sh_it == clean_db_shards.end())
{
// clean_db_t stores larger entries with heap_meta, but we disguise it as smaller clean_entry :)
// patched cpp-btree with extra_data
clean_db_shards[shard_id] = blockstore_clean_db_t(
sizeof(clean_entry_heap_t) - sizeof(clean_entry)
+ (inmemory_meta ? dsk.clean_dyn_size : 2*dsk.clean_entry_bitmap_size)
);
return clean_db_shards[shard_id];
}
return sh_it->second;
}
return clean_db_shards[shard_id];
}

void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
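To make the `clean_db_shard()` change above easier to follow, here is a standalone sketch of the shard-id arithmetic (not from the diff). It assumes POOL_ID_BITS is 16, as in Vitastor inode numbering, and uses made-up pool/PG parameters:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    const int POOL_ID_BITS = 16;                                  // assumption: 16-bit pool id in the inode number
    uint64_t inode = ((uint64_t)5 << (64-POOL_ID_BITS)) | 123;    // pool 5, inode 123 inside the pool
    uint64_t stripe = 3*1048576ull;                               // object offset within the inode
    uint64_t pg_stripe_size = 1048576, pg_count = 256;            // illustrative pool settings
    uint64_t pool_id = inode >> (64-POOL_ID_BITS);
    uint64_t pg_num = (stripe / pg_stripe_size) % pg_count + 1;   // "like map_to_pg()"
    uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;  // key into clean_db_shards
    printf("pool=%llu pg=%llu shard_id=%016llx\n", (unsigned long long)pool_id,
        (unsigned long long)pg_num, (unsigned long long)shard_id);
    return 0;
}
```

The new heap branch only changes how a missing shard is constructed (with extra per-entry space for `clean_entry_heap_t` and its bitmaps), not this key computation.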
@@ -96,6 +96,9 @@
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
#define BLOCKSTORE_META_FORMAT_HEAP 3
#define BLOCKSTORE_META_HEADER_V1_SIZE 36
#define BLOCKSTORE_META_HEADER_V2_SIZE 48

// metadata header (superblock)
struct __attribute__((__packed__)) blockstore_meta_header_v1_t

@@ -119,6 +122,7 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
uint32_t block_id_bits; // 32 by default in heap meta
};

// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)

@@ -140,6 +144,62 @@ struct __attribute__((__packed__)) clean_entry
uint64_t location;
};

typedef uint32_t heap_block_num_t;

// 50 = 16 (key=object_id) + 26 (value) + 8 (bitmap) + N (checksum) bytes per "clean" entry in memory
struct __attribute__((__packed__)) clean_entry_heap_t
{
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
// previous versions invalidated by this version
heap_block_num_t prev_versions;
// metadata block number
heap_block_num_t meta_block;
// offset within block
uint16_t block_offset;
uint8_t bitmap[];
};

struct __attribute__((__packed__)) heap_meta_block_header_t
{
uint64_t magic;
uint64_t seq_num;
uint32_t invalidates_blocks;
};

// 48+checksums = (40+bitmap)+checksums bytes per on-disk "heap" entry
// for 128 KB block without checksums, it's 48 bytes - 84 entries per 4 kb metadata block
// for 128 KB block with 4k checksums, it's 176 bytes - 22 entries per 4 kb metadata block
// for 1 MB block without checksums, it's 80 bytes - 50 entries per 4 kb metadata block
// for 1 MB block with 4k checksums, it's 1104 bytes O_o - only 3 entries per 4 kb metadata block
// for 1 MB block with 32k checksums, it's 176 bytes again
struct __attribute__((__packed__)) heap_meta_entry_t
{
object_id oid;
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
uint64_t reserved;
uint8_t bitmap[];
};

struct heap_meta_block_t
{
heap_block_num_t offset = 0;
uint64_t seq_num = 0;
uint32_t used_space = 0;
std::vector<uint64_t> invalidates_blocks;
};

struct heap_meta_t
{
heap_block_num_t block_count = 0;
heap_meta_block_t *blocks = NULL;
// used space => block number
std::multimap<uint32_t, heap_block_num_t> used_space_map;
heap_block_num_t cur_written_block = 0;
uint8_t *written_block_buf = NULL;
};

// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
struct __attribute__((__packed__)) dirty_entry
{

@@ -272,6 +332,8 @@ class blockstore_impl_t

struct ring_consumer_t ring_consumer;

heap_meta_t heap_meta;

std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
std::map<uint64_t, int> no_inode_stats;

@@ -317,7 +379,7 @@ class blockstore_impl_t
void open_data();
void open_meta();
void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
uint8_t* get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset);

blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);

@@ -345,9 +407,9 @@ class blockstore_impl_t
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data,
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
bool fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
bool fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,

@@ -356,7 +418,7 @@ class blockstore_impl_t
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
uint8_t* read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
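A quick back-of-the-envelope check of the size comments in `heap_meta_entry_t` above (not part of the diff). It assumes a 4096-byte metadata block, the 20-byte `heap_meta_block_header_t`, a 4-byte trailing block CRC, 4096-byte bitmap granularity and no data checksums; with those numbers one on-disk entry is 48 bytes and 84 entries fit per block, matching the comment:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Illustrative assumptions; the real sizes come from dsk.* and the structs above.
    const uint64_t meta_block_size = 4096, header_size = 20, block_crc_size = 4;
    const uint64_t data_block_size = 128*1024, bitmap_granularity = 4096;
    const uint64_t csum_block_size = 0;                                     // 0 = data checksums disabled
    uint64_t bitmap_bytes = (data_block_size/bitmap_granularity + 7) / 8;   // 4 bytes
    uint64_t dyn_size = 2*bitmap_bytes                                      // block bitmap + external attribute bitmap
        + (csum_block_size ? (data_block_size/csum_block_size)*4 : 0);      // + one crc32c per checksum block
    uint64_t entry_size = 40 + dyn_size;                                    // 40 = oid + version + location + reserved
    uint64_t per_block = (meta_block_size - header_size - block_crc_size) / entry_size;
    printf("entry = %llu bytes, %llu entries per metadata block\n",
        (unsigned long long)entry_size, (unsigned long long)per_block);     // 48 bytes, 84 entries
    return 0;
}
```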
@@ -54,6 +54,7 @@ int blockstore_init_meta::loop()
else if (wait_state == 4) goto resume_4;
else if (wait_state == 5) goto resume_5;
else if (wait_state == 6) goto resume_6;
else if (wait_state == 7) goto resume_7;
printf("Reading blockstore metadata\n");
if (bs->inmemory_meta)
metadata_buffer = bs->metadata_buffer;

@@ -78,6 +79,7 @@ resume_1:
if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
{
{
memset(metadata_buffer, 0, bs->dsk.meta_block_size);
blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
hdr->zero = 0;
hdr->magic = BLOCKSTORE_META_MAGIC_V1;

@@ -85,12 +87,19 @@ resume_1:
hdr->meta_block_size = bs->dsk.meta_block_size;
hdr->data_block_size = bs->dsk.data_block_size;
hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_HEAP)
{
hdr->block_id_bits = sizeof(heap_block_num_t);
}
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
{
hdr->data_csum_type = bs->dsk.data_csum_type;
hdr->csum_block_size = bs->dsk.csum_block_size;
hdr->header_csum = 0;
hdr->header_csum = crc32c(0, hdr, sizeof(*hdr));
hdr->header_csum = crc32c(0, hdr,
bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_V2
? BLOCKSTORE_META_HEADER_V2_SIZE
: sizeof(*hdr));
}
}
if (bs->readonly)

@@ -128,7 +137,7 @@ resume_1:
);
exit(1);
}
if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
{
uint32_t csum = hdr->header_csum;
hdr->header_csum = 0;

@@ -138,6 +147,23 @@ resume_1:
exit(1);
}
hdr->header_csum = csum;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_HEAP;
if (hdr->block_id_bits != sizeof(heap_block_num_t))
{
printf("Heap metadata block ID size (%u) is not supported by this build\n", hdr->block_id_bits);
exit(1);
}
}
else if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
{
uint32_t csum = hdr->header_csum;
hdr->header_csum = 0;
if (crc32c(0, hdr, BLOCKSTORE_META_HEADER_V2_SIZE) != csum)
{
printf("Metadata header is corrupt (checksum mismatch).\n");
exit(1);
}
hdr->header_csum = csum;
if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V2)
{
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;

@@ -160,11 +186,11 @@ resume_1:
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
}
}
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
else
{
printf(
"Metadata format is too new for me (stored version is %ju, max supported %u).\n",
hdr->version, BLOCKSTORE_META_FORMAT_V2
hdr->version, BLOCKSTORE_META_FORMAT_HEAP
);
exit(1);
}

@@ -189,7 +215,12 @@ resume_1:
// Skip superblock
md_offset = bs->dsk.meta_block_size;
next_offset = md_offset;
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size; // FIXME only array
if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
bs->heap_meta.blocks = new heap_meta_block_t[bs->dsk.meta_len / bs->dsk.meta_block_size];
bs->heap_meta.block_count = bs->dsk.meta_len / bs->dsk.meta_block_size;
}
// Read the rest of the metadata
resume_2:
if (next_offset < bs->dsk.meta_len && submitted == 0)

@@ -233,9 +264,10 @@ resume_2:
bool changed = false;
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
{
// handle <count> entries
if (handle_meta_block(bufs[i].buf + sector, entries_per_block,
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block))
auto this_changed = bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP
? handle_heap_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset)
: handle_array_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset);
if (this_changed)
changed = true;
}
if (changed && !bs->inmemory_meta && !bs->readonly)

@@ -262,6 +294,41 @@ resume_2:
wait_state = 2;
return 1;
}
if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
// build used_space index
for (heap_block_num_t i = 0; i < bs->heap_meta.block_count; i++)
{
bs->heap_meta.used_space_map.emplace(std::pair<uint32_t, heap_block_num_t>(bs->heap_meta.blocks[i].used_space, i));
}
}
if (heap_invalidated_block_seq.size() && !bs->readonly)
{
// zero out invalidated blocks not zeroed during the previous OSD execution
for (auto inv_seq: heap_invalidated_block_seq)
{
auto num_it = heap_block_by_seq.find(inv_seq);
if (num_it != heap_block_by_seq.end())
heap_invalidated_block_nums.push_back(num_it->second);
}
memset(metadata_buffer, 0, bs->dsk.meta_block_size);
for (i = 0; i < heap_invalidated_block_nums.size(); i++)
{
GET_SQE();
last_read_offset = heap_invalidated_block_nums[i]*bs->dsk.meta_block_size;
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + last_read_offset);
bs->ringloop->submit();
submitted++;
resume_7:
if (submitted > 0)
{
wait_state = 7;
return 1;
}
}
}
if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
{
std::sort(entries_to_zero.begin(), entries_to_zero.end());

@@ -329,8 +396,9 @@ resume_6:
return 0;
}

bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
bool blockstore_init_meta::handle_array_meta_block(uint8_t *buf, uint64_t block_offset)
{
uint64_t done_cnt = (block_offset / bs->dsk.meta_block_size) * entries_per_block;
bool updated = false;
uint64_t max_i = entries_per_block;
if (max_i > bs->dsk.block_count-done_cnt)

@@ -429,6 +497,132 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
return updated;
}

static int bitmap_count_ones(uint8_t *bitmap, int size)
{
int n = 0, i = 0;
for (; i <= size-sizeof(unsigned); i += sizeof(unsigned))
{
n += __builtin_popcount(*(unsigned*)(bitmap+i));
}
for (; i < size; i++)
{
n += __builtin_popcount(*(unsigned char*)(bitmap+i));
}
return n;
}

// v3 / heap / "cow" metadata block
bool blockstore_init_meta::handle_heap_meta_block(uint8_t *buf, uint64_t block_offset)
{
if ((block_offset / bs->dsk.meta_block_size) > (heap_block_num_t)-1)
{
fprintf(stderr, "Metadata area too large\n");
exit(1);
}
// Validate block CRC
uint32_t block_crc = *(uint32_t*)(buf + bs->dsk.meta_block_size - 4);
if (crc32c(0, buf, bs->dsk.meta_block_size-4) != block_crc)
{
return false;
}
// Validate header
heap_meta_block_header_t *hdr = (heap_meta_block_header_t*)buf;
if (hdr->magic != BLOCKSTORE_META_MAGIC_V1)
{
return false;
}
if (hdr->invalidates_blocks > (bs->dsk.meta_block_size-4-sizeof(heap_meta_block_header_t))/sizeof(uint64_t))
{
fprintf(stderr, "Metadata block at %jx contains too large invalidates_blocks count: %x\n", block_offset, hdr->invalidates_blocks);
exit(1);
}
if (heap_invalidated_block_seq.find(hdr->seq_num) != heap_invalidated_block_seq.end())
{
// Check if the block is invalidated and handled after the block that invalidates it
return false;
}
uint64_t hdr_size = sizeof(heap_meta_block_header_t) + hdr->invalidates_blocks*8;
heap_meta_block_t & blk = bs->heap_meta.blocks[block_offset/bs->dsk.meta_block_size];
blk.offset = block_offset;
blk.seq_num = hdr->seq_num;
blk.used_space = hdr_size + 4;
uint64_t *hdr_inv = (uint64_t*)(hdr + 1);
for (int i = 0; i < hdr->invalidates_blocks; i++)
{
blk.invalidates_blocks.push_back(hdr_inv[i]);
heap_invalidated_block_seq.insert(hdr_inv[i]);
}
heap_block_by_seq[hdr->seq_num] = block_offset;
// Process sub-blocks
uint64_t heap_entry_size = sizeof(heap_meta_entry_t) + bs->dsk.clean_dyn_size;
for (uint64_t pos = sizeof(heap_meta_block_header_t); pos < bs->dsk.meta_block_size-4; pos += heap_entry_size)
{
heap_meta_entry_t *diskentry = (heap_meta_entry_t*)(buf + pos);
if (!diskentry->oid.inode || !diskentry->version)
{
continue;
}
auto & clean_db = bs->clean_db_shard(diskentry->oid);
auto mementry = (clean_entry_heap_t*)(&clean_db[diskentry->oid]);
bool exists = mementry->version != 0;
if (exists && mementry->version >= diskentry->version)
{
if (mementry->version == diskentry->version)
{
// Voluntarily allow duplicates of in-memory entries with different
// bitmaps to support checksum updates with hole-punching
int old_count = bitmap_count_ones(mementry->bitmap, bs->dsk.clean_entry_bitmap_size);
int new_count = bitmap_count_ones(diskentry->bitmap, bs->dsk.clean_entry_bitmap_size);
if (old_count < new_count)
{
continue;
}
}
else
{
continue;
}
}
blk.used_space += heap_entry_size;
if (exists && mementry->location != UINT64_MAX)
{
// free the previous block
uint64_t old_clean_loc = mementry->location >> bs->dsk.block_order;
#ifdef BLOCKSTORE_DEBUG
printf("Free block %ju from %jx:%jx v%ju\n", 1+old_clean_loc,
diskentry->oid.inode, diskentry->oid.stripe, mementry->version);
#endif
bs->data_alloc->set(old_clean_loc, false);
bs->inode_space_stats[diskentry->oid.inode] -= bs->dsk.data_block_size;
bs->used_blocks--;
bs->heap_meta.blocks[mementry->meta_block].used_space -= heap_entry_size;
}
if (diskentry->location != UINT64_MAX)
{
bs->data_alloc->set(diskentry->location >> bs->dsk.block_order, true);
bs->inode_space_stats[diskentry->oid.inode] += bs->dsk.data_block_size;
bs->used_blocks++;
#ifdef BLOCKSTORE_DEBUG
printf("Allocate block (heap entry) %ju: %jx:%jx v%ju\n", 1 + (diskentry->location >> bs->dsk.block_order),
diskentry->oid.inode, diskentry->oid.stripe, diskentry->version);
#endif
}
mementry->version = diskentry->version;
mementry->location = diskentry->location;
mementry->meta_block = block_offset / bs->dsk.meta_block_size;
mementry->block_offset = block_offset % bs->dsk.meta_block_size;
if (exists)
{
mementry->prev_versions++;
}
// Extra data: 2 bitmaps + checksums or just 2 bitmaps if inmemory_meta is disabled
memcpy(&mementry->bitmap, &diskentry->bitmap, bs->inmemory_meta ? bs->dsk.clean_dyn_size : 2*bs->dsk.clean_entry_bitmap_size);
entries_loaded++;
}
// We have to zero out headers of invalidated blocks, but we'll do it later
return false;
}

blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
{
this->bs = bs;
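One non-obvious part of `handle_heap_meta_block()` above is how duplicate entries for the same object are resolved during the scan: a higher version always replaces a lower one, and for equal versions the on-disk copy is applied only if its bitmap has no more bits set than the in-memory one (which is what checksum updates with hole-punching leave behind). A small standalone restatement of that rule, not part of the diff:

```cpp
#include <cstdint>
#include <cstdio>

// Returns true if the entry read from disk should replace the in-memory one,
// mirroring the version/popcount checks in handle_heap_meta_block().
static bool disk_entry_applied(bool exists, uint64_t mem_ver, int mem_bits,
    uint64_t disk_ver, int disk_bits)
{
    if (!exists || disk_ver > mem_ver)
        return true;                 // first sighting or strictly newer version
    if (disk_ver < mem_ver)
        return false;                // older version is ignored
    return disk_bits <= mem_bits;    // same version: the sparser (hole-punched) copy wins
}

int main()
{
    printf("%d\n", disk_entry_applied(false, 0, 0, 3, 8)); // 1: object not seen yet
    printf("%d\n", disk_entry_applied(true, 5, 8, 6, 8));  // 1: newer version
    printf("%d\n", disk_entry_applied(true, 5, 8, 5, 4));  // 1: same version, fewer bits set
    printf("%d\n", disk_entry_applied(true, 5, 4, 5, 8));  // 0: same version, more bits set
    return 0;
}
```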
@@ -28,7 +28,13 @@ class blockstore_init_meta
unsigned entries_per_block = 0;
int i = 0, j = 0;
std::vector<uint64_t> entries_to_zero;
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);

std::map<uint64_t, heap_block_num_t> heap_block_by_seq;
std::set<uint64_t> heap_invalidated_block_seq;
std::vector<heap_block_num_t> heap_invalidated_block_nums;

bool handle_array_meta_block(uint8_t *buf, uint64_t block_offset);
bool handle_heap_meta_block(uint8_t *buf, uint64_t block_offset);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);

@@ -111,6 +111,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
metadata_buf_size = 4*1024*1024;
}
if (metadata_buf_size % dsk.meta_block_size)
{
metadata_buf_size = ((metadata_buf_size+dsk.meta_block_size-1) / dsk.meta_block_size) * dsk.meta_block_size;
}
if (dsk.meta_device == dsk.data_device)
{
disable_meta_fsync = disable_data_fsync;
@@ -148,10 +148,14 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
return r;
}

uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset)
{
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
return ((uint8_t*)&clean_it->second) + sizeof(clean_entry_heap_t) + offset;
}
uint8_t *clean_entry_bitmap;
uint64_t meta_loc = block_loc >> dsk.block_order;
uint64_t meta_loc = clean_it->second.location >> dsk.block_order;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;

@@ -159,7 +163,9 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
{
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
}
return clean_entry_bitmap;
}

@@ -433,7 +439,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!IS_JOURNAL(dirty.state))
{
// Read from data disk, possibly checking checksums
if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dyn_data,
if (!fulfill_clean_read_journal(read_op, fulfilled, bmp_ptr, dyn_data,
dirty.offset, dirty.offset+dirty.len, dirty.location, dirty_it->first.version))
{
goto undo_read;

@@ -464,14 +470,13 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
result_version = clean_it->second.version;
if (read_op->bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
if (fulfilled < read_op->len)
{
if (!fulfill_clean_read(read_op, fulfilled, NULL, NULL, 0, dsk.data_block_size,
clean_it->second.location, clean_it->second.version))
if (!fulfill_clean_read_meta(read_op, fulfilled, clean_it))
{
goto undo_read;
}

@@ -581,40 +586,22 @@ int blockstore_impl_t::pad_journal_read(std::vector<copy_buffer_t> & rv, copy_bu
return 0;
}

bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
bool blockstore_impl_t::fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver)
{
bool from_journal = clean_entry_bitmap != NULL;
if (!clean_entry_bitmap)
{
// NULL clean_entry_bitmap means we're reading from data, not from the journal,
// and the bitmap location is obvious
clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
}
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, true,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && !from_journal && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
{
return false;
}
}
PRIV(read_op)->clean_block_used = req > 0;
}
else if (from_journal)
else
{
// Don't scan bitmap - journal writes don't have holes (internal bitmap)!
uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + dsk.clean_entry_bitmap_size +

@@ -635,6 +622,43 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
assert(fulfill_read(read_op, fulfilled, item_end, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
}
}
// Increment reference counter if clean data is being read from the disk
if (PRIV(read_op)->clean_block_used)
{
auto & uo = used_clean_objects[clean_loc];
uo.refs++;
if (dsk.csum_block_size && flusher->is_mutated(clean_loc))
uo.was_changed = true;
PRIV(read_op)->clean_block_used = clean_loc;
}
return true;
}

bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it)
{
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it, 0);
uint64_t clean_loc = clean_it->second.location;
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, NULL, false,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_it, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
return false;
}
PRIV(read_op)->clean_block_used = req > 0;
}
else
{
bool csum_done = !dsk.csum_block_size || inmemory_meta;

@@ -662,13 +686,13 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
if (!csum_done)
{
// Read checksums from disk
csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
csum_buf = read_clean_meta_block(read_op, clean_it, PRIV(read_op)->read_vec.size());
csum_done = true;
}
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, NULL))
{
return false;
}

@@ -688,11 +712,22 @@ bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t &
return true;
}

uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos)
{
uint64_t sector, pos;
auto & rv = PRIV(op)->read_vec;
auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
auto clean_heap_entry = (clean_entry_heap_t*)(&clean_it->second);
sector = clean_heap_entry->meta_block * dsk.meta_block_size;
pos = clean_heap_entry->block_offset;
}
else
{
auto clean_loc = clean_it->second.location;
sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
}
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
.copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,

@@ -807,11 +842,6 @@ bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint6
if (from_journal)
return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
if (!dyn_data)
{
assert(inmemory_meta);
dyn_data = get_clean_entry_bitmap(clean_loc, 0);
}
return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
}

@@ -869,8 +899,18 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order);
if (!uo.was_changed)
{
bool from_journal = (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG);
auto csum_buf = rv[i].csum_buf;
if (!from_journal && !csum_buf)
{
assert(inmemory_meta);
auto & clean_db = clean_db_shard(op->oid);
auto clean_it = clean_db.find(op->oid);
assert(clean_it != clean_db.end());
csum_buf = get_clean_entry_bitmap(clean_it, 0);
}
verify_clean_padded_checksums(
op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
op, rv[i].disk_offset, csum_buf, from_journal, iov, n_iov,
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
{
ok = false;

@@ -1019,7 +1059,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
*result_version = clean_it->second.version;
if (bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
return 0;
@@ -57,7 +57,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        version = clean_it->second.version + 1;
        if (!is_del)
        {
            void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
            void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
            memcpy(dyn_ptr, bmp_ptr, dsk.clean_entry_bitmap_size);
        }
    }
@@ -341,7 +341,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        cancel_all_writes(op, dirty_it, -ENOSPC);
        return 2;
    }
    if (inmemory_meta)
    if (inmemory_meta && dsk.meta_format != BLOCKSTORE_META_FORMAT_HEAP)
    {
        // Check once more that metadata entry is zeroed (the reverse means a bug or corruption)
        uint64_t sector = (loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;

@@ -188,7 +188,7 @@ void osd_messenger_t::init()
        auto cl = cl_it->second;
        cl_it++;
        auto peer_fd = cl->peer_fd;
        if (!cl->osd_num && !cl->in_osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
        if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
        {
            // Do not run keepalive on regular clients
            continue;
@@ -199,7 +199,7 @@ void osd_messenger_t::init()
            if (!cl->ping_time_remaining)
            {
                // Ping timed out, stop the client
                fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->in_osd_num ? cl->in_osd_num : cl->osd_num, cl->peer_fd);
                fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
                stop_client(peer_fd, true);
                // Restart iterator because it may be invalidated
                cl_it = clients.upper_bound(peer_fd);
@@ -230,7 +230,7 @@ void osd_messenger_t::init()
                return;
            }
            int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
            auto fail_osd_num = cl->in_osd_num ? cl->in_osd_num : cl->osd_num;
            auto fail_osd_num = cl->osd_num;
            cl->ping_time_remaining = 0;
            delete op;
            if (fail_fd >= 0)

@@ -60,7 +60,6 @@ struct osd_client_t
    int ping_time_remaining = 0;
    int idle_time_remaining = 0;
    osd_num_t osd_num = 0;
    osd_num_t in_osd_num = 0;
    bool is_incoming = false;

    void *in_buf = NULL;
@@ -99,7 +98,6 @@ struct osd_client_t
    std::vector<osd_op_t*> zc_free_list;

    ~osd_client_t();
    void cancel_ops();
};

struct osd_wanted_peer_t
@@ -237,7 +235,6 @@ public:
    void outbox_push(osd_op_t *cur_op);
    std::function<void(osd_op_t*)> exec_op;
    std::function<void(osd_num_t)> repeer_pgs;
    std::function<void(osd_num_t)> break_pg_locks;
    std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
    void read_requests();
    void send_replies();

@@ -173,7 +173,6 @@ struct osd_op_t
    osd_op_buf_list_t iov;

    ~osd_op_t();
    void cancel();

    bool is_recovery_related();
};

@@ -510,12 +510,13 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
    rc->qp = conn->cmid->qp;
    // And an osd_client_t
    auto cl = new osd_client_t();
    cl->is_incoming = true;
    cl->peer_addr = conn->parsed_addr;
    cl->peer_port = conn->rdmacm_port;
    cl->peer_fd = conn->peer_fd;
    cl->peer_state = PEER_RDMA;
    cl->connect_timeout_id = -1;
    cl->in_osd_num = peer_osd;
    cl->osd_num = peer_osd;
    cl->in_buf = malloc_or_die(receive_buffer_size);
    cl->rdma_conn = rc;
    clients[conn->peer_fd] = cl;

@@ -8,12 +8,11 @@ void osd_messenger_t::read_requests()
    for (int i = 0; i < read_ready_clients.size(); i++)
    {
        int peer_fd = read_ready_clients[i];
        auto cl_it = clients.find(peer_fd);
        if (cl_it == clients.end() || !cl_it->second || cl_it->second->read_msg.msg_iovlen)
        osd_client_t *cl = clients[peer_fd];
        if (cl->read_msg.msg_iovlen)
        {
            continue;
        }
        auto cl = cl_it->second;
        if (cl->read_remaining < receive_buffer_size)
        {
            cl->read_iov.iov_base = cl->in_buf;
@@ -34,12 +33,8 @@ void osd_messenger_t::read_requests()
        auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
        io_uring_sqe sqe_local;
        ring_data_t data_local;
        sqe_local.user_data = (uint64_t)&data_local;
        io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
        if (iothread)
        {
            sqe_local = { .user_data = (uint64_t)&data_local };
            data_local = {};
        }
        if (!sqe)
        {
            cl->read_msg.msg_iovlen = 0;
@@ -61,8 +56,7 @@ void osd_messenger_t::read_requests()
            {
                result = -errno;
            }
            // like set_immediate
            tfd->set_timer_us(0, false, [this, result, cl](int){ handle_read(result, cl); });
            handle_read(result, cl);
        }
    }
    read_ready_clients.clear();
@@ -234,7 +228,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
    {
        if (cl->read_op->req.hdr.id != cl->read_op_id)
        {
            fprintf(stderr, "Warning: operation sequencing is broken on client %d: expected num %ju, got %ju, stopping client\n", cl->peer_fd, cl->read_op_id, cl->read_op->req.hdr.id);
            fprintf(stderr, "Warning: operation sequencing is broken on client %d, stopping client\n", cl->peer_fd);
            stop_client(cl->peer_fd);
            return false;
        }

@@ -194,14 +194,12 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
    auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
    io_uring_sqe sqe_local;
    ring_data_t data_local;
    sqe_local.user_data = (uint64_t)&data_local;
    io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
    if (iothread)
    {
        sqe_local = { .user_data = (uint64_t)&data_local };
        data_local = {};
    }
    if (!sqe)
    {
        return false;
    }
    cl->write_msg.msg_iov = cl->send_list.data();
    cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
    cl->refs++;
@@ -239,8 +237,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
        {
            result = -errno;
        }
        // like set_immediate
        tfd->set_timer_us(0, false, [this, result, cl](int){ handle_send(result, false, false, cl); });
        handle_send(result, false, false, cl);
    }
    return true;
}

@@ -9,37 +9,38 @@
#include "msgr_rdma.h"
#endif

void osd_client_t::cancel_ops()
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
    std::vector<osd_op_t*> cancel_ops;
    cancel_ops.resize(sent_ops.size());
    cancel_ops.resize(cl->sent_ops.size());
    int i = 0;
    for (auto p: sent_ops)
    for (auto p: cl->sent_ops)
    {
        cancel_ops[i++] = p.second;
    }
    sent_ops.clear();
    cl->sent_ops.clear();
    cl->outbox.clear();
    for (auto op: cancel_ops)
    {
        op->cancel();
        cancel_op(op);
    }
}

void osd_op_t::cancel()
void osd_messenger_t::cancel_op(osd_op_t *op)
{
    if (op_type == OSD_OP_OUT && callback)
    if (op->op_type == OSD_OP_OUT)
    {
        reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
        reply.hdr.id = req.hdr.id;
        reply.hdr.opcode = req.hdr.opcode;
        reply.hdr.retval = -EPIPE;
        // Copy lambda to be unaffected by `delete this`
        (std::function<void(osd_op_t*)>(callback))(this);
        op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
        op->reply.hdr.id = op->req.hdr.id;
        op->reply.hdr.opcode = op->req.hdr.opcode;
        op->reply.hdr.retval = -EPIPE;
        // Copy lambda to be unaffected by `delete op`
        std::function<void(osd_op_t*)>(op->callback)(op);
    }
    else
    {
        // This function is only called in stop_client(), so it's fine to destroy the operation
        delete this;
        delete op;
    }
}

@@ -62,10 +63,6 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
    {
        fprintf(stderr, "[OSD %ju] Stopping client %d (OSD peer %ju)\n", osd_num, peer_fd, cl->osd_num);
    }
    else if (cl->in_osd_num)
    {
        fprintf(stderr, "[OSD %ju] Stopping client %d (incoming OSD peer %ju)\n", osd_num, peer_fd, cl->in_osd_num);
    }
    else
    {
        fprintf(stderr, "[OSD %ju] Stopping client %d (regular client)\n", osd_num, peer_fd);
@@ -76,12 +73,8 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
    cl->peer_state = PEER_STOPPED;
    if (cl->osd_num)
    {
        auto osd_it = osd_peer_fds.find(cl->osd_num);
        if (osd_it != osd_peer_fds.end() && osd_it->second == cl->peer_fd)
        {
            // ...and forget OSD peer
            osd_peer_fds.erase(osd_it);
        }
        // ...and forget OSD peer
        osd_peer_fds.erase(cl->osd_num);
    }
#ifndef __MOCK__
    // Then remove FD from the eventloop so we don't accidentally read something
@@ -108,17 +101,30 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
        }
    }
#endif
    if (cl->in_osd_num && break_pg_locks)
    {
        // Break PG locks
        break_pg_locks(cl->in_osd_num);
    }
    if (cl->osd_num)
    {
        // Then repeer PGs because cancel_op() callbacks can try to perform
        // some actions and we need correct PG states to not do something silly
        repeer_pgs(cl->osd_num);
    }
    // Then cancel all operations
    if (cl->read_op)
    {
        if (!cl->read_op->callback)
        {
            delete cl->read_op;
        }
        else
        {
            cancel_op(cl->read_op);
        }
        cl->read_op = NULL;
    }
    if (cl->osd_num)
    {
        // Cancel outbound operations
        cancel_osd_ops(cl);
    }
    // Find the item again because it can be invalidated at this point
    it = clients.find(peer_fd);
    if (it != clients.end())
@@ -143,17 +149,6 @@ osd_client_t::~osd_client_t()
        close(peer_fd);
        peer_fd = -1;
    }
    // Then cancel all operations
    // Operations have to be canceled only after clearing all references to osd_client_t
    // because otherwise their buffers may be still present in io_uring asynchronous requests
    if (read_op)
    {
        // read_op may be an incoming op or a continued response for an outbound op
        read_op->cancel();
        read_op = NULL;
    }
    // Cancel outbound ops
    cancel_ops();
#ifndef __MOCK__
#ifdef WITH_RDMA
    if (rdma_conn)

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: Vitastor
Description: Vitastor client library
Version: 2.2.2
Version: 2.2.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

@@ -70,7 +70,7 @@ struct rm_osd_t
        {
            if (parent->cli->st_cli.peer_states.find(osd_id) != parent->cli->st_cli.peer_states.end())
            {
                is_warning = !allow_up;
                is_warning = true;
                still_up.push_back(osd_id);
            }
        }

@@ -85,7 +85,6 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
    msgr.ringloop = this->ringloop;
    msgr.exec_op = [this](osd_op_t *op) { exec_op(op); };
    msgr.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
    msgr.break_pg_locks = [this](osd_num_t peer_osd) { break_pg_locks(peer_osd); };
    msgr.check_config_hook = [this](osd_client_t *cl, json11::Json conf) { return check_peer_config(cl, conf); };
    msgr.init();

@@ -278,8 +278,6 @@ class osd_t
    void handle_peers();
    bool check_peer_config(osd_client_t *cl, json11::Json conf);
    void repeer_pgs(osd_num_t osd_num);
    void repeer_pg(pg_t & pg);
    void break_pg_locks(osd_num_t osd_num);
    void start_pg_peering(pg_t & pg);
    void drop_dirty_pg_connections(pool_pg_num_t pg);
    void record_pg_lock(pg_t & pg, osd_num_t peer_osd, uint64_t pg_state);

@@ -432,16 +432,9 @@ void osd_t::apply_pg_locks_localize_only()
        }
        auto & pool_cfg = pool_it->second;
        auto & pg = pp.second;
        auto old_disable_pg_locks = pg.disable_pg_locks;
        pg.disable_pg_locks = pg_locks_localize_only &&
            (pool_cfg.scheme != POOL_SCHEME_REPLICATED ||
            pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY);
        if (!pg.disable_pg_locks && old_disable_pg_locks)
        {
            // Relock PG
            printf("[PG %u/%u] Repeer to enable PG locks\n", pg.pool_id, pg.pg_num);
            repeer_pg(pg);
        }
            pool_cfg.scheme == POOL_SCHEME_REPLICATED &&
            pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY;
    }
}

@@ -884,8 +877,8 @@ void osd_t::apply_pg_config()
            pg.next_scrub = pg_cfg.next_scrub;
            pg.target_set = pg_cfg.target_set;
            pg.disable_pg_locks = pg_locks_localize_only &&
                (pool_item.second.scheme != POOL_SCHEME_REPLICATED ||
                pool_item.second.local_reads == POOL_LOCAL_READ_PRIMARY);
                pool_item.second.scheme == POOL_SCHEME_REPLICATED &&
                pool_item.second.local_reads == POOL_LOCAL_READ_PRIMARY;
            if (pg.scheme == POOL_SCHEME_EC)
            {
                use_ec(pg.pg_size, pg.pg_data_size, true);
@@ -1051,15 +1044,8 @@ void osd_t::report_pg_states()
    etcd_reporting_pg_state = true;
    st_cli.etcd_txn(json11::Json::object {
        { "compare", checks }, { "success", success }, { "failure", failure }
    }, st_cli.etcd_quick_timeout, 0, 0, [this, reporting_pgs, success_count = success.size(), failure_count = failure.size()](std::string err, json11::Json data)
    }, st_cli.etcd_quick_timeout, 0, 0, [this, reporting_pgs](std::string err, json11::Json data)
    {
        int expected_count = (data["succeeded"].bool_value() ? success_count : failure_count);
        if (expected_count != data["responses"].array_items().size())
        {
            printf("Unexpected response from etcd - 'responses' count (%u) isn't equal to expected (%u), stopping\n",
                data["responses"].array_items().size(), expected_count);
            force_stop(1);
        }
        etcd_reporting_pg_state = false;
        if (!data["succeeded"].bool_value())
        {

@@ -73,25 +73,18 @@ void osd_t::handle_peers()
    }
}

void osd_t::break_pg_locks(osd_num_t peer_osd)
{
    for (auto lock_it = pg_locks.begin(); lock_it != pg_locks.end(); )
    {
        if (lock_it->second.primary_osd == peer_osd)
        {
            if (log_level > 3)
            {
                printf("Break PG %u/%u lock on disconnection of OSD %ju\n", lock_it->first.pool_id, lock_it->first.pg_num, peer_osd);
            }
            pg_locks.erase(lock_it++);
        }
        else
            lock_it++;
    }
}

void osd_t::repeer_pgs(osd_num_t peer_osd)
{
    if (msgr.osd_peer_fds.find(peer_osd) == msgr.osd_peer_fds.end())
    {
        for (auto lock_it = pg_locks.begin(); lock_it != pg_locks.end(); )
        {
            if (lock_it->second.primary_osd == peer_osd)
                pg_locks.erase(lock_it++);
            else
                lock_it++;
        }
    }
    // Re-peer affected PGs
    for (auto & p: pgs)
    {
@@ -111,26 +104,21 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
            {
                // Repeer this pg
                printf("[PG %u/%u] Repeer because of OSD %ju\n", pg.pool_id, pg.pg_num, peer_osd);
                repeer_pg(pg);
                if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
                {
                    start_pg_peering(pg);
                }
                else
                {
                    // Stop accepting new operations, wait for current ones to finish or fail
                    pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
                    report_pg_state(pg);
                }
            }
        }
    }
}

void osd_t::repeer_pg(pg_t & pg)
{
    if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
    {
        start_pg_peering(pg);
    }
    else
    {
        // Stop accepting new operations, wait for current ones to finish or fail
        pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
        report_pg_state(pg);
    }
}

// Reset PG state (when peering or stopping)
void osd_t::reset_pg(pg_t & pg)
{
@@ -478,7 +466,6 @@ void osd_t::relock_pg(pg_t & pg)
    auto pg_it = pgs.find(pg_id);
    if (pg_it == pgs.end())
    {
        printf("Warning: PG %u/%u is gone during lock attempt\n", pg_id.pool_id, pg_id.pg_num);
        return;
    }
    auto & pg = pg_it->second;

@@ -417,17 +417,15 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
    if (retval != expected)
    {
        int64_t peer_osd = (msgr.clients.find(subop->peer_fd) != msgr.clients.end()
            ? msgr.clients[subop->peer_fd]->osd_num : 0);
            ? msgr.clients[subop->peer_fd]->osd_num : -subop->peer_fd);
        if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
        {
            printf(
                subop->peer_fd >= 0
                ? (peer_osd > 0
                    ? "%1$s subop to %2$jx:%3$jx v%4$ju failed on osd %7$ju: retval = %5$d (expected %6$d)\n"
                    : "%1$s subop to %2$jx:%3$jx v%4$ju failed on peer %8$d: retval = %5$d (expected %6$d)\n")
                ? "%1$s subop to %2$jx:%3$jx v%4$ju failed on osd %7$jd: retval = %5$d (expected %6$d)\n"
                : "%1$s subop to %2$jx:%3$jx v%4$ju failed locally: retval = %5$d (expected %6$d)\n",
                osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
                retval, expected, peer_osd, subop->peer_fd
                retval, expected, peer_osd
            );
        }
        else if (opcode == OSD_OP_SEC_DELETE)

@@ -91,17 +91,16 @@ bool osd_t::sec_check_pg_lock(osd_num_t primary_osd, const object_id &oid)
    {
        return false;
    }
    auto & pool_cfg = pool_cfg_it->second;
    if (pg_locks_localize_only && (pool_cfg.scheme != POOL_SCHEME_REPLICATED || pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY))
    {
        return true;
    }
    auto ppg = (pool_pg_num_t){ .pool_id = pool_id, .pg_num = map_to_pg(oid, pool_cfg_it->second.pg_stripe_size) };
    auto pg_it = pgs.find(ppg);
    if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE)
    {
        return false;
    }
    if (pg_it->second.disable_pg_locks)
    {
        return true;
    }
    auto lock_it = pg_locks.find(ppg);
    return lock_it != pg_locks.end() && lock_it->second.primary_osd == primary_osd;
}
@@ -141,7 +140,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
    {
        if (!(cur_op->req.sec_rw.flags & OSD_OP_IGNORE_PG_LOCK) &&
            !sec_check_pg_lock(cl->in_osd_num, cur_op->req.sec_rw.oid))
            !sec_check_pg_lock(cl->osd_num, cur_op->req.sec_rw.oid))
        {
            cur_op->bs_op->retval = -EPIPE;
            secondary_op_callback(cur_op);
@@ -170,7 +169,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
    else if (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE)
    {
        if (!(cur_op->req.sec_del.flags & OSD_OP_IGNORE_PG_LOCK) &&
            !sec_check_pg_lock(cl->in_osd_num, cur_op->req.sec_del.oid))
            !sec_check_pg_lock(cl->osd_num, cur_op->req.sec_del.oid))
        {
            cur_op->bs_op->retval = -EPIPE;
            secondary_op_callback(cur_op);
@@ -194,7 +193,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
    {
        for (int i = 0; i < cur_op->bs_op->len; i++)
        {
            if (!sec_check_pg_lock(cl->in_osd_num, ((obj_ver_id*)cur_op->buf)[i].oid))
            if (!sec_check_pg_lock(cl->osd_num, ((obj_ver_id*)cur_op->buf)[i].oid))
            {
                cur_op->bs_op->retval = -EPIPE;
                secondary_op_callback(cur_op);
@@ -248,7 +247,7 @@ void osd_t::exec_sec_read_bmp(osd_op_t *cur_op)
    void *cur_buf = reply_buf;
    for (int i = 0; i < n; i++)
    {
        if (!sec_check_pg_lock(cl->in_osd_num, ov[i].oid) &&
        if (!sec_check_pg_lock(cl->osd_num, ov[i].oid) &&
            !(cur_op->req.sec_read_bmp.flags & OSD_OP_IGNORE_PG_LOCK))
        {
            free(reply_buf);
@@ -270,7 +269,7 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
{
    cur_op->reply.sec_lock.cur_primary = 0;
    auto cl = msgr.clients.at(cur_op->peer_fd);
    if (!cl->in_osd_num ||
    if (!cl->osd_num ||
        cur_op->req.sec_lock.flags != OSD_SEC_LOCK_PG &&
        cur_op->req.sec_lock.flags != OSD_SEC_UNLOCK_PG ||
        cur_op->req.sec_lock.pool_id > ((uint64_t)1<<POOL_ID_BITS) ||
@@ -291,7 +290,7 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
    auto lock_it = pg_locks.find(ppg);
    if (cur_op->req.sec_lock.flags == OSD_SEC_LOCK_PG)
    {
        if (lock_it != pg_locks.end() && lock_it->second.primary_osd != cl->in_osd_num)
        if (lock_it != pg_locks.end() && lock_it->second.primary_osd != cl->osd_num)
        {
            cur_op->reply.sec_lock.cur_primary = lock_it->second.primary_osd;
            finish_op(cur_op, -EBUSY);
@@ -304,21 +303,13 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
            finish_op(cur_op, -EBUSY);
            return;
        }
        if (log_level > 3)
        {
            printf("Lock PG %u/%u for OSD %ju\n", ppg.pool_id, ppg.pg_num, cl->in_osd_num);
        }
        pg_locks[ppg] = (osd_pg_lock_t){
            .primary_osd = cl->in_osd_num,
            .primary_osd = cl->osd_num,
            .state = cur_op->req.sec_lock.pg_state,
        };
    }
    else if (lock_it != pg_locks.end() && lock_it->second.primary_osd == cl->in_osd_num)
    else if (lock_it != pg_locks.end() && lock_it->second.primary_osd == cl->osd_num)
    {
        if (log_level > 3)
        {
            printf("Unlock PG %u/%u by OSD %ju\n", ppg.pool_id, ppg.pg_num, cl->in_osd_num);
        }
        pg_locks.erase(lock_it);
    }
    finish_op(cur_op, 0);
@@ -332,7 +323,7 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
        : json11::Json();
    auto peer_osd_num = req_json["osd_num"].uint64_value();
    auto cl = msgr.clients.at(cur_op->peer_fd);
    cl->in_osd_num = peer_osd_num;
    cl->osd_num = peer_osd_num;
    if (req_json["features"]["check_sequencing"].bool_value())
    {
        cl->check_sequencing = true;

@@ -121,7 +121,6 @@ void pretend_connected(cluster_client_t *cli, osd_num_t osd_num)
    cli->msgr.osd_peer_fds[osd_num] = peer_fd;
    cli->msgr.clients[peer_fd] = new osd_client_t();
    cli->msgr.clients[peer_fd]->osd_num = osd_num;
    cli->msgr.clients[peer_fd]->peer_fd = peer_fd;
    cli->msgr.clients[peer_fd]->peer_state = PEER_CONNECTED;
    cli->msgr.wanted_peers.erase(osd_num);
    cli->msgr.repeer_pgs(osd_num);

@@ -125,8 +125,6 @@ void ring_loop_t::loop()
        if (cqe->flags & IORING_CQE_F_MORE)
        {
            // There will be a second notification
            if (mt)
                mu.unlock();
            d->res = cqe->res;
            d->more = true;
            if (d->callback)

@@ -59,7 +59,6 @@ SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh

./test_write.sh
SCHEME=xor ./test_write.sh
TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' ./test_write.sh

./test_write_no_same.sh