Compare commits

...

140 Commits

Author SHA1 Message Date
14a5bcf493 Use 32-bit big write location (OK for up to 512 TB OSDs)
All checks were successful
2025-09-17 02:03:48 +03:00
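A quick sanity check of the "512 TB" figure above, assuming the big-write location is stored in units of the data block size and the default 128 KiB block size is used: a 32-bit location addresses 2^32 blocks × 128 KiB per block = 512 TiB of data per OSD.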
6aaded0c8f Do not store offset & len in big_writes 2025-09-17 02:03:40 +03:00
3cd869df75 Fix object crc32c calculation
Some checks failed
Failing: test_interrupted_rebalance_ec, test_write_iothreads, test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec, test_scrub_ec, test_heal_csum_4k_dmj; all other tests passed.
2025-09-14 21:56:30 +03:00
ae725de14e Add entry_type to heap_object_t too
This is required to:
1) later inline the last "big_write" entry into the object to slightly reduce memory usage
2) eliminate an ugly hack where entry type is determined by its size
3) make the storage scheme extensible i.e. when adding new entry types
2025-09-14 14:11:34 +03:00
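A minimal sketch of how an entry_type byte can drive this scheme, based on the BS_HEAP_* constants visible in the new blockstore_heap.h further down this page (the helper functions themselves are hypothetical):

// Sketch: the low bits of entry_type select the entry kind, BS_HEAP_STABLE is an extra flag bit.
#include <stdint.h>

#define BS_HEAP_TYPE         7   // mask for the entry kind
#define BS_HEAP_OBJECT       1
#define BS_HEAP_SMALL_WRITE  2
#define BS_HEAP_BIG_WRITE    3
#define BS_HEAP_TOMBSTONE    4
#define BS_HEAP_INTENT_WRITE 5
#define BS_HEAP_STABLE       8   // flag bit, not a kind

static inline uint8_t bs_heap_entry_type(uint8_t entry_type) { return entry_type & BS_HEAP_TYPE; }
static inline bool bs_heap_is_stable(uint8_t entry_type) { return (entry_type & BS_HEAP_STABLE) != 0; }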
8dd5534527 Rename flags to entry_type 2025-09-14 13:40:54 +03:00
3501ef87e4 Use robin_hood::unordered_flat_map - it has 1 byte overhead instead of 8 byte 2025-09-14 13:40:54 +03:00
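For context, robin_hood::unordered_flat_map is a header-only drop-in replacement for std::unordered_map (a copy is vendored as src/util/robin_hood.h in this changeset); the overhead claim is the commit author's. A usage sketch with an illustrative key/value choice:

#include <stdint.h>
#include "robin_hood.h"  // vendored header

// Per-inode space statistics kept in a flat hash map (illustrative name).
robin_hood::unordered_flat_map<uint64_t, uint64_t> inode_space_stats;

void add_inode_space(uint64_t inode, uint64_t bytes)
{
    inode_space_stats[inode] += bytes;
}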
99d4efbdc0 Revert try_get 2025-09-14 01:45:17 +03:00
f623f7a9a1 WIP dump/load heap
All checks were successful
2025-09-11 02:07:51 +03:00
7be9ed93d5 Add missing list_stable_limit support 2025-09-06 11:10:32 +03:00
b019988e2b Fix checksum validation in !inmemory_journal mode 2025-09-06 11:10:32 +03:00
0db5400cb7 Fix checksum padding during read 2025-09-06 11:10:32 +03:00
5a13db107b Remove unused bs->buffer_area + buffer calc 2025-09-06 11:10:32 +03:00
1288cfb0af Add missing memset zero_object to 0 2025-09-06 11:10:32 +03:00
2eba37db8a Mark in-memory data reads as SKIP_CSUM 2025-09-06 11:10:32 +03:00
f9e0b0db27 Add missing calc_crc32c after updating block checksums 2025-09-06 11:10:32 +03:00
f6acd5e79c Fix checksum padding during flush 2025-09-06 11:10:32 +03:00
6fbeb5c668 Fix free_read_buffers 2025-09-06 11:10:32 +03:00
8e55869b71 Fix assert(region_marker) at the end of the block in find_block_run() 2025-09-06 11:10:32 +03:00
7bb9004435 Fix assert(is_buffer_area_free) with size=0 2025-09-06 11:10:32 +03:00
d9d484e8bb Add a copy of wyhash 2025-09-06 11:10:32 +03:00
0c0ab64155 Fix test build with isa-l 2025-09-06 11:10:32 +03:00
84fca8abca Remove assert !region_marker & FREE 2025-09-06 11:10:32 +03:00
eaf0fe66a1 Use wyhash 2025-09-06 11:10:32 +03:00
5ca25c0e7d Change emhashes 2025-09-06 11:10:32 +03:00
1b7f2eac8e unordered_map mvcc 2025-09-06 11:10:32 +03:00
ddd16e8613 Fix bad resharding due to the lack of iteration order in a hashmap 2025-09-06 11:10:32 +03:00
046a9f7a67 Use emhash::try_get 2025-09-06 11:10:32 +03:00
bd55b24827 Use emhash hashmap (2x faster) 2025-09-06 11:10:32 +03:00
09d69f7968 Use unordered_maps for object-block index 2025-09-06 11:10:32 +03:00
cc4d170ef0 B-tree is slower... 2025-09-06 11:10:32 +03:00
335b73a3d5 Unordered_map for pool settings 2025-09-06 11:10:32 +03:00
fc6c5a853e Fix read fio bench 2025-09-06 11:10:32 +03:00
445393dfc4 Fix vitastor-disk prepare and param validation 2025-09-06 11:10:32 +03:00
021762193b Fix skipping of corrupted objects, fix use_buffer_area with zero size 2025-09-06 11:10:32 +03:00
a6dee28f4e Support heap format in simple-offsets 2025-09-06 11:10:32 +03:00
6e0ad777e3 Return all unstable versions in listing 2025-09-06 11:10:32 +03:00
13d069cf5f Actually fsync after stabilize 2025-09-06 11:10:32 +03:00
504246b1db Move test & build_test to top-level cmakelists 2025-09-06 11:10:32 +03:00
fd647021eb Add missing request_trim 2025-09-06 11:10:32 +03:00
049a7b260e Use multilist_index_t instead of multiple bitmap allocators 2025-09-06 11:10:32 +03:00
17d0da1a74 Implement another multilist-style allocator for metadata blocks 2025-09-06 11:10:32 +03:00
fe277127bb Fix a bug with unstable_big over unstable_small 2025-09-06 11:10:32 +03:00
79f3147d0c Limit the number of unstable versions per object 2025-09-06 11:10:32 +03:00
5126e67c3f Integrate moving objects 2025-09-06 11:10:32 +03:00
90b1bdee43 Support moving objects between blocks 2025-09-06 11:10:32 +03:00
5d501d0d43 Fix zero-length writes 2025-09-06 11:10:32 +03:00
157a62628a Fix op_stable slowdowns 2025-09-06 11:10:32 +03:00
d4da42bb05 Fix a bs_read bug 2025-09-06 11:10:32 +03:00
2883df733a Fix test dependencies 2025-09-06 11:10:32 +03:00
77e3870f8f Add read_blocks() API 2025-09-06 11:10:32 +03:00
b82be8136a Fix block checksum calculation in write_journal for the old blockstore version 2025-09-06 11:10:32 +03:00
6c4f407575 Remove block_order parameter 2025-09-06 11:10:32 +03:00
c0489d237b Extract (flags & BS_HEAP_TYPE) into a function 2025-09-06 11:10:32 +03:00
7c250f165c Make dump-journal --format data default 2025-09-06 11:10:32 +03:00
b46dbbbefb Add fio options 2025-09-06 11:10:32 +03:00
1810fbe622 Fix loading for out-of-order lsns 2025-09-06 11:10:32 +03:00
4d6c9bc294 Add include for older gcc 2025-09-06 11:10:32 +03:00
e7df9683f1 Fix vitastor-disk build (with old metadata and journal formats) 2025-09-06 11:10:32 +03:00
a5dd943fcc Collapse intent_writes on other write types too 2025-09-06 11:10:32 +03:00
c9b527f2e2 Fix buffer overflow in test_heap 2025-09-06 11:10:32 +03:00
1551a49454 Add a test for parallel reads with block checksums 2025-09-06 11:10:32 +03:00
d46feccd03 Remove cancel_all_writes
Not needed because a) parallel writes to the same object are forbidden
b) subsequent writes don't depend on previous ones anyway.
2025-09-06 11:10:32 +03:00
959e2e2df9 Add a test for "perfect_csum_update" mode 2025-09-06 11:10:32 +03:00
b7bc3d652d Add 2 tests for intent writes 2025-09-06 11:10:32 +03:00
fc2762d60a Add a test with fsync 2025-09-06 11:10:32 +03:00
85b3c691e9 Implement buffered disk_mock_t mode, extract ringloop_mock.cpp 2025-09-06 11:10:32 +03:00
0ac4645a9e unaligned_intent does not need special handling anymore 2025-09-06 11:10:32 +03:00
87922bc660 Add an option for global coverage 2025-09-06 11:10:32 +03:00
77b97b0613 Fix some blockstore bugs discovered by the mocked test! 2025-09-06 11:10:32 +03:00
0713120315 Add a basic mocked blockstore test 2025-09-06 11:10:32 +03:00
bce082a444 Add mocks for blockstore integration tests: timerfd, ring_loop_mock_t and disk_mock_t 2025-09-06 11:10:32 +03:00
383305da88 Use only space_left, not sqes_left 2025-09-06 11:10:32 +03:00
639c809827 Test calc_checksums 2025-09-06 11:10:31 +03:00
71d78c1409 Check 2/3 blocks in test_recheck 2025-09-06 11:10:31 +03:00
3e2e2f9846 Fix intent writes with padded checksums 2025-09-06 11:10:31 +03:00
cb48c70083 Fix reads from intent writes 2025-09-06 11:10:31 +03:00
989571bb74 Disable punching block checksums and allow to enable it with a parameter 2025-09-06 11:10:31 +03:00
0e1d069ad7 Process big_writes as intents to avoid fsyncs 2025-09-06 11:10:31 +03:00
a36b4e5933 Fix collapsing intent-over-intent checksums 2025-09-06 11:10:31 +03:00
c809d86846 Allow 1 intent_write over big_write in fsync mode 2025-09-06 11:10:31 +03:00
0bd5eb1f20 Block lists by previous writes 2025-09-06 11:10:31 +03:00
2c8ddc5431 Allow multiple writes with the same version 2025-09-06 11:10:31 +03:00
e1f3829bb1 Add tests for intent writes 2025-09-06 11:10:31 +03:00
a980d65f78 Call finish_load after async recheck 2025-09-06 11:10:31 +03:00
bcc93e548e More tests for incorrect data cases 2025-09-06 11:10:31 +03:00
68e9f71723 Do not block writes on previous writes 2025-09-06 11:10:31 +03:00
1ae4b9a799 Add atomic_write_size parameter 2025-09-06 11:10:31 +03:00
79d4b57f0e Use ui32 for block sizes 2025-09-06 11:10:31 +03:00
cc73e42488 Remove extra unneeded read_entry-s 2025-09-06 11:10:31 +03:00
6cb0fdb571 Do not recheck data location on intent-write 2025-09-06 11:10:31 +03:00
8e1ea15f58 Remove FIXMEs 2025-09-06 11:10:31 +03:00
d7f1b3a2dd Fsync after stabilizing 2025-09-06 11:10:31 +03:00
0049a6ed4a Fsync & update metadata when block checksums are enabled 2025-09-06 11:10:31 +03:00
05ed9a27a4 Return new_lsn from erase and rollback 2025-09-06 11:10:31 +03:00
300a149513 Allow to cancel compaction for unfinished writes 2025-09-06 11:10:31 +03:00
0223016ce6 Fix space allocation & compaction on start 2025-09-06 11:10:31 +03:00
9368cc7d9b Use inflight_lsn iterators 2025-09-06 11:10:31 +03:00
74743ccd3f Free space on overwrites correctly 2025-09-06 11:10:31 +03:00
ec1c7e6be4 Use fsynced_lsn in flusher 2025-09-06 11:10:31 +03:00
bc643b24cf Correctly track fsynced_lsn when fsyncs are enabled 2025-09-06 11:10:31 +03:00
2a66cc3f11 Extract multilist_alloc_t 2025-09-06 11:10:31 +03:00
bd74ce4b70 Check for overlaps during blockstore loading 2025-09-06 11:10:31 +03:00
7c07303d12 Fix multilist_alloc_t bug, move verify and print to lib 2025-09-06 11:10:31 +03:00
ac6bacc46e Fix pending_ops 2025-09-06 11:10:31 +03:00
f7fbfb8174 Fix repeating cur_oid 2025-09-06 11:10:31 +03:00
ebf85a7515 Mark overwritten heap_writes as immediately compacted 2025-09-06 11:10:31 +03:00
36413d89c3 Move "ack write" debug message 2025-09-06 11:10:31 +03:00
fb9505d5db Fsync data on trim_lsn, not when writing compacted data 2025-09-06 11:10:31 +03:00
77e1badfad Batch big_write data fsyncs 2025-09-06 11:10:31 +03:00
b83359fdbc Use the same "inflight" queue to track compaction 2025-09-06 11:10:31 +03:00
2f4f46d7eb Use new LSNs on stabilize 2025-09-06 11:10:31 +03:00
65a4aecb8c Assign new LSN on stabilize 2025-09-06 11:10:31 +03:00
d08a2fb8ee WIP Only save MVCC copy when overwriting an object 2025-09-06 11:10:31 +03:00
5739d52600 Do not use wr_offset 2025-09-06 11:10:31 +03:00
d44dba43b3 Prevent compaction of incomplete object writes 2025-09-06 11:10:31 +03:00
8984556689 Experimental INTENT_WRITE write mode with WA=2 instead of 3 2025-09-06 11:10:31 +03:00
457b47d311 Add test_compact_block 2025-09-06 11:10:31 +03:00
af62f6374b Remove alloc_buffer_area 2025-09-06 11:10:31 +03:00
1c7f3710be Two more unordered_maps 2025-09-06 11:10:31 +03:00
a67a09da7c Remove sync_to_repeat map and use simpler repeating 2025-09-06 11:10:31 +03:00
4ed29c19c4 Use a sequence of bitmap_allocs for metadata instead of std::sets... 2025-09-06 11:10:31 +03:00
dd3f64cf62 It seems tcmalloc is actually slower, disable it 2025-09-06 11:10:31 +03:00
e1f4fcb76a Use single add_used_space call instead of unmark+mark allocated_block 2025-09-06 11:10:31 +03:00
2d24b4d70d Remove compact_queue_lsn map 2025-09-06 11:10:31 +03:00
d9e2705db7 Implement a really crazy "multi-linked-list" allocator for buffered data 2025-09-06 11:10:31 +03:00
5935762730 Use linked list heap in blockstore code
WIP, still slower than the old version :-E
2025-09-06 11:10:31 +03:00
a945a46d56 Use linked lists in heap to avoid excessive memory copying 2025-09-06 11:10:31 +03:00
82ac6416d3 Integrate "heap" metadata storage into blockstore 2025-09-06 11:10:31 +03:00
df4661230e "Heap" metadata storage scheme 2025-09-06 11:10:31 +03:00
72a29b7031 Remove no_inode_stats from blockstore interface 2025-09-06 11:10:31 +03:00
2d42f29385 Add vitastor-disk prepare --dry-run option
All checks were successful
2025-09-05 23:34:11 +00:00
17240c6144 Add librdmacm-dev to build-deps 2025-09-06 02:09:44 +03:00
9e627a4414 Log has_invalid objects
All checks were successful
2025-09-05 02:14:03 +03:00
90b1019636 Do not warn on incomplete+has_invalid PG states as unexpected
Some checks failed (all checks were cancelled)
2025-09-05 02:10:57 +03:00
df604afbd5 Fix OSD reweight values between 0 and 1 not working
Some checks failed (all checks were cancelled)
2025-09-05 02:05:01 +03:00
47c7aa62de Remove unused SUBMIT_SCRUB_READ
All checks were successful
2025-08-30 16:30:52 +03:00
9f2dc48d0f Fix OSD assertion failed: n_subops != sent when all object copies are corrupted 2025-08-30 16:30:47 +03:00
6d951b21fb Install ibverbs-providers in vitastor Docker builds 2025-08-30 02:28:26 +03:00
552f28cb3e Fix #86 - base64_decode on arm64 O_o
All checks were successful
2025-08-30 02:27:05 +03:00
e87b6e26f7 Fix Proxmox 9.0 support - oops :) 2025-08-26 20:24:12 +03:00
80 changed files with 12058 additions and 6839 deletions

View File

@@ -810,6 +810,24 @@ jobs:
echo ""
done
test_reweight_half:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_reweight_half.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_32k_dmj:
runs-on: ubuntu-latest
needs: build

View File

@@ -4,4 +4,17 @@ project(vitastor)
set(VITASTOR_VERSION "2.3.0")
include(CTest)
add_custom_target(build_tests)
add_custom_target(test
COMMAND
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
)
# make -j16 -C ../../build test_heap && ../../build/src/test/test_heap
# make -j16 -C ../../build test_heap && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R heap --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
# make -j16 -C ../../build test_blockstore && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R blockstore --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
# kcov --include-path=../../../src ../../kcov ./test_blockstore
add_dependencies(test build_tests)
add_subdirectory(src)
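Given the target defined above, a typical out-of-tree invocation would presumably be (paths are illustrative):

cmake -B build -S . && make -j16 -C build build_tests
make -C build test   # writes the leak:tcmalloc LSAN suppression file and runs ctest with it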

View File

@@ -36,7 +36,7 @@ RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/
((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
apt-get update && \
apt-get install -y vitastor-client && \
apt-get install -y vitastor-client ibverbs-providers && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-utils_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-block-extra_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
dpkg -x qemu-utils*.deb tmp1 && \

debian/control vendored
View File

@@ -4,7 +4,7 @@ Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, g++ (>= 8), libstdc++6 (>= 8),
linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev,
libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
libibverbs-dev, librdmacm-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
node-bindings <!nocheck>, node-gyp, node-nan
Standards-Version: 4.5.0
Homepage: https://vitastor.io/

View File

@@ -3,7 +3,7 @@
FROM debian:bookworm
ADD etc/apt /etc/apt/
RUN apt-get update && apt-get -y install vitastor udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
RUN apt-get update && apt-get -y install vitastor ibverbs-providers udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
ADD sleep.sh /usr/bin/
ADD install.sh /usr/bin/
ADD scripts /opt/scripts/

View File

@@ -491,7 +491,7 @@ Can be used to slow down scrubbing if it affects user load too much.
## scrub_list_limit
- Type: integer
- Default: 1000
- Default: 262144
- Can be changed online: yes
Number of objects to list in one listing operation during scrub.

View File

@@ -514,7 +514,7 @@ fsync небезопасным даже с режимом "directsync".
## scrub_list_limit
- Тип: целое число
- Значение по умолчанию: 1000
- Значение по умолчанию: 262144
- Можно менять на лету: да
Размер загружаемых за одну операцию списков объектов в процессе фоновой

View File

@@ -566,7 +566,7 @@
сильно влияет на пользовательскую нагрузку.
- name: scrub_list_limit
type: int
default: 1000
default: 262144
online: true
info: |
Number of objects to list in one listing operation during scrub.
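One way to pin the parameter explicitly, assuming the conventional /etc/vitastor/vitastor.conf JSON config (it can also be changed online, as noted above):

{
    "scrub_list_limit": 262144
}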

View File

@@ -73,6 +73,8 @@ Options (automatic mode):
--max_other 10%
Use disks for OSD data even if they already have non-Vitastor partitions,
but only if these take up no more than this percent of disk space.
--dry-run
Check and print new OSD count for each disk but do not actually create them.
```
Options (single-device mode):

View File

@@ -74,6 +74,8 @@ vitastor-disk - инструмент командной строки для уп
--max_other 10%
Использовать диски под данные OSD, даже если на них уже есть не-Vitastor-овые
разделы, но только в случае, если они занимают не более данного процента диска.
--dry-run
Проверить и вывести число новых OSD для каждого диска, но не создавать их.
```
Опции для режима одного OSD:
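A usage sketch for the new flag (device paths are illustrative):

vitastor-disk prepare --dry-run /dev/nvme0n1 /dev/nvme1n1
# checks the disks and prints the number of OSDs that would be created, without creating them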

View File

@@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
const stat = state.osd.stats[osd_num];
const osd_cfg = state.config.osd[osd_num];
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
if (isNaN(reweight) || reweight < 0 || reweight > 0)
if (isNaN(reweight) || reweight < 0 || reweight > 1)
reweight = 1;
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
osd_cfg && osd_cfg.noout))
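With the comparison fixed, fractional reweights take effect; since the value is read from the per-OSD config (state.config.osd[osd_num].reweight), a half-weight OSD could be set roughly like this (etcd prefix and OSD number are illustrative):

etcdctl put /vitastor/config/osd/3 '{"reweight":0.5}'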

View File

@@ -499,4 +499,55 @@ sub rename_volume
return "${storeid}:${base_name}${target_volname}";
}
sub _monkey_patch_qemu_blockdev_options
{
my ($cfg, $volid, $machine_version, $options) = @_;
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
my $scfg = PVE::Storage::storage_config($cfg, $storeid);
my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
my ($vtype) = $plugin->parse_volname($volname);
die "cannot use volume of type '$vtype' as a QEMU blockdevice\n"
if $vtype ne 'images' && $vtype ne 'iso' && $vtype ne 'import';
return $plugin->qemu_blockdev_options($scfg, $storeid, $volname, $machine_version, $options);
}
sub qemu_blockdev_options
{
my ($class, $scfg, $storeid, $volname, $machine_version, $options) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
$name .= '@'.$options->{'snapshot-name'} if $options->{'snapshot-name'};
if ($scfg->{vitastor_nbd})
{
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
die "Image not mapped via NBD" if !$kerneldev;
return { driver => 'host_device', filename => $kerneldev };
}
my $blockdev = {
driver => 'vitastor',
image => $prefix.$name,
};
if ($scfg->{vitastor_config_path})
{
$blockdev->{'config-path'} = $scfg->{vitastor_config_path};
}
if ($scfg->{vitastor_etcd_address})
{
# FIXME This is the only exception: etcd_address -> etcd_host for qemu
$blockdev->{'etcd-host'} = $scfg->{vitastor_etcd_address};
}
if ($scfg->{vitastor_etcd_prefix})
{
$blockdev->{'etcd-prefix'} = $scfg->{vitastor_etcd_prefix};
}
return $blockdev;
}
*PVE::Storage::qemu_blockdev_options = *_monkey_patch_qemu_blockdev_options;
1;

View File

@@ -19,6 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
endif()
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
set(ENABLE_COVERAGE false CACHE BOOL "Enable code coverage")
add_definitions(-DVITASTOR_VERSION="2.3.0")
add_definitions(-D_GNU_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -fvisibility=hidden -I ${CMAKE_SOURCE_DIR}/src)
@@ -31,6 +32,11 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvisibility-inlines-hid
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -fvisibility-inlines-hidden")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvisibility-inlines-hidden")
if (${ENABLE_COVERAGE})
add_definitions(-coverage)
add_link_options(-coverage)
endif()
set(CMAKE_BUILD_TYPE RelWithDebInfo)
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
@@ -78,14 +84,6 @@ else()
set(LIBURING_LIBRARIES uring)
endif (${WITH_SYSTEM_LIBURING})
add_custom_target(build_tests)
add_custom_target(test
COMMAND
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
)
add_dependencies(test build_tests)
include_directories(
../
${CMAKE_SOURCE_DIR}/src/blockstore
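The new option is a plain CMake cache bool, so a coverage build can presumably be driven the same way as the comments in the top-level CMakeLists suggest:

cmake -B build -S . -DENABLE_COVERAGE=true
make -j16 -C build build_tests
ctest -V -T test -T coverage --test-dir build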

View File

@@ -2,15 +2,17 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
# libvitastor_blk.so
add_library(vitastor_blk SHARED
../util/allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp ../util/crc32c.c ../util/ringloop.cpp
# libvitastor_blk.a
add_library(vitastor_blk STATIC
../util/allocator.cpp ../util/crc32c.c ../util/ringloop.cpp
multilist.cpp blockstore_heap.cpp blockstore_disk.cpp
blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp
blockstore_flush.cpp blockstore_read.cpp blockstore_stable.cpp blockstore_sync.cpp blockstore_write.cpp
)
target_compile_options(vitastor_blk PUBLIC -fPIC)
target_link_libraries(vitastor_blk
${LIBURING_LIBRARIES}
${ISAL_LIBRARIES}
tcmalloc_minimal
# for timerfd_manager
vitastor_common
)

View File

@@ -3,7 +3,7 @@
#include "blockstore_impl.h"
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd)
{
impl = new blockstore_impl_t(config, ringloop, tfd);
}
@@ -48,9 +48,9 @@ int blockstore_t::read_bitmap(object_id oid, uint64_t target_version, void *bitm
return impl->read_bitmap(oid, target_version, bitmap, result_version);
}
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
const std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
{
return impl->inode_space_stats;
return impl->get_inode_space_stats();
}
void blockstore_t::dump_diagnostics()
@@ -82,8 +82,3 @@ uint32_t blockstore_t::get_bitmap_granularity()
{
return impl->get_bitmap_granularity();
}
void blockstore_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
{
impl->set_no_inode_stats(pool_ids);
}

View File

@@ -22,17 +22,20 @@
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
// Default block size is 128 KB, current allowed range is 4K - 128M
#define DEFAULT_DATA_BLOCK_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BITMAP_GRANULARITY 4096
#define MIN_JOURNAL_SIZE 1024*1024
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
#define BLOCKSTORE_META_FORMAT_HEAP 3
#define BS_OP_MIN 1
#define BS_OP_READ 1
#define BS_OP_WRITE 2
@@ -48,6 +51,12 @@
/*
All operations may be submitted in any order, because reads only see completed writes,
syncs only sync completed writes and writes don't depend on each other.
The only restriction is that the external code MUST NOT submit multiple writes for one
object in parallel. This is a natural restriction because `version` numbers are used though.
Blockstore opcode documentation:
## BS_OP_READ / BS_OP_WRITE / BS_OP_WRITE_STABLE
@@ -162,8 +171,8 @@ struct __attribute__ ((visibility("default"))) blockstore_op_t
uint32_t list_stable_limit;
};
};
void *buf = NULL;
void *bitmap = NULL;
uint8_t *buf = NULL;
uint8_t *bitmap = NULL;
int retval = 0;
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
@@ -177,7 +186,7 @@ class __attribute__((visibility("default"))) blockstore_t
{
blockstore_impl_t *impl;
public:
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd);
~blockstore_t();
// Update configuration
@@ -205,10 +214,7 @@ public:
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
// Get per-inode space usage statistics
std::map<uint64_t, uint64_t> & get_inode_space_stats();
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
// Print diagnostics to stdout
void dump_diagnostics();

View File

@@ -2,11 +2,14 @@
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdexcept>
#include "blockstore_impl.h"
#include "blockstore.h"
#include "blockstore_disk.h"
#include "blockstore_heap.h"
#include "str_util.h"
#include "allocator.h"
@@ -44,8 +47,11 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
disk_alignment = parse_size(config["disk_alignment"]);
journal_block_size = parse_size(config["journal_block_size"]);
meta_block_size = parse_size(config["meta_block_size"]);
meta_block_target_free_space = parse_size(config["meta_block_target_free_space"]);
bitmap_granularity = parse_size(config["bitmap_granularity"]);
meta_format = stoull_full(config["meta_format"]);
atomic_write_size = (config.find("atomic_write_size") != config.end()
? parse_size(config["atomic_write_size"]) : 4096);
if (config.find("data_io") == config.end() &&
config.find("meta_io") == config.end() &&
config.find("journal_io") == config.end())
@@ -90,12 +96,28 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
if (!min_discard_size)
min_discard_size = 1024*1024;
discard_granularity = parse_size(config["discard_granularity"]);
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
inmemory_journal = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
disable_data_fsync = config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes";
disable_meta_fsync = config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes";
disable_journal_fsync = config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes";
if (mock_mode)
{
data_device_size = parse_size(config["data_device_size"]);
data_device_sect = parse_size(config["data_device_sect"]);
meta_device_size = parse_size(config["meta_device_size"]);
meta_device_sect = parse_size(config["meta_device_sect"]);
journal_device_size = parse_size(config["journal_device_size"]);
journal_device_sect = parse_size(config["journal_device_sect"]);
}
// Validate
if (!data_block_size)
{
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
}
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
if (is_power_of_two(data_block_size) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
@@ -131,6 +153,14 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
}
if (!meta_block_target_free_space)
{
meta_block_target_free_space = 800;
}
if (meta_block_target_free_space >= meta_block_size)
{
throw std::runtime_error("meta_block_target_free_space must not exceed "+std::to_string(meta_block_size));
}
if (data_offset % disk_alignment)
{
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
@@ -179,17 +209,29 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
if (!meta_format)
{
meta_format = BLOCKSTORE_META_FORMAT_HEAP;
}
if (meta_device == data_device)
{
disable_meta_fsync = disable_data_fsync;
}
if (journal_device == meta_device)
{
disable_journal_fsync = disable_meta_fsync;
}
}
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
void blockstore_disk_t::calc_lengths()
{
// data
data_len = data_device_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset)
if (data_device == meta_device && data_offset < meta_offset)
{
data_len = meta_offset - data_offset;
}
if (data_fd == journal_fd && data_offset < journal_offset)
if (data_device == journal_device && data_offset < journal_offset)
{
data_len = data_len < journal_offset-data_offset
? data_len : journal_offset-data_offset;
@@ -204,23 +246,23 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
data_len = cfg_data_size;
}
// meta
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset)
meta_area_size = (meta_device == data_device ? data_device_size : meta_device_size) - meta_offset;
if (meta_device == data_device && meta_offset <= data_offset)
{
meta_area_size = data_offset - meta_offset;
}
if (meta_fd == journal_fd && meta_offset <= journal_offset)
if (meta_device == journal_device && meta_offset <= journal_offset)
{
meta_area_size = meta_area_size < journal_offset-meta_offset
? meta_area_size : journal_offset-meta_offset;
}
// journal
journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_fd == data_fd && journal_offset <= data_offset)
journal_len = (journal_device == data_device ? data_device_size : (journal_device == meta_device ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_device == data_device && journal_offset <= data_offset)
{
journal_len = data_offset - journal_offset;
}
if (journal_fd == meta_fd && journal_offset <= meta_offset)
if (journal_device == meta_device && journal_offset <= meta_offset)
{
journal_len = journal_len < meta_offset-journal_offset
? journal_len : meta_offset-journal_offset;
@@ -230,37 +272,37 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
bool new_doesnt_fit = (!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type);
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || new_doesnt_fit)
if (meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
/ (meta_block_size / clean_entry_v0_size)) * meta_block_size;
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
{
// Old metadata fits.
if (new_doesnt_fit)
{
printf("Warning: Using old metadata format without checksums because the new format"
" doesn't fit into provided area (%ju bytes required, %ju bytes available)\n", meta_len, meta_area_size);
}
clean_entry_size = clean_entry_v0_size;
meta_len = meta_v0_len;
meta_format = BLOCKSTORE_META_FORMAT_V1;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
uint32_t entries_per_block = ((meta_block_size-meta_block_target_free_space) /
(sizeof(heap_object_t) + sizeof(heap_write_t) + clean_dyn_size));
min_meta_len = (block_count+entries_per_block-1) / entries_per_block * meta_block_size;
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V1)
{
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size;
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size)
/ (meta_block_size / clean_entry_size)) * meta_block_size;
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V2)
{
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + clean_dyn_size + 4 /*entry_csum*/;
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
if (!skip_meta_check && meta_area_size < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
throw std::runtime_error("meta_format = "+std::to_string(meta_format)+" is not supported");
}
}
void blockstore_disk_t::check_lengths()
{
if (meta_area_size < min_meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(min_meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
}
// requested journal size
if (!skip_meta_check && cfg_journal_size > journal_len)
if (cfg_journal_size > journal_len)
{
throw std::runtime_error("Requested journal_size is too large");
}
@@ -321,12 +363,19 @@ static int bs_openmode(const std::string & mode)
void blockstore_disk_t::open_data()
{
data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
if (data_fd >= 0)
{
throw std::runtime_error("data device is already opened");
}
data_fd = mock_mode ? MOCK_DATA_FD : open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
}
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
if (!mock_mode)
{
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
}
if (disk_alignment % data_device_sect)
{
throw std::runtime_error(
@@ -338,7 +387,7 @@ void blockstore_disk_t::open_data()
{
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
}
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
if (!mock_mode && !disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
@@ -346,19 +395,26 @@ void blockstore_disk_t::open_data()
void blockstore_disk_t::open_meta()
{
if (meta_fd >= 0)
{
throw std::runtime_error("metadata device is already opened");
}
if (meta_device != data_device || meta_io != data_io)
{
meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
meta_fd = mock_mode ? MOCK_META_FD : open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
}
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
if (!mock_mode)
{
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
}
if (meta_offset >= meta_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
}
if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
if (!mock_mode && !disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
@@ -384,15 +440,26 @@ void blockstore_disk_t::open_meta()
void blockstore_disk_t::open_journal()
{
if (journal_fd >= 0)
{
throw std::runtime_error("journal device is already opened");
}
if (journal_device != meta_device || journal_io != meta_io)
{
journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
journal_fd = mock_mode ? MOCK_JOURNAL_FD : open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
if (journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
}
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
if (!mock_mode)
{
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
}
if (journal_offset >= journal_device_size)
{
throw std::runtime_error("journal_offset exceeds device size = "+std::to_string(journal_device_size));
}
if (!mock_mode && !disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}
@@ -418,25 +485,32 @@ void blockstore_disk_t::open_journal()
void blockstore_disk_t::close_all()
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
if (!mock_mode)
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
}
data_fd = meta_fd = journal_fd = -1;
}
// Sadly DISCARD only works through ioctl(), but it seems to always block the device queue,
// so it's not a big deal that we can only run it synchronously.
int blockstore_disk_t::trim_data(allocator_t *alloc)
int blockstore_disk_t::trim_data(std::function<bool(uint64_t)> is_free)
{
if (mock_mode)
{
return -EINVAL;
}
int r = 0;
uint64_t j = 0, i = 0;
uint64_t discarded = 0;
for (; i <= block_count; i++)
{
if (i >= block_count || alloc->get(i))
if (i >= block_count || is_free(i))
{
if (i > j && (i-j)*data_block_size >= min_discard_size)
{

View File

@@ -12,6 +12,10 @@
// Lower byte of checksum type is its length
#define BLOCKSTORE_CSUM_CRC32C 0x104
#define MOCK_DATA_FD 1000
#define MOCK_META_FD 1001
#define MOCK_JOURNAL_FD 1002
class allocator_t;
struct blockstore_disk_t
@@ -22,11 +26,15 @@ struct blockstore_disk_t
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
// Journal block size - minimum_io_size of the journal device is the best choice
uint64_t journal_block_size = 4096;
uint32_t journal_block_size = 4096;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint64_t meta_block_size = 4096;
uint32_t meta_block_size = 4096;
// Atomic write size of the data block device
uint32_t atomic_write_size = 4096;
// Target free space in metadata blocks
uint32_t meta_block_target_free_space = 800;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
uint32_t bitmap_granularity = 4096;
// Data checksum type, BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
// Checksum block size, must be a multiple of bitmap_granularity
@@ -36,27 +44,36 @@ struct blockstore_disk_t
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
std::string data_io, meta_io, journal_io;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Keep journal (buffered data) in memory?
bool inmemory_meta = true;
// Keep metadata in memory?
bool inmemory_journal = true;
// Data discard granularity and minimum size (for the sake of performance)
bool discard_on_start = false;
uint64_t min_discard_size = 1024*1024;
uint64_t discard_granularity = 0;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_area_size, min_meta_len, meta_format = 0;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
uint32_t block_order = 0;
uint64_t block_count = 0;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
uint32_t clean_entry_bitmap_size = 0;
uint32_t clean_entry_size = 0, clean_dyn_size = 0; // for meta_v1/2
bool mock_mode = false;
void parse_config(std::map<std::string, std::string> & config);
void open_data();
void open_meta();
void open_journal();
void calc_lengths(bool skip_meta_check = false);
void calc_lengths();
void check_lengths();
void close_all();
int trim_data(allocator_t *alloc);
int trim_data(std::function<bool(uint64_t)> is_free);
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
{

File diff suppressed because it is too large

View File

@@ -1,22 +1,20 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define COPY_BUF_JOURNAL 1
#define COPY_BUF_DATA 2
#define COPY_BUF_ZERO 4
#define COPY_BUF_CSUM_FILL 8
#define COPY_BUF_COALESCED 16
#define COPY_BUF_META_BLOCK 32
#define COPY_BUF_JOURNALED_BIG 64
#define COPY_BUF_JOURNAL 0x01
#define COPY_BUF_DATA 0x02
#define COPY_BUF_ZERO 0x04
#define COPY_BUF_CSUM_FILL 0x08
#define COPY_BUF_COALESCED 0x10
#define COPY_BUF_PADDED 0x20
#define COPY_BUF_SKIP_CSUM 0x40
struct copy_buffer_t
{
int copy_flags;
uint64_t offset, len, disk_offset;
uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
void *buf;
uint8_t *csum_buf;
int *dyn_data;
uint32_t copy_flags;
uint64_t offset, len, disk_loc, disk_offset, disk_len;
uint8_t *buf;
uint64_t wr_lsn;
};
struct meta_sector_t
@@ -27,13 +25,6 @@ struct meta_sector_t
int usage_count;
};
struct flusher_sync_t
{
bool fsync_meta;
int ready_count;
int state;
};
struct flusher_meta_write_t
{
uint64_t sector, pos;
@@ -49,94 +40,75 @@ class journal_flusher_co
{
blockstore_impl_t *bs;
journal_flusher_t *flusher;
int wait_state, wait_count, wait_journal_count;
int co_id;
int wait_state, wait_count;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
std::list<flusher_sync_t>::iterator cur_sync;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
obj_ver_id cur;
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
object_id cur_oid;
uint64_t copy_id;
uint64_t compact_lsn;
uint64_t cur_version;
heap_object_t *cur_obj;
heap_write_t *begin_wr, *end_wr;
uint32_t modified_block;
bool should_repeat;
bool try_trim = false;
bool skip_copy, has_delete, has_writes;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
int i;
bool fill_incomplete, cleared_incomplete;
int read_to_fill_incomplete;
std::vector<copy_buffer_t> read_vec;
uint32_t overwrite_start, overwrite_end;
uint32_t big_start, big_end;
int i, res;
bool read_to_fill_incomplete;
int copy_count;
uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
uint64_t clean_loc;
flusher_meta_write_t meta_old, meta_new;
bool clean_init_bitmap;
uint64_t clean_bitmap_offset, clean_bitmap_len;
uint8_t *clean_init_dyn_ptr;
uint8_t *new_clean_bitmap;
uint64_t new_trim_pos;
bool do_repeat = false;
friend class journal_flusher_t;
void scan_dirty();
bool read_dirty(int wait_base);
bool modify_meta_do_reads(int wait_base);
bool wait_meta_reads(int wait_base);
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
bool clear_incomplete_csum_block_bits(int wait_base);
void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
void update_metadata_entry();
bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
void update_clean_db();
void free_data_blocks();
bool fsync_batch(bool fsync_meta, int wait_base);
bool trim_journal(int wait_base);
void iterate_checksum_holes(std::function<void(int & pos, uint32_t hole_start, uint32_t hole_end)> cb);
void fill_partial_checksum_blocks();
void free_buffers();
int check_and_punch_checksums();
bool calc_block_checksums();
bool write_meta_block(int wait_base);
bool read_buffered(int wait_base);
bool fsync_meta(int wait_base);
int fsync_buffer(int wait_base);
bool trim_lsn(int wait_base);
public:
journal_flusher_co();
~journal_flusher_co();
bool loop();
};
// Journal flusher itself
class journal_flusher_t
{
int trim_wanted = 0;
bool dequeuing;
int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
int flusher_start_threshold;
int force_start = 0;
int min_flusher_count = 0, max_flusher_count = 0, cur_flusher_count = 0, target_flusher_count = 0;
journal_flusher_co *co;
blockstore_impl_t *bs;
friend class journal_flusher_co;
int journal_trim_counter;
bool trimming;
void* journal_superblock;
int advance_lsn_counter = 0;
uint64_t compact_counter = 0;
int active_flushers;
int syncing_flushers;
std::list<flusher_sync_t> syncs;
std::map<object_id, uint64_t> sync_to_repeat;
std::map<uint64_t, meta_sector_t> meta_sectors;
std::deque<object_id> flush_queue;
std::unordered_map<object_id, uint64_t> flush_versions;
std::unordered_set<uint64_t> inflight_meta_sectors;
bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
int active_flushers = 0;
int wanting_meta_fsync = 0;
bool fsyncing_meta = false;
int syncing_buffer = 0;
public:
journal_flusher_t(blockstore_impl_t *bs);
~journal_flusher_t();
void loop();
bool is_trim_wanted() { return trim_wanted; }
int get_syncing_buffer();
uint64_t get_compact_counter();
bool is_active();
void mark_trim_possible();
void request_trim();
void release_trim();
void enqueue_flush(obj_ver_id oid);
void unshift_flush(obj_ver_id oid, bool force);
void remove_flush(object_id oid);
void dump_diagnostics();
bool is_mutated(uint64_t clean_loc);
};
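The compact counter exposed via get_compact_counter() is what operations parked with WAIT_COMPACTION poll, as seen in check_wait() further below. A minimal sketch of that pattern, with illustrative variable names and no error handling:

// Illustrative only: an op that ran out of space remembers the counter value
// at the moment it was parked and is retried once more compaction has happened.
uint64_t parked_at = flusher->get_compact_counter();
// ... later, when re-checking the submit queue:
if (flusher->get_compact_counter() > parked_at)
{
    // enough entries were compacted since the op was parked - retry it
}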

File diff suppressed because it is too large


@@ -0,0 +1,376 @@
// Metadata storage version 3 ("heap")
// Copyright (c) Vitaliy Filippov, 2025+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include <map>
#include <unordered_map>
#include <set>
#include <deque>
#include <vector>
#include "../client/object_id.h"
#include "../util/robin_hood.h"
#include "blockstore_disk.h"
#include "multilist.h"
struct pool_shard_settings_t
{
uint32_t pg_count;
uint32_t pg_stripe_size;
};
#define BS_HEAP_TYPE 7
#define BS_HEAP_OBJECT 1
#define BS_HEAP_SMALL_WRITE 2
#define BS_HEAP_BIG_WRITE 3
#define BS_HEAP_TOMBSTONE 4
#define BS_HEAP_INTENT_WRITE 5
#define BS_HEAP_STABLE 8
class blockstore_heap_t;
struct __attribute__((__packed__)) heap_small_write_t
{
uint16_t size;
int16_t next_pos;
uint8_t flags;
uint64_t lsn;
uint64_t version;
uint64_t location;
uint32_t offset;
uint32_t len;
};
struct __attribute__((__packed__)) heap_big_write_t
{
uint16_t size;
int16_t next_pos;
uint8_t flags;
uint64_t lsn;
uint64_t version;
uint32_t block_num;
};
struct __attribute__((__packed__)) heap_tombstone_t
{
uint16_t size;
int16_t next_pos;
uint8_t flags;
uint64_t lsn;
uint64_t version;
};
struct __attribute__((__packed__)) heap_write_t
{
// size should have top bit cleared
uint16_t size = 0;
int16_t next_pos = 0;
uint8_t entry_type = 0; // BS_HEAP_*
uint64_t lsn = 0;
uint64_t version = 0;
// uint8_t[] external_bitmap
// uint8_t[] internal_bitmap
// uint32_t[] checksums
heap_write_t *next();
inline uint8_t type() const { return (entry_type & BS_HEAP_TYPE); }
inline heap_small_write_t& small() { return *(heap_small_write_t*)this; }
inline heap_big_write_t& big() { return *(heap_big_write_t*)this; }
uint32_t get_size(blockstore_heap_t *heap);
uint32_t get_csum_size(blockstore_heap_t *heap);
bool needs_recheck(blockstore_heap_t *heap);
bool needs_compact(blockstore_heap_t *heap);
bool is_compacted(uint64_t compacted_lsn);
bool can_be_collapsed(blockstore_heap_t *heap);
bool is_allowed_before_compacted(uint64_t compacted_lsn, bool is_last_entry);
uint8_t *get_ext_bitmap(blockstore_heap_t *heap);
uint8_t *get_int_bitmap(blockstore_heap_t *heap);
uint8_t *get_checksums(blockstore_heap_t *heap);
uint32_t *get_checksum(blockstore_heap_t *heap);
uint64_t big_location(blockstore_heap_t *heap);
void set_big_location(blockstore_heap_t *heap, uint64_t location);
};
struct __attribute__((__packed__)) heap_object_t
{
// size should have top bit cleared
uint16_t size = 0;
// linked list of write entries...
// newest entries are stored first to simplify scanning
int16_t write_pos = 0;
uint8_t entry_type = 0; // BS_HEAP_*
uint32_t crc32c = 0;
uint64_t inode = 0;
uint64_t stripe = 0;
heap_write_t *get_writes();
uint32_t calc_crc32c();
};
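Write entries hang off heap_object_t as a chain, newest first, so reading an object's history is a forward walk. A minimal sketch, assuming next() returns NULL at the end of the chain (the loop body is illustrative):

// Illustrative only: scan an object's write chain, newest entry first.
for (heap_write_t *wr = obj->get_writes(); wr != NULL; wr = wr->next())
{
    if (wr->type() == BS_HEAP_BIG_WRITE)
    {
        // this entry points at a full data block on the data device
        uint64_t loc = wr->big_location(heap);
        (void)loc;
    }
}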
struct heap_object_lsn_t
{
object_id oid;
uint64_t lsn;
};
inline bool operator < (const heap_object_lsn_t & a, const heap_object_lsn_t & b)
{
return a.oid < b.oid || (a.oid == b.oid && a.lsn < b.lsn);
}
struct tmp_compact_item_t
{
object_id oid;
uint64_t lsn;
bool compact;
};
struct heap_mvcc_copy_id_t
{
object_id oid;
uint64_t copy_id;
};
inline bool operator == (const heap_mvcc_copy_id_t & a, const heap_mvcc_copy_id_t & b)
{
return a.oid.inode == b.oid.inode && a.oid.stripe == b.oid.stripe && a.copy_id == b.copy_id;
}
namespace std
{
template<> struct hash<heap_mvcc_copy_id_t>
{
inline size_t operator()(const heap_mvcc_copy_id_t &s) const
{
size_t seed = std::hash<object_id>()(s.oid);
// Copy-pasted from spp::hash_combine()
seed ^= (s.copy_id + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
return seed;
}
};
};
struct heap_object_mvcc_t
{
uint32_t readers = 0;
heap_object_t *entry_copy = NULL;
};
struct __attribute__((__packed__)) heap_block_info_t
{
uint32_t used_space = 0;
uint32_t free_pos = 0;
uint8_t *data = NULL;
};
struct heap_inflight_lsn_t
{
object_id oid;
uint64_t flags;
};
struct heap_refqi_t
{
uint64_t lsn;
uint64_t inode;
uint64_t location;
uint32_t len;
bool is_data;
};
using i64hash_t = robin_hood::hash<uint64_t>;
using heap_block_index_t = robin_hood::unordered_flat_map<uint64_t,
robin_hood::unordered_flat_map<inode_t, robin_hood::unordered_flat_map<uint64_t, uint64_t, i64hash_t, std::equal_to<uint64_t>, 88>, i64hash_t>, i64hash_t>;
using heap_mvcc_map_t = robin_hood::unordered_flat_map<heap_mvcc_copy_id_t, heap_object_mvcc_t>;
class blockstore_heap_t
{
friend class heap_write_t;
friend class heap_object_t;
blockstore_disk_t *dsk = NULL;
uint8_t* buffer_area = NULL;
bool abort_on_corruption = false;
bool abort_on_overlap = true;
int log_level = 0;
const uint32_t meta_block_count = 0;
uint32_t target_block_free_space = 800;
uint64_t next_lsn = 0;
robin_hood::unordered_flat_map<pool_id_t, pool_shard_settings_t> pool_shard_settings;
// PG => inode => stripe => block number
heap_block_index_t block_index;
std::vector<heap_block_info_t> block_info;
allocator_t *data_alloc = NULL;
multilist_index_t *meta_alloc = NULL;
uint32_t meta_alloc_count = 0;
uint64_t meta_used_space = 0;
multilist_alloc_t *buffer_alloc = NULL;
heap_mvcc_map_t object_mvcc;
std::unordered_map<uint64_t, uint32_t> mvcc_data_refs;
std::unordered_map<uint64_t, uint32_t> mvcc_buffer_refs;
std::map<uint64_t, uint64_t> inode_space_stats;
uint64_t buffer_area_used_space = 0;
uint64_t data_used_space = 0;
// LSN queue: inflight (writing) -> completed [-> fsynced] -> compactable -> compacted [-> fsynced] -> trimmed and removed
std::deque<heap_inflight_lsn_t> inflight_lsn;
uint32_t to_compact_count = 0;
uint64_t first_inflight_lsn = 0;
uint64_t completed_lsn = 0;
uint64_t fsynced_lsn = 0;
uint64_t compacted_lsn = 0;
uint64_t next_compact_lsn = 0;
std::deque<heap_refqi_t> overwrite_ref_queue;
std::vector<tmp_compact_item_t> tmp_compact_queue;
std::deque<object_id> recheck_queue;
int recheck_in_progress = 0;
bool in_recheck = false;
std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> recheck_cb;
int recheck_queue_depth = 0;
const uint32_t max_write_entry_size;
uint64_t get_pg_id(inode_t inode, uint64_t stripe);
void defragment_block(uint32_t block_num);
uint32_t find_block_run(heap_block_info_t & block, uint32_t space);
uint32_t find_block_space(uint32_t block_num, uint32_t space);
uint32_t block_has_compactable(uint8_t *data);
uint32_t compact_object_to(heap_object_t *obj, uint64_t lsn, uint8_t *new_csums, bool do_free);
void copy_full_object(uint8_t *dst, heap_object_t *obj);
bool mvcc_save_copy(heap_object_t *obj);
bool mvcc_check_tracking(object_id oid);
void free_mvcc(heap_mvcc_map_t::iterator mvcc_it);
void allocate_block(heap_block_info_t & inf);
int allocate_new_object(object_id oid, uint32_t full_object_size, uint32_t *modified_block, heap_object_t **new_obj);
int add_object(object_id oid, heap_write_t *wr, uint32_t *modified_block);
void mark_overwritten(uint64_t over_lsn, uint64_t inode, heap_write_t *wr, heap_write_t *end_wr, bool tracking_active);
int update_object(uint32_t block_num, heap_object_t *obj, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
void init_erase(uint32_t block_num, heap_object_t *obj);
void erase_object(uint32_t block_num, heap_object_t *obj, uint64_t lsn, bool tracking_active);
void reindex_block(uint32_t block_num, heap_object_t *from_obj);
void erase_block_index(inode_t inode, uint64_t stripe);
void deref_data(uint64_t inode, uint64_t location, bool free_at_0);
void deref_buffer(uint64_t inode, uint64_t location, uint32_t len, bool free_at_0);
void deref_overwrites(uint64_t lsn);
void free_object_space(inode_t inode, heap_write_t *from, heap_write_t *to, int mode = 0);
void add_used_space(uint32_t block_num, int32_t used_delta);
void push_inflight_lsn(object_id oid, uint64_t lsn, uint64_t flags);
public:
blockstore_heap_t(blockstore_disk_t *dsk, uint8_t *buffer_area, int log_level = 0);
~blockstore_heap_t();
// set initially compacted lsn - should be done before loading
void set_compacted_lsn(uint64_t compacted_lsn);
uint64_t get_compacted_lsn();
// load data from the disk, returns count of loaded write entries
void read_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf,
std::function<void(heap_object_t*)> handle_object, std::function<void(uint32_t, uint32_t, uint8_t*)> handle_block);
uint64_t load_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf);
// finish loading
void finish_load();
// recheck small write data after reading the database from disk
bool recheck_small_writes(std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> read_buffer, int queue_depth);
// initialize metadata area (fill it with empty data)
// returns 0 when done, EAGAIN when the caller has to wait more
int initialize();
// read from the metadata area
// returns 0 when done, EAGAIN when the caller has to wait more
int read();
// reshard database according to the pool's PG count
void reshard(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size);
// read an object entry and lock it against removal
// in the future, may become asynchronous
heap_object_t *lock_and_read_entry(object_id oid, uint64_t & copy_id);
// re-read a locked object entry with the given lsn (pointer may be invalidated)
heap_object_t *read_locked_entry(object_id oid, uint64_t copy_id);
// read an object entry without locking it
heap_object_t *read_entry(object_id oid, uint32_t *block_num_ptr, bool for_update = false);
// unlock an entry
bool unlock_entry(object_id oid, uint64_t copy_id);
// set or verify checksums in a write request
bool calc_checksums(heap_write_t *wr, uint8_t *data, bool set, uint32_t offset = 0, uint32_t len = 0);
// set or verify raw block checksums
bool calc_block_checksums(uint32_t *block_csums, uint8_t *data, uint8_t *bitmap, uint32_t start, uint32_t end,
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool calc_block_checksums(uint32_t *block_csums, uint8_t *bitmap,
uint32_t start, uint32_t end, std::function<uint8_t*(uint32_t start, uint32_t & len)> next,
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
// copy an object as is
int copy_object(heap_object_t *obj, uint32_t *modified_block);
// auto-compacts the object, then adds a write entry to it and to the compaction queue
// return 0 if OK, or an error code such as ENOSPC
int post_write(object_id oid, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
int post_write(uint32_t & block_num, object_id oid, heap_object_t *obj, heap_write_t *wr, uint32_t *moved_from_block);
// stabilize an unstable object version
// return 0 if OK, ENOENT if not exists
int post_stabilize(object_id oid, uint64_t version, uint32_t *modified_block, uint64_t *new_lsn, uint64_t *new_to_lsn);
// rollback an unstable object version
// return 0 if OK, ENOENT if not exists, EBUSY if already stable
int post_rollback(object_id oid, uint64_t version, uint64_t *new_lsn, uint32_t *modified_block);
// forget an object
// return error code
int post_delete(object_id oid, uint64_t *new_lsn, uint32_t *modified_block);
int post_delete(uint32_t block_num, heap_object_t *obj, uint64_t *new_lsn);
// get the next object to compact
// guaranteed to return objects in min lsn order
// returns 0 if OK, ENOENT if nothing to compact
int get_next_compact(object_id & oid);
// get the range of an object eligible for compaction
void get_compact_range(heap_object_t *obj, uint64_t max_lsn, heap_write_t **begin_wr, heap_write_t **end_wr);
// mark an object as compacted up to the given lsn
int compact_object(object_id oid, uint64_t lsn, uint8_t *new_csums);
// retrieve object listing from a PG
int list_objects(uint32_t pg_num, object_id min_oid, object_id max_oid,
obj_ver_id **result_list, size_t *stable_count, size_t *unstable_count);
// sets a block number for a new object and returns error status: 0, EAGAIN or ENOSPC
int get_block_for_new_object(uint32_t & out_block_num, uint32_t size = 0);
// inflight write tracking
void mark_lsn_completed(uint64_t lsn);
void mark_lsn_fsynced(uint64_t lsn);
void mark_lsn_compacted(uint64_t lsn, bool allow_undone = false);
void mark_object_compacted(heap_object_t *obj, uint64_t max_lsn);
void mark_lsn_trimmed(uint64_t lsn);
uint64_t get_completed_lsn();
uint64_t get_fsynced_lsn();
// data device block allocator functions
uint64_t find_free_data();
bool is_data_used(uint64_t location);
void use_data(inode_t inode, uint64_t location);
void free_data(inode_t inode, uint64_t location);
// buffer device allocator functions
uint64_t find_free_buffer_area(uint64_t size);
bool is_buffer_area_free(uint64_t location, uint64_t size);
void use_buffer_area(inode_t inode, uint64_t location, uint64_t size);
void free_buffer_area(inode_t inode, uint64_t location, uint64_t size);
uint64_t get_buffer_area_used_space();
// get metadata block data buffer and used space
uint8_t *get_meta_block(uint32_t block_num);
uint32_t get_meta_block_used_space(uint32_t block_num);
// get space usage statistics
uint64_t get_data_used_space();
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
uint64_t get_meta_total_space();
uint64_t get_meta_used_space();
uint32_t get_meta_nearfull_blocks();
uint32_t get_inflight_queue_size();
uint32_t get_compact_queue_size();
uint32_t get_to_compact_count();
// get maximum size for a temporary heap_write_t buffer
uint32_t get_max_write_entry_size();
// only for tests
void set_abort_on_corruption(bool fail);
void set_abort_on_overlap(bool fail);
};
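A minimal sketch of one write entry moving through the LSN lifecycle noted in the class above (inflight -> completed -> fsynced -> compacted -> trimmed); actual I/O submission is omitted and the surrounding variables are illustrative:

uint32_t modified_block = 0, moved_from_block = 0;
int res = heap->post_write(oid, wr, &modified_block, &moved_from_block);
if (res == 0)
{
    uint64_t lsn = wr->lsn;            // assumed to be assigned by the heap on post
    // ... submit data / metadata block writes for modified_block here ...
    heap->mark_lsn_completed(lsn);     // the write has landed on disk
    heap->mark_lsn_fsynced(lsn);       // after the device fsync
    // ... later, once the flusher has compacted this object ...
    heap->mark_lsn_compacted(lsn);
    heap->mark_lsn_trimmed(lsn);       // the entry may leave the inflight queue
}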


@@ -1,13 +1,17 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
#include <stdexcept>
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
#include "blockstore_impl.h"
#include "crc32c.h"
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode)
{
assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
this->tfd = tfd;
this->ringloop = ringloop;
dsk.mock_mode = mock_mode;
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
@@ -17,31 +21,43 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
calc_lengths();
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
dsk.calc_lengths();
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
data_alloc = new allocator_t(dsk.block_count);
}
catch (std::exception & e)
{
dsk.close_all();
throw;
}
memset(zero_object, 0, dsk.data_block_size);
meta_superblock = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
memset(meta_superblock, 0, dsk.meta_block_size);
}
void blockstore_impl_t::init()
{
flusher = new journal_flusher_t(this);
if (dsk.inmemory_journal)
{
buffer_area = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.journal_len);
}
heap = new blockstore_heap_t(&dsk, buffer_area, log_level);
}
blockstore_impl_t::~blockstore_impl_t()
{
delete data_alloc;
delete flusher;
if (flusher)
delete flusher;
if (heap)
delete heap;
if (buffer_area)
free(buffer_area);
if (meta_superblock)
free(meta_superblock);
if (zero_object)
free(zero_object);
ringloop->unregister_consumer(&ring_consumer);
dsk.close_all();
if (metadata_buffer)
free(metadata_buffer);
if (clean_bitmaps)
free(clean_bitmaps);
}
bool blockstore_impl_t::is_started()
@@ -57,10 +73,9 @@ bool blockstore_impl_t::is_stalled()
// main event loop - produce requests
void blockstore_impl_t::loop()
{
// FIXME: initialized == 10 is ugly
if (initialized != 10)
{
// read metadata, then journal
// read metadata
if (initialized == 0)
{
metadata_init_reader = new blockstore_init_meta(this);
@@ -73,69 +88,41 @@ void blockstore_impl_t::loop()
{
delete metadata_init_reader;
metadata_init_reader = NULL;
journal_init_reader = new blockstore_init_journal(this);
initialized = 2;
}
}
if (initialized == 2)
{
int res = journal_init_reader->loop();
if (!res)
{
delete journal_init_reader;
journal_init_reader = NULL;
initialized = 3;
ringloop->wakeup();
}
}
if (initialized == 3)
{
if (!readonly && dsk.discard_on_start)
dsk.trim_data(data_alloc);
if (journal.flush_journal)
initialized = 4;
else
initialized = 10;
}
if (initialized == 4)
{
if (readonly)
{
printf("Can't flush the journal in readonly mode\n");
exit(1);
dsk.trim_data([this](uint64_t block_num){ return heap->is_data_used(block_num * dsk.data_block_size); });
}
flusher->loop();
ringloop->submit();
initialized = 10;
}
}
else
{
// try to submit ops
unsigned initial_ring_space = ringloop->space_left();
// has_writes == 0 - no writes before the current queue item
// has_writes == 1 - some writes in progress
// has_writes == 2 - tried to submit some writes, but failed
int has_writes = 0, op_idx = 0, new_idx = 0;
int op_idx = 0, new_idx = 0;
bool has_unfinished_writes = false;
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
{
auto op = submit_queue[op_idx];
submit_queue[new_idx] = op;
// FIXME: This needs some simplification
// Writes should not block reads if the ring is not full and reads don't depend on them
// In all other cases we should stop submission
if (PRIV(op)->wait_for)
{
check_wait(op);
if (PRIV(op)->wait_for == WAIT_SQE)
{
// ring is full, stop submission
break;
}
else if (PRIV(op)->wait_for)
{
if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
{
has_writes = 2;
}
has_unfinished_writes = has_unfinished_writes || op->opcode == BS_OP_WRITE ||
op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE ||
op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK;
continue;
}
}
@@ -148,46 +135,33 @@ void blockstore_impl_t::loop()
{
wr_st = dequeue_read(op);
}
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes already could not be submitted
continue;
}
wr_st = dequeue_write(op);
has_writes = wr_st > 0 ? 1 : 2;
}
else if (op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes already could not be submitted
continue;
}
wr_st = dequeue_del(op);
has_writes = wr_st > 0 ? 1 : 2;
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
}
else if (op->opcode == BS_OP_SYNC)
{
// sync only completed writes?
// wait for the data device fsync to complete, then submit journal writes for big writes
// then submit an fsync operation
// syncs only completed writes, so doesn't have to be blocked by anything
wr_st = continue_sync(op);
}
else if (op->opcode == BS_OP_STABLE)
else if (op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK)
{
wr_st = dequeue_stable(op);
}
else if (op->opcode == BS_OP_ROLLBACK)
{
wr_st = dequeue_rollback(op);
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
}
else if (op->opcode == BS_OP_LIST)
{
// LIST doesn't have to be blocked by previous modifications
process_list(op);
wr_st = 2;
// LIST has to be blocked by previous writes and commits/rollbacks
if (!has_unfinished_writes)
{
process_list(op);
wr_st = 2;
}
else
{
wr_st = 0;
}
}
if (wr_st == 2)
{
@@ -196,16 +170,13 @@ void blockstore_impl_t::loop()
}
if (wr_st == 0)
{
PRIV(op)->pending_ops = 0;
ringloop->restore(prev_sqe_pos);
if (PRIV(op)->wait_for == WAIT_SQE)
{
// ring is full, stop submission
break;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
{
PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
}
}
}
if (op_idx != new_idx)
@@ -225,12 +196,6 @@ void blockstore_impl_t::loop()
{
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
}
for (auto s: journal.submitting_sectors)
{
// Mark journal sector writes as submitted
journal.sector_info[s].submit_id = 0;
}
journal.submitting_sectors.clear();
if ((initial_ring_space - ringloop->space_left()) > 0)
{
live = true;
@@ -248,7 +213,7 @@ bool blockstore_impl_t::is_safe_to_stop()
{
return false;
}
if (unsynced_big_writes.size() > 0 || unsynced_small_writes.size() > 0)
if (unsynced_big_write_count > 0 || unsynced_small_write_count > 0)
{
if (!readonly && !stop_sync_submitted)
{
@@ -272,7 +237,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
{
if (PRIV(op)->wait_for == WAIT_SQE)
{
if (ringloop->sqes_left() < PRIV(op)->wait_detail)
if (ringloop->space_left() < PRIV(op)->wait_detail)
{
// stop submission if there's still no free space
#ifdef BLOCKSTORE_DEBUG
@@ -282,40 +247,13 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
else if (PRIV(op)->wait_for == WAIT_COMPACTION)
{
if (journal.used_start == PRIV(op)->wait_detail &&
(unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
if (flusher->get_compact_counter() <= PRIV(op)->wait_detail)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
#endif
return;
}
flusher->release_trim();
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
{
int next = ((journal.cur_sector + 1) % journal.sector_count);
if (journal.sector_info[next].flush_count > 0 ||
journal.sector_info[next].dirty)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_FREE)
{
if (!data_alloc->get_free_count() && big_to_flush > 0)
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
printf("Still waiting for more flushes\n");
#endif
return;
}
@@ -361,75 +299,11 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
{
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0;
}
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
{
while (search_start < search_end)
{
int pos = search_start+(search_end-search_start)/2;
if (oid < list[pos].oid)
{
search_end = pos;
}
else if (list[pos].oid < oid)
{
search_start = pos+1;
}
else
{
list[pos].version = version;
return true;
}
}
return false;
}
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
uint64_t pg_num = 0;
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it != clean_db_settings.end())
{
// like map_to_pg()
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
}
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
}
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
{
uint64_t pool_id = (uint64_t)pool;
std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
while (sh_it != clean_db_shards.end() &&
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
{
for (auto & pair: sh_it->second)
{
// like map_to_pg()
uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
new_shards[shard_id][pair.first] = pair.second;
}
clean_db_shards.erase(sh_it++);
}
for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
{
auto & to = clean_db_shards[sh_it->first];
to.swap(sh_it->second);
}
clean_db_settings[pool_id] = (pool_shard_settings_t){
.pg_count = pg_count,
.pg_stripe_size = pg_stripe_size,
};
}
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
uint32_t list_pg = op->pg_number+1;
@@ -438,7 +312,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
uint64_t min_inode = op->min_oid.inode;
uint64_t max_inode = op->max_oid.inode;
// Check PG
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
if (!pg_count || (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count) ||
!INODE_POOL(min_inode) || INODE_POOL(min_inode) != INODE_POOL(max_inode))
{
op->retval = -EINVAL;
FINISH_OP(op);
@@ -446,250 +321,40 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
}
// Check if the DB needs resharding
// (we don't know about PGs from the beginning, we only create "shards" here)
uint64_t first_shard = 0, last_shard = UINT64_MAX;
if (min_inode != 0 &&
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
heap->reshard(INODE_POOL(min_inode), pg_count, pg_stripe_size);
obj_ver_id *result = NULL;
size_t stable_count = 0, unstable_count = 0;
int res = heap->list_objects(list_pg, op->min_oid, op->max_oid, &result, &stable_count, &unstable_count);
if (op->list_stable_limit)
{
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
if (pg_count > 1)
// Ordered result is expected - used by scrub
// We use an unordered map
std::sort(result, result + stable_count);
if (stable_count > op->list_stable_limit)
{
// Per-pg listing
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it == clean_db_settings.end() ||
sh_it->second.pg_count != pg_count ||
sh_it->second.pg_stripe_size != pg_stripe_size)
{
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
}
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
}
else
{
// Per-pool listing
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
memmove(result + op->list_stable_limit, result + stable_count, unstable_count);
stable_count = op->list_stable_limit;
}
}
// Copy clean_db entries
int stable_count = 0, stable_alloc = 0;
if (min_inode != max_inode)
{
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
stable_alloc += clean_db.size();
}
}
if (op->list_stable_limit > 0)
{
stable_alloc = op->list_stable_limit;
if (stable_alloc > 1024*1024)
stable_alloc = 1024*1024;
}
if (stable_alloc < 32768)
{
stable_alloc = 32768;
}
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
auto max_oid = op->max_oid;
bool limited = false;
pool_pg_id_t last_shard_id = 0;
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
clean_it = clean_db.lower_bound(op->min_oid);
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
clean_end = clean_db.upper_bound(max_oid);
}
for (; clean_it != clean_end; clean_it++)
{
if (stable_count >= stable_alloc)
{
stable_alloc *= 2;
obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
stable[stable_count++] = {
.oid = clean_it->first,
.version = clean_it->second.version,
};
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
if (!limited)
{
limited = true;
max_oid = stable[stable_count-1].oid;
}
break;
}
}
if (op->list_stable_limit > 0)
{
// To maintain the order, we have to include objects in the same range from other shards
if (last_shard_id != 0 && last_shard_id != shard_it->first)
std::sort(stable, stable+stable_count);
if (stable_count > op->list_stable_limit)
stable_count = op->list_stable_limit;
}
last_shard_id = shard_it->first;
}
if (op->list_stable_limit == 0 && first_shard != last_shard)
{
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
std::sort(stable, stable+stable_count);
}
int clean_stable_count = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
obj_ver_id *unstable = NULL;
{
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
dirty_it = dirty_db.lower_bound({
.oid = op->min_oid,
.version = 0,
});
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
dirty_end = dirty_db.upper_bound({
.oid = max_oid,
.version = UINT64_MAX,
});
}
for (; dirty_it != dirty_end; dirty_it++)
{
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
{
if (IS_DELETE(dirty_it->second.state))
{
// Deletions are always stable, so try to zero out two possible entries
if (!replace_stable(dirty_it->first.oid, 0, 0, clean_stable_count, stable))
{
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
}
}
else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
{
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
{
// Then try to replace the last dirty stable version in the second part of the list
if (stable_count > 0 && stable[stable_count-1].oid == dirty_it->first.oid)
{
stable[stable_count-1].version = dirty_it->first.version;
}
else
{
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
stable[stable_count++] = dirty_it->first;
}
}
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
// Stop here
break;
}
}
else
{
if (unstable_count >= unstable_alloc)
{
unstable_alloc += 32768;
obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
if (!nst)
{
if (stable)
free(stable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
unstable = nst;
}
unstable[unstable_count++] = dirty_it->first;
}
}
}
}
// Remove zeroed out stable entries
int j = 0;
for (int i = 0; i < stable_count; i++)
{
if (stable[i].version != 0)
{
stable[j++] = stable[i];
}
}
stable_count = j;
if (stable_count+unstable_count > stable_alloc)
{
stable_alloc = stable_count+unstable_count;
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
// Copy unstable entries
for (int i = 0; i < unstable_count; i++)
{
stable[j++] = unstable[i];
}
free(unstable);
op->version = stable_count;
op->retval = stable_count+unstable_count;
op->buf = stable;
op->retval = res == 0 ? stable_count+unstable_count : -res;
op->buf = (uint8_t*)result;
FINISH_OP(op);
}
void blockstore_impl_t::dump_diagnostics()
{
journal.dump_diagnostics();
flusher->dump_diagnostics();
}
void blockstore_meta_header_v3_t::set_crc32c()
{
header_csum = 0;
uint32_t calc = crc32c(0, this, version == BLOCKSTORE_META_FORMAT_HEAP
? sizeof(blockstore_meta_header_v3_t) : sizeof(blockstore_meta_header_v2_t));
header_csum = calc;
}
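A read-side check would recompute the checksum the same way; a minimal sketch assuming the same header layout (this helper is not part of the diff):

static bool check_header_crc32c(blockstore_meta_header_v3_t *hdr)
{
    // the stored checksum is calculated with header_csum zeroed, so save and zero it first
    uint32_t stored = hdr->header_csum;
    hdr->header_csum = 0;
    uint32_t calc = crc32c(0, hdr, hdr->version == BLOCKSTORE_META_FORMAT_HEAP
        ? sizeof(blockstore_meta_header_v3_t) : sizeof(blockstore_meta_header_v2_t));
    hdr->header_csum = stored;
    return calc == stored;
}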
void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
{
if (retval == -EAGAIN)
@@ -703,85 +368,7 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
exit(1);
}
void blockstore_impl_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
uint64_t blockstore_impl_t::get_free_block_count()
{
for (auto & np: no_inode_stats)
{
np.second = 2;
}
for (auto pool_id: pool_ids)
{
if (!no_inode_stats[pool_id])
recalc_inode_space_stats(pool_id, false);
no_inode_stats[pool_id] = 1;
}
for (auto np_it = no_inode_stats.begin(); np_it != no_inode_stats.end(); )
{
if (np_it->second == 2)
{
recalc_inode_space_stats(np_it->first, true);
no_inode_stats.erase(np_it++);
}
else
np_it++;
}
}
void blockstore_impl_t::recalc_inode_space_stats(uint64_t pool_id, bool per_inode)
{
auto sp_begin = inode_space_stats.lower_bound((pool_id << (64-POOL_ID_BITS)));
auto sp_end = inode_space_stats.lower_bound(((pool_id+1) << (64-POOL_ID_BITS)));
inode_space_stats.erase(sp_begin, sp_end);
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
while (sh_it != clean_db_shards.end() &&
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
{
for (auto & pair: sh_it->second)
{
uint64_t space_id = per_inode ? pair.first.inode : (pool_id << (64-POOL_ID_BITS));
inode_space_stats[space_id] += dsk.data_block_size;
}
sh_it++;
}
object_id last_oid = {};
bool last_exists = false;
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ .oid = { .inode = (pool_id << (64-POOL_ID_BITS)) } });
while (dirty_it != dirty_db.end() && (dirty_it->first.oid.inode >> (64-POOL_ID_BITS)) == pool_id)
{
if (IS_STABLE(dirty_it->second.state) && (IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)))
{
bool exists = false;
if (last_oid == dirty_it->first.oid)
{
exists = last_exists;
}
else
{
auto & clean_db = clean_db_shard(dirty_it->first.oid);
auto clean_it = clean_db.find(dirty_it->first.oid);
exists = clean_it != clean_db.end();
}
uint64_t space_id = per_inode ? dirty_it->first.oid.inode : (pool_id << (64-POOL_ID_BITS));
if (IS_BIG_WRITE(dirty_it->second.state))
{
if (!exists)
inode_space_stats[space_id] += dsk.data_block_size;
last_exists = true;
}
else
{
if (exists)
{
auto & sp = inode_space_stats[space_id];
if (sp > dsk.data_block_size)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(space_id);
}
last_exists = false;
}
last_oid = dirty_it->first.oid;
}
dirty_it++;
}
return dsk.block_count - heap->get_data_used_space()/dsk.data_block_size;
}


@@ -5,6 +5,7 @@
#include "blockstore.h"
#include "blockstore_disk.h"
#include "blockstore_heap.h"
#include <sys/types.h>
#include <sys/ioctl.h>
@@ -21,45 +22,16 @@
#include <unordered_map>
#include <unordered_set>
#include "cpp-btree/btree_map.h"
#include "malloc_or_die.h"
#include "allocator.h"
//#define BLOCKSTORE_DEBUG
// States are not stored on disk. Instead, they're deduced from the journal
#define BS_ST_SMALL_WRITE 0x01
#define BS_ST_BIG_WRITE 0x02
#define BS_ST_DELETE 0x03
#define BS_ST_WAIT_DEL 0x10
#define BS_ST_WAIT_BIG 0x20
#define BS_ST_IN_FLIGHT 0x30
#define BS_ST_SUBMITTED 0x40
#define BS_ST_WRITTEN 0x50
#define BS_ST_SYNCED 0x60
#define BS_ST_STABLE 0x70
#define BS_ST_INSTANT 0x100
#define IMMEDIATE_NONE 0
#define IMMEDIATE_SMALL 1
#define IMMEDIATE_ALL 2
#define BS_ST_TYPE_MASK 0x0F
#define BS_ST_WORKFLOW_MASK 0xF0
#define IS_IN_FLIGHT(st) (((st) & 0xF0) <= BS_ST_SUBMITTED)
#define IS_STABLE(st) (((st) & 0xF0) == BS_ST_STABLE)
#define IS_SYNCED(st) (((st) & 0xF0) >= BS_ST_SYNCED)
#define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE)
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT))
#define BS_SUBMIT_CHECK_SQES(n) \
if (ringloop->sqes_left() < (n))\
if (ringloop->space_left() < (n))\
{\
/* Pause until there are more requests available */\
PRIV(op)->wait_detail = (n);\
@@ -91,13 +63,6 @@
return 0;\
}
#include "blockstore_journal.h"
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
// metadata header (superblock)
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
{
@@ -122,75 +87,26 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
uint32_t header_csum;
};
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
struct __attribute__((__packed__)) clean_disk_entry
struct __attribute__((__packed__)) blockstore_meta_header_v3_t
{
object_id oid;
uint64_t zero;
uint64_t magic;
uint64_t version;
uint8_t bitmap[];
// Two more fields come after bitmap in metadata version 2:
// uint32_t data_csum[];
// uint32_t entry_csum;
};
uint32_t meta_block_size;
uint32_t data_block_size;
uint32_t bitmap_granularity;
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
uint64_t compacted_lsn;
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
struct __attribute__((__packed__)) clean_entry
{
uint64_t version;
uint64_t location;
void set_crc32c();
};
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
struct __attribute__((__packed__)) dirty_entry
{
uint32_t state;
uint32_t flags; // unneeded, but present for alignment
uint64_t location; // location in either journal or data -> in BYTES
uint32_t offset; // data offset within object (stripe)
uint32_t len; // data length
uint64_t journal_sector; // journal sector used for this entry
void* dyn_data; // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
};
// - Sync must be submitted after previous writes/deletes (not before!)
// - Reads to the same object must be submitted after previous writes/deletes
// are written (not necessarily synced) in their location. This is because we
// rely on read-modify-write for erasure coding and we must return new data
// to calculate parity for subsequent writes
// - Writes may be submitted in any order, because they don't overlap. Each write
// goes into a new location - either on the journal device or on the data device
// - Stable (stabilize) must be submitted after sync of that object is completed
// It's even OK to return an error to the caller if that object is not synced yet
// - Journal trim may be processed only after all versions are moved to
// the main storage AND after all read operations for older versions complete
// - If an operation can not be submitted because the ring is full
// we should stop submission of other operations. Otherwise some "scatter" reads
// may end up blocked for a long time.
// Otherwise, the submit order is free, that is all operations may be submitted immediately
// In fact, adding a write operation must immediately result in dirty_db being populated
// Suspend operation until there are more free SQEs
#define WAIT_SQE 1
// Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
#define WAIT_JOURNAL 3
// Suspend operation until the next journal sector buffer is free
#define WAIT_JOURNAL_BUFFER 4
// Suspend operation until there is some free space on the data device
#define WAIT_FREE 5
struct used_clean_obj_t
{
int refs;
bool was_freed; // was freed by a parallel flush?
bool was_changed; // was changed by a parallel flush?
};
// https://github.com/algorithm-ninja/cpp-btree
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
// Suspend until something is compacted
#define WAIT_COMPACTION 2
#include "blockstore_init.h"
@@ -203,58 +119,47 @@ struct blockstore_op_private_t
{
// Wait status
int wait_for;
uint64_t wait_detail, wait_detail2;
uint64_t wait_detail;
int pending_ops;
int op_state;
// Read, write, sync, stabilize
uint64_t lsn;
// Read
uint64_t clean_block_used;
std::vector<copy_buffer_t> read_vec;
// Sync, write
uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
// Write
uint64_t location;
bool is_big;
// Stabilize, rollback
int stab_pos;
// Stabilize
uint64_t to_lsn;
// Write
struct iovec iov_zerofill[3];
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
uint64_t real_version;
timespec tv_begin;
// Sync
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
};
typedef uint32_t pool_id_t;
typedef uint64_t pool_pg_id_t;
#define POOL_ID_BITS 16
struct pool_shard_settings_t
{
uint32_t pg_count;
uint32_t pg_stripe_size;
};
#define STAB_SPLIT_DONE 1
#define STAB_SPLIT_WAIT 2
#define STAB_SPLIT_SYNC 3
#define STAB_SPLIT_TODO 4
class blockstore_impl_t
{
public:
blockstore_disk_t dsk;
/******* OPTIONS *******/
bool readonly = false;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Enable if you want every operation to be executed with an "implicit fsync"
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
int immediate_commit = IMMEDIATE_NONE;
bool inmemory_meta = false;
uint32_t meta_write_recheck_parallelism = 0;
// Maximum and minimum flusher count
unsigned max_flusher_count, min_flusher_count;
unsigned journal_trim_interval;
unsigned max_flusher_count = 0, min_flusher_count = 0;
unsigned journal_trim_interval = 0;
unsigned flusher_start_threshold = 0;
// Maximum queue depth
unsigned max_write_iodepth = 128;
// Enable small (journaled) write throttling, useful for the SSD+HDD case
@@ -269,139 +174,89 @@ class blockstore_impl_t
uint64_t autosync_writes = 128;
// Log level (0-10)
int log_level = 0;
// Enable correct block checksum validation on objects updated with small writes when checksum block
// is larger than bitmap_granularity, at the expense of extra metadata fsyncs during compaction
bool perfect_csum_update = false;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
std::map<uint64_t, int> no_inode_stats;
uint8_t *clean_bitmaps = NULL;
blockstore_dirty_db_t dirty_db;
blockstore_heap_t *heap = NULL;
uint8_t* meta_superblock = NULL;
uint8_t *buffer_area = NULL;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_big_write_count = 0, unsynced_small_write_count = 0, unsynced_meta_write_count = 0;
int unsynced_queued_ops = 0;
allocator_t *data_alloc = NULL;
uint64_t used_blocks = 0;
uint8_t *zero_object = NULL;
void *metadata_buffer = NULL;
struct journal_t journal;
journal_flusher_t *flusher;
int big_to_flush = 0;
int write_iodepth = 0;
bool alloc_dyn_data = false;
// clean data blocks referenced by read operations
std::map<uint64_t, used_clean_obj_t> used_clean_objects;
int inflight_big = 0;
bool fsyncing_data = false;
bool live = false, queue_stall = false;
ring_loop_t *ringloop;
timerfd_manager_t *tfd;
ring_loop_i *ringloop = NULL;
timerfd_manager_t *tfd = NULL;
bool stop_sync_submitted;
bool stop_sync_submitted = false;
inline struct io_uring_sqe* get_sqe()
{
return ringloop->get_sqe();
}
friend class blockstore_init_meta;
friend class blockstore_init_journal;
friend struct blockstore_journal_check_t;
friend class journal_flusher_t;
friend class journal_flusher_co;
void calc_lengths();
void open_data();
void open_meta();
void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
void disk_error_abort(const char *op, int retval, int expected);
// Asynchronous init
int initialized;
int metadata_buf_size;
blockstore_init_meta* metadata_init_reader;
blockstore_init_journal* journal_init_reader;
void init();
void check_wait(blockstore_op_t *op);
void init_op(blockstore_op_t *op);
// Read
int dequeue_read(blockstore_op_t *read_op);
int dequeue_read(blockstore_op_t *op);
int fulfill_read(blockstore_op_t *op);
uint32_t prepare_read(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
uint32_t prepare_read_with_bitmaps(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
uint32_t prepare_read_zero(std::vector<copy_buffer_t> & read_vec, uint32_t start, uint32_t end);
uint32_t prepare_read_simple(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
void prepare_disk_read(std::vector<copy_buffer_t> & read_vec, int pos, heap_object_t *obj, heap_write_t *wr,
uint32_t blk_start, uint32_t blk_end, uint32_t start, uint32_t end, uint32_t copy_flags);
void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
std::function<int(int, bool, uint32_t, uint32_t)> callback);
int fulfill_read(blockstore_op_t *read_op,
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data,
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
uint32_t item_state, uint64_t item_version);
std::function<void(int&, uint32_t, uint32_t)> callback);
void free_read_buffers(std::vector<copy_buffer_t> & rv);
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
bool verify_read_checksums(blockstore_op_t *op);
// Write
bool enqueue_write(blockstore_op_t *op);
void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
void prepare_meta_block_write(blockstore_op_t *op, uint64_t modified_block, io_uring_sqe *sqe = NULL);
int dequeue_write(blockstore_op_t *op);
int dequeue_del(blockstore_op_t *op);
int make_big_write(blockstore_op_t *op, uint32_t offset, uint32_t len, uint32_t *modified_block, uint32_t *moved_from_block);
int continue_write(blockstore_op_t *op);
void release_journal_sectors(blockstore_op_t *op);
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
// Sync
int continue_sync(blockstore_op_t *op);
void ack_sync(blockstore_op_t *op);
bool submit_fsyncs(int & wait_count);
int do_sync(blockstore_op_t *op, int base_state);
// Stabilize
int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op);
void mark_stable(obj_ver_id ov, bool forget_dirty = false);
void stabilize_object(object_id oid, uint64_t max_ver);
blockstore_op_t* selective_sync(blockstore_op_t *op);
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
void free_dirty_dyn_data(dirty_entry & e);
// List
void process_list(blockstore_op_t *op);
public:
/*public:*/
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode = false);
~blockstore_impl_t();
void parse_config(blockstore_config_t & config, bool init);
@@ -427,21 +282,13 @@ public:
// Simplified synchronous operation: get object bitmap & current version
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
// Unstable writes are added here (map of object_id -> version)
std::unordered_map<object_id, uint64_t> unstable_writes;
// Space usage statistics
std::map<uint64_t, uint64_t> inode_space_stats;
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
// Print diagnostics to stdout
void dump_diagnostics();
const std::map<uint64_t, uint64_t> & get_inode_space_stats() { return heap->get_inode_space_stats(); }
inline uint32_t get_block_size() { return dsk.data_block_size; }
inline uint64_t get_block_count() { return dsk.block_count; }
inline uint64_t get_free_block_count() { return dsk.block_count - used_blocks; }
uint64_t get_free_block_count();
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
inline uint64_t get_journal_size() { return dsk.journal_len; }
};

File diff suppressed because it is too large


@@ -25,47 +25,10 @@ class blockstore_init_meta
uint64_t next_offset = 0;
uint64_t last_read_offset = 0;
uint64_t entries_loaded = 0;
unsigned entries_per_block = 0;
int i = 0, j = 0;
std::vector<uint64_t> entries_to_zero;
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);
int loop();
};
struct bs_init_journal_done
{
void *buf;
uint64_t pos, len;
};
class blockstore_init_journal
{
blockstore_impl_t *bs;
int wait_state = 0, wait_count = 0, handle_res = 0;
uint64_t entries_loaded = 0;
uint32_t crc32_last = 0;
bool started = false;
uint64_t next_free;
std::vector<bs_init_journal_done> done;
std::vector<obj_ver_id> double_allocs;
std::vector<iovec> small_write_data;
uint64_t journal_pos = 0;
uint64_t continue_pos = 0;
void *init_write_buf = NULL;
uint64_t init_write_sector = 0;
bool wrapped = false;
void *submitted_buf;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
journal_entry_start *je_start;
std::function<void(ring_data_t*)> simple_callback;
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
void handle_event(ring_data_t *data);
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
public:
blockstore_init_journal(blockstore_impl_t* bs);
int loop();
};


@@ -1,328 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
{
this->bs = bs;
sectors_to_write = 0;
next_pos = bs->journal.next_free;
next_sector = bs->journal.cur_sector;
first_sector = -1;
next_in_pos = bs->journal.in_sector_pos;
right_dir = next_pos >= bs->journal.used_start;
}
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
{
uint64_t prev_next = next_sector;
int required = entries_required;
while (1)
{
int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
? 0
: (bs->journal.block_size - next_in_pos) / size;
if (fits > 0)
{
if (fits > required)
{
fits = required;
}
if (first_sector == -1)
{
first_sector = next_sector;
}
required -= fits;
next_in_pos += fits * size;
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
else if (bs->journal.sector_info[next_sector].dirty)
{
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
if (required <= 0)
{
break;
}
next_pos = next_pos + bs->journal.block_size;
if (next_pos >= bs->journal.len)
{
next_pos = bs->journal.block_size;
right_dir = false;
}
next_in_pos = 0;
next_sector = ((next_sector + 1) % bs->journal.sector_count);
if (next_sector == first_sector)
{
// next_sector may wrap when all sectors are flushed and the incoming batch is too big
// This is an error condition, we can't wait for anything in this case
throw std::runtime_error(
"Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
" is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
);
}
if (bs->journal.sector_info[next_sector].flush_count > 0 ||
bs->journal.sector_info[next_sector].dirty)
{
// No memory buffer available. Wait for it.
int used = 0, dirty = 0;
for (int i = 0; i < bs->journal.sector_count; i++)
{
if (bs->journal.sector_info[i].dirty)
{
dirty++;
used++;
}
if (bs->journal.sector_info[i].flush_count > 0)
{
used++;
}
}
// In fact, it's even more rare than "ran out of journal space", so print a warning
printf(
"Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
" is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
used, bs->journal.sector_count, dirty, next_sector,
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
bs->journal.sector_info[next_sector].flush_count
);
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
return 0;
}
}
if (data_after > 0)
{
next_pos = next_pos + data_after;
if (next_pos >= bs->journal.len)
{
if (right_dir)
next_pos = bs->journal.block_size + data_after;
right_dir = false;
}
}
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
{
// No space in the journal. Wait until used_start changes.
if (bs->log_level > 5)
{
printf(
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
);
}
PRIV(op)->wait_for = WAIT_JOURNAL;
bs->flusher->request_trim();
PRIV(op)->wait_detail = bs->journal.used_start;
return 0;
}
return 1;
}
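For reference, the typical caller sequence for this check (it is essentially the pattern used by dequeue_rollback() and dequeue_stable() later in this compare) looks roughly like this:

// Sketch of the caller pattern for blockstore_journal_check_t (names taken from this compare):
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
    return 0; // not enough journal space or sector buffers - the op waits (WAIT_JOURNAL / WAIT_JOURNAL_BUFFER)
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write); // then reserve one SQE per journal sector to be written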
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
{
if (!journal.entry_fits(size))
{
assert(!journal.sector_info[journal.cur_sector].dirty);
// Move to the next journal sector
if (journal.sector_info[journal.cur_sector].flush_count > 0)
{
// Also select next sector buffer in memory
journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
assert(!journal.sector_info[journal.cur_sector].flush_count);
}
else
{
journal.dirty_start = journal.next_free;
}
journal.sector_info[journal.cur_sector].written = false;
journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0;
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
// double check that next_free doesn't cross used_start from the left
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
journal.next_free = next_next_free;
memset(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
}
journal_entry *je = (struct journal_entry*)(
(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
);
journal.in_sector_pos += size;
je->magic = JOURNAL_MAGIC;
je->type = type;
je->size = size;
je->crc32_prev = journal.crc32_last;
journal.sector_info[journal.cur_sector].dirty = true;
return je;
}
void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_op_t *op)
{
// Don't submit the same sector twice in the same batch
if (!journal.sector_info[cur_sector].submit_id)
{
io_uring_sqe *sqe = get_sqe();
// Caller must ensure availability of an SQE
assert(sqe != NULL);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
journal.sector_info[cur_sector].written = true;
journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
assert(journal.submit_id != 0); // check overflow
journal.submitting_sectors.push_back(cur_sector);
journal.sector_info[cur_sector].flush_count++;
data->iov = (struct iovec){
(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
(size_t)journal.block_size
};
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
io_uring_prep_writev(
sqe, dsk.journal_fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
);
}
journal.sector_info[cur_sector].dirty = false;
// But always remember that this operation has to wait until this exact journal write is finished
journal.flushing_ops.emplace(journal.sector_info[cur_sector].submit_id, (pending_journaling_t){
.pending = 1,
.sector = cur_sector,
.op = op,
});
auto priv = PRIV(op);
priv->pending_ops++;
if (!priv->min_flushed_journal_sector)
priv->min_flushed_journal_sector = 1+cur_sector;
assert(priv->min_flushed_journal_sector <= journal.sector_count);
priv->max_flushed_journal_sector = 1+cur_sector;
}
void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_id)
{
live = true;
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
disk_error_abort("journal write", data->res, data->iov.iov_len);
}
auto fl_it = journal.flushing_ops.lower_bound(flush_id);
if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
{
journal.sector_info[fl_it->second.sector].flush_count--;
}
auto is_first = fl_it == journal.flushing_ops.begin();
while (fl_it != journal.flushing_ops.end())
{
bool del = false;
if (fl_it->first == flush_id)
{
fl_it->second.pending = 0;
del = is_first;
}
else
{
del = !fl_it->second.pending;
}
if (del)
{
// Do not complete this operation if previous writes are unfinished
// Otherwise also complete following operations waiting for this one
auto priv = PRIV(fl_it->second.op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
{
release_journal_sectors(fl_it->second.op);
priv->op_state++;
ringloop->wakeup();
}
journal.flushing_ops.erase(fl_it++);
}
else
{
fl_it++;
}
}
}
journal_t::~journal_t()
{
if (sector_buf)
free(sector_buf);
if (sector_info)
free(sector_info);
if (buffer)
free(buffer);
sector_buf = NULL;
sector_info = NULL;
buffer = NULL;
}
uint64_t journal_t::get_trim_pos()
{
auto journal_used_it = used_sectors.lower_bound(used_start);
if (journal_used_it == used_sectors.end())
{
// Journal is cleared to its end, restart from the beginning
journal_used_it = used_sectors.begin();
if (journal_used_it == used_sectors.end())
{
// Journal is empty
return next_free;
}
else
{
// next_free does not need updating during trim
#ifdef BLOCKSTORE_DEBUG
printf(
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
used_start, next_free, dirty_start,
journal_used_it->first, journal_used_it->second
);
#endif
return journal_used_it->first;
}
}
else if (journal_used_it->first > used_start)
{
// Journal is cleared up to <journal_used_it>
#ifdef BLOCKSTORE_DEBUG
printf(
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
used_start, next_free, dirty_start,
journal_used_it->first, journal_used_it->second
);
#endif
return journal_used_it->first;
}
// Can't trim journal
return used_start;
}
void journal_t::dump_diagnostics()
{
auto journal_used_it = used_sectors.lower_bound(used_start);
if (journal_used_it == used_sectors.end())
{
// Journal is cleared to its end, restart from the beginning
journal_used_it = used_sectors.begin();
}
printf(
"Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
used_start, next_free, dirty_start,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
);
}
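get_trim_pos() relies on the per-sector reference counts kept in used_sectors. Pieced together from other code in this compare, the lifecycle is roughly as follows (a sketch only; sector_offset is a placeholder name, and the last line is an assumption about what journal_t::trim() does, since its body is not shown here):

// An entry written into the current sector takes a reference (see the big-write journaling code below):
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
// Flushing or erasing the entry drops the reference (see erase_dirty()):
if (--journal.used_sectors.at(sector_offset) == 0)
    journal.used_sectors.erase(sector_offset);
// Trimming then advances used_start to the first offset that still holds references:
journal.used_start = journal.get_trim_pos();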


@@ -2,6 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <stdexcept>
#include "blockstore_impl.h"
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
@@ -14,12 +15,14 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
}
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
flusher_start_threshold = strtoull(config["flusher_start_threshold"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
perfect_csum_update = config["perfect_csum_update"] == "true" || config["perfect_csum_update"] == "1" || config["perfect_csum_update"] == "yes";
if (config["autosync_writes"] != "")
{
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
@@ -28,13 +31,17 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
if (!min_flusher_count)
{
min_flusher_count = 1;
}
if (!journal_trim_interval)
{
journal_trim_interval = 512;
journal_trim_interval = 1024;
}
if (!flusher_start_threshold)
{
flusher_start_threshold = 32;
}
if (!max_write_iodepth)
{
@@ -68,23 +75,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
readonly = true;
}
if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
{
disable_data_fsync = true;
}
if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
{
disable_meta_fsync = true;
}
if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
{
disable_journal_fsync = true;
}
if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
{
// Only flush journal and exit
journal.flush_journal = true;
}
if (config["immediate_commit"] == "all")
{
immediate_commit = IMMEDIATE_ALL;
@@ -94,85 +84,23 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
immediate_commit = IMMEDIATE_SMALL;
}
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
meta_write_recheck_parallelism = strtoull(config["meta_write_recheck_parallelism"].c_str(), NULL, 10);
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
// Validate
if (journal.sector_count < 2)
{
journal.sector_count = 32;
}
if (metadata_buf_size < 65536)
{
metadata_buf_size = 4*1024*1024;
}
if (dsk.meta_device == dsk.data_device)
if (!meta_write_recheck_parallelism)
{
disable_meta_fsync = disable_data_fsync;
meta_write_recheck_parallelism = 16;
}
if (dsk.journal_device == dsk.meta_device)
{
disable_journal_fsync = disable_meta_fsync;
}
if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
if (immediate_commit != IMMEDIATE_NONE && !dsk.disable_journal_fsync)
{
throw std::runtime_error("immediate_commit requires disable_journal_fsync");
}
if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
if (immediate_commit == IMMEDIATE_ALL && !dsk.disable_data_fsync)
{
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
}
// init some fields
journal.block_size = dsk.journal_block_size;
journal.next_free = dsk.journal_block_size;
journal.used_start = dsk.journal_block_size;
// no free space because sector is initially unmapped
journal.in_sector_pos = dsk.journal_block_size;
}
void blockstore_impl_t::calc_lengths()
{
dsk.calc_lengths();
journal.len = dsk.journal_len;
journal.block_size = dsk.journal_block_size;
journal.offset = dsk.journal_offset;
if (inmemory_meta)
{
metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
if (!metadata_buffer)
throw std::runtime_error("Failed to allocate memory for the metadata ("+std::to_string(dsk.meta_len/1024/1024)+" MB)");
}
else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
{
clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
if (!clean_bitmaps)
{
throw std::runtime_error(
"Failed to allocate memory for the metadata sparse write bitmap ("+
std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
);
}
}
if (journal.inmemory)
{
journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
if (!journal.buffer)
throw std::runtime_error("Failed to allocate memory for journal ("+std::to_string(journal.len/1024/1024)+" MB)");
}
else
{
journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * dsk.journal_block_size);
if (!journal.sector_buf)
throw std::bad_alloc();
}
journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
if (!journal.sector_info)
{
throw std::bad_alloc();
}
}

File diff suppressed because it is too large


@@ -1,258 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_rollback(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
{
// Check that there are some versions greater than v->version (which may be zero),
// check that they're unstable, synced, and not currently written to
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
});
if (dirty_it == dirty_db.begin())
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
}
else
{
dirty_it--;
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
}
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
}
else if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state))
{
// Sync the object
return STAB_SPLIT_SYNC;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
}
return STAB_SPLIT_TODO;
}
});
if (r != 1)
{
return r;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
{
return 0;
}
// There is sufficient space. Check SQEs
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
{
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
je->oid = v->oid;
je->version = v->version;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
}
prepare_journal_sector_write(journal.cur_sector, op);
s++;
assert(s == space_check.sectors_to_write);
PRIV(op)->op_state = 1;
return 1;
}
int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 4)
goto resume_4;
else
return 1;
resume_2:
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3;
return 1;
}
resume_4:
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
mark_rolled_back(*v);
}
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
return 2;
}
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
{
auto it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
});
if (it != dirty_db.begin())
{
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
it--;
while (1)
{
if (it->first.oid != ov.oid)
break;
else if (it->first.version <= ov.version)
{
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
break;
}
else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
break;
// Remove entry
rm_start = it;
if (it == dirty_db.begin())
break;
it--;
}
if (rm_start != rm_end)
{
erase_dirty(rm_start, rm_end, UINT64_MAX);
auto unstab_it = unstable_writes.find(ov.oid);
if (unstab_it != unstable_writes.end())
{
if (max_unstable == 0)
unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
}
}
}
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
{
if (dirty_end == dirty_start)
{
return;
}
auto dirty_it = dirty_end;
dirty_it--;
if (IS_DELETE(dirty_it->second.state))
{
object_id oid = dirty_it->first.oid;
#ifdef BLOCKSTORE_DEBUG
printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
#endif
dirty_it = dirty_end;
// Unblock operations blocked by delete flushing
uint32_t next_state = BS_ST_IN_FLIGHT;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
{
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
if (IS_BIG_WRITE(dirty_it->second.state))
{
next_state = BS_ST_WAIT_BIG;
}
}
dirty_it++;
}
dirty_it = dirty_end;
dirty_it--;
}
while (1)
{
if ((IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)) &&
IS_STABLE(dirty_it->second.state))
{
big_to_flush--;
}
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
dirty_it->second.location != UINT64_MAX)
{
#ifdef BLOCKSTORE_DEBUG
printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
}
auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
#endif
if (used == 0)
{
journal.used_sectors.erase(dirty_it->second.journal_sector);
if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
{
// Mark current sector as "full" to select the new one
journal.in_sector_pos = dsk.journal_block_size;
}
flusher->mark_trim_possible();
}
free_dirty_dyn_data(dirty_it->second);
if (dirty_it == dirty_start)
{
break;
}
dirty_it--;
}
dirty_db.erase(dirty_start, dirty_end);
}
void blockstore_impl_t::free_dirty_dyn_data(dirty_entry & e)
{
if (e.dyn_data)
{
if (alloc_dyn_data &&
--*((int*)e.dyn_data) == 0) // refcount
{
// dyn_data contains the bitmap and checksums
// free it if it doesn't refer to the in-memory journal
free(e.dyn_data);
}
e.dyn_data = NULL;
}
}
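The dyn_data handling above pairs with the "+ sizeof(int)" offset used in the sync path later in this compare. The implied layout, shown as a sketch (only for the alloc_dyn_data case, where dyn_data is a separate heap allocation), is:

//   +--------------+---------------------+-------------------------+
//   | int refcount | clean entry bitmap  | data checksums (if any) |
//   +--------------+---------------------+-------------------------+
//   ^ e.dyn_data    ^ e.dyn_data + sizeof(int)
//
// Several dirty_db entries may share one such allocation; the free() above only
// happens once the refcount stored in the leading int drops to zero.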


@@ -3,559 +3,87 @@
#include "blockstore_impl.h"
// Stabilize small write:
// 1) Copy data from the journal to the data device
// 2) Increase version on the metadata device and sync it
// 3) Advance clean_db entry's version, clear previous journal entries
//
// This makes 1 4K small write+sync look like:
// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
// WA = 2.375. It's not the best, SSD FTL-like redirect-write could probably be lower
// even with defragmentation. But it's fixed and it's still better than in Ceph. :)
// except for HDD-only clusters, because each write results in 3 seeks.
// Stabilize big write:
// 1) Copy metadata from the journal to the metadata device
// 2) Move dirty_db entry to clean_db and clear previous journal entries
//
// This makes 1 128K big write+sync look like:
// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
// WA = 1.012. Very good :)
// Stabilize delete:
// 1) Remove metadata entry and sync it
// 2) Remove dirty_db entry and clear previous journal entries
// We have 2 problems here:
// - In the cluster environment, we must store the "tombstones" of deleted objects until
// all replicas (not just quorum) agrees about their deletion. That is, "stabilize" is
// not possible for deletes in degraded placement groups
// - With simple "fixed" metadata tables we can't just clear the metadata entry of the latest
// object version. We must clear all previous entries, too.
// FIXME Fix both problems - probably, by switching from "fixed" metadata tables to "dynamic"
// AND We must do it in batches, for the sake of reduced fsync call count
// AND We must know what we stabilize. Basic workflow is like:
// 1) primary OSD receives sync request
// 2) it submits syncs to blockstore and peers
// 3) after everyone acks sync it acks sync to the client
// 4) after a while it takes its synced object list and sends stabilize requests
// to peers and to its own blockstore, thus freeing the old version
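Spelled out, the write amplification figures quoted above follow directly from the listed I/O sizes:

small write: (512 + 4096 + 512 + 4096 + 512) / 4096 = 9728 / 4096 = 2.375
big write:   (131072 + 512 + 512 + 512) / 131072 = 132608 / 131072 ≈ 1.012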
struct ver_vector_t
{
obj_ver_id *items = NULL;
uint64_t alloc = 0, size = 0;
};
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
{
if (!vec.items)
{
vec.alloc = len;
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
for (auto sv = start; sv < end; sv++)
{
vec.items[vec.size++] = *sv;
}
}
}
static void append_version(ver_vector_t & vec, obj_ver_id ov)
{
if (vec.size >= vec.alloc)
{
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
}
vec.items[vec.size++] = ov;
}
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
{
bool found = false;
int j = 0, k = 0;
while (j < check.size())
{
if (check[j] == ov)
found = true;
if (check[j].oid == ov.oid && check[j].version <= ov.version)
{
to.push_back(check[j++]);
if (count)
(*count)--;
}
else
check[k++] = check[j++];
}
check.resize(k);
return found;
}
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
{
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
// Create a sync operation, insert into the end of the queue
// And move ourselves into the end too!
// Rather hacky but that's what we need...
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->buf = NULL;
sync_op->callback = [](blockstore_op_t *sync_op)
{
delete sync_op;
};
init_op(sync_op);
int sync_res = continue_sync(sync_op);
if (sync_res != 2)
{
// Put SYNC into the queue if it's not finished yet
submit_queue.push_back(sync_op);
}
// Restore unsynced_writes
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
if (sync_res == 2)
{
// Sync is immediately completed
return NULL;
}
return sync_op;
}
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
{
bool add_sync = false;
ver_vector_t good_vers, bad_vers;
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
int action = decider(*v);
if (action < 0)
{
// Rollback changes
for (auto & ov: PRIV(op)->sync_big_writes)
{
unsynced_big_writes.push_back(ov);
unsynced_big_write_count++;
}
for (auto & ov: PRIV(op)->sync_small_writes)
{
unsynced_small_writes.push_back(ov);
}
free(good_vers.items);
good_vers.items = NULL;
free(bad_vers.items);
bad_vers.items = NULL;
// Error
op->retval = action;
FINISH_OP(op);
return 2;
}
else if (action == STAB_SPLIT_DONE)
{
// Already done
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
}
else if (action == STAB_SPLIT_WAIT)
{
// Already in progress, we just have to wait until it finishes
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
}
else if (action == STAB_SPLIT_SYNC)
{
// Needs a SYNC, we have to send a SYNC if not already in progress
//
// If the object is not present in unsynced_(big|small)_writes then
// it's currently being synced. If it's present then we can initiate
// its sync ourselves.
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
if (!add_sync)
{
PRIV(op)->sync_big_writes.clear();
PRIV(op)->sync_small_writes.clear();
add_sync = true;
}
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
}
else /* if (action == STAB_SPLIT_TODO) */
{
if (good_vers.items)
{
// If we're selecting versions then append it
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
// And we don't want to select/allocate anything in that optimistic case
append_version(good_vers, *v);
}
todo++;
}
}
// In a pessimistic scenario, an operation may be split into 3:
// - Stabilize synced entries
// - Sync unsynced entries
// - Continue for unsynced entries after sync
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
if (!todo && !bad_vers.size)
{
// Already stable
op->retval = 0;
FINISH_OP(op);
return 2;
}
op->retval = 0;
if (!todo && !add_sync)
{
// Only wait for inflight writes or current in-progress syncs
return 0;
}
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
if (add_sync)
{
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
sync_op = selective_sync(op);
}
if (bad_vers.size)
{
// Split part of the request into a separate operation
split_stab_op = new blockstore_op_t;
split_stab_op->opcode = op->opcode;
split_stab_op->buf = bad_vers.items;
split_stab_op->len = bad_vers.size;
init_op(split_stab_op);
submit_queue.push_back(split_stab_op);
}
if (sync_op || split_stab_op || good_vers.items)
{
void *orig_buf = op->buf;
if (good_vers.items)
{
op->buf = good_vers.items;
op->len = good_vers.size;
}
// Make a wrapped callback
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
auto cb = [op, good_items = good_vers.items,
bad_items = bad_vers.items, split_op_counter,
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
{
if (split_op->retval != 0)
op->retval = split_op->retval;
(*split_op_counter)--;
assert((*split_op_counter) >= 0);
if (op != split_op)
delete split_op;
if (!*split_op_counter)
{
free(good_items);
free(bad_items);
free(split_op_counter);
op->buf = orig_buf;
real_cb(op);
}
};
if (sync_op)
{
sync_op->callback = cb;
}
if (split_stab_op)
{
split_stab_op->callback = cb;
}
op->callback = cb;
}
if (!todo)
{
// All work is postponed
op->callback = NULL;
return 2;
}
return 1;
}
// Handles both stabilize (commit) and rollback
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_stable(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
{
auto dirty_it = dirty_db.find(ov);
if (dirty_it == dirty_db.end())
{
auto & clean_db = clean_db_shard(ov.oid);
auto clean_it = clean_db.find(ov.oid);
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
{
// No such object version
printf("Error: %jx:%jx v%ju not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
return -ENOENT;
}
else
{
// Already stable
return STAB_SPLIT_DONE;
}
}
else if (IS_STABLE(dirty_it->second.state))
{
// Already stable
return STAB_SPLIT_DONE;
}
while (true)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
}
else if (!IS_SYNCED(dirty_it->second.state))
{
// Object not synced yet - sync it
// In previous versions we returned EBUSY here and required
// the caller (OSD) to issue a global sync first. But a global sync
// waits for all writes in the queue including inflight writes. And
// inflight writes may themselves be blocked by unstable writes being
// still present in the journal and not flushed away from it.
// So we must sync specific objects here.
//
// Even more, we have to process "stabilize" request in parts. That is,
// we must stabilize all objects which are already synced. Otherwise
// they may block objects which are NOT synced yet.
return STAB_SPLIT_SYNC;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
// Check previous versions too
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != ov.oid)
{
break;
}
}
return STAB_SPLIT_TODO;
});
if (r != 1)
{
return r;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
{
return 0;
}
// There is sufficient space. Check SQEs
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
{
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
journal_entry_stable *je = (journal_entry_stable*)
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
je->oid = v->oid;
je->version = v->version;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
}
prepare_journal_sector_write(journal.cur_sector, op);
s++;
assert(s == space_check.sectors_to_write);
PRIV(op)->op_state = 1;
return 1;
}
int blockstore_impl_t::continue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 4)
goto resume_4;
else
return 1;
resume_2:
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3;
return 1;
}
resume_4:
// Mark dirty_db entries as stable, acknowledge op completion
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// Mark all dirty_db entries up to op->version as stable
#ifdef BLOCKSTORE_DEBUG
printf("Stabilize %jx:%jx v%ju\n", v->oid.inode, v->oid.stripe, v->version);
#endif
mark_stable(*v);
}
// Acknowledge op
obj_ver_id *v = (obj_ver_id*)op->buf;
auto priv = PRIV(op);
if (priv->op_state == 1) goto resume_1;
else if (priv->op_state == 2) goto resume_2;
else if (priv->op_state == 3) goto resume_3;
else if (priv->op_state == 4) goto resume_4;
assert(!priv->op_state);
// Modify in-memory state and assign contiguous LSNs
priv->stab_pos = 0;
priv->lsn = priv->to_lsn = 0;
op->retval = 0;
while (priv->stab_pos < op->len)
{
uint32_t modified_block = 0;
uint64_t new_lsn = 0;
uint64_t new_to_lsn = 0;
int res = op->opcode == BS_OP_STABLE
? heap->post_stabilize(v[priv->stab_pos].oid, v[priv->stab_pos].version, &modified_block, &new_lsn, &new_to_lsn)
: heap->post_rollback(v[priv->stab_pos].oid, v[priv->stab_pos].version, &new_lsn, &modified_block);
if (res != 0)
{
assert(res == ENOENT || res == EBUSY);
op->retval = -res;
}
if (new_lsn)
{
assert(priv->lsn == 0 || priv->to_lsn == new_lsn-1);
if (!priv->lsn)
priv->lsn = new_lsn;
priv->to_lsn = op->opcode == BS_OP_STABLE ? new_to_lsn : new_lsn;
}
priv->stab_pos++;
}
// Submit metadata writes
priv->stab_pos = 0;
resume_1:
priv->op_state = 1;
while (priv->stab_pos < op->len)
{
uint32_t block_num = 0;
heap_object_t *obj = heap->read_entry(v[priv->stab_pos].oid, &block_num);
if (obj)
{
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
if (priv->pending_ops > 0)
return 1;
priv->wait_detail = 1;
priv->wait_for = WAIT_SQE;
return 0;
}
prepare_meta_block_write(op, block_num, sqe);
}
priv->stab_pos++;
}
if (priv->pending_ops > 0)
{
priv->op_state = 1;
return 1;
}
// Mark writes as completed to allow compaction
for (uint64_t lsn = priv->lsn; lsn <= priv->to_lsn; lsn++)
{
heap->mark_lsn_completed(lsn);
}
unsynced_meta_write_count++;
// Fsync, just because our semantics imply that commit (stabilize) is immediately fsynced
priv->op_state = 2;
resume_2:
resume_3:
resume_4:
int res = do_sync(op, 2);
if (res != 2)
{
return res;
}
// Done. Don't touch op->retval - if anything resulted in ENOENT, return it as is
FINISH_OP(op);
return 2;
}
void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
{
auto dirty_it = dirty_db.find(v);
if (dirty_it != dirty_db.end())
{
if (IS_INSTANT(dirty_it->second.state))
{
// 'Instant' (non-EC) operations may complete and try to become stable out of order. Prevent it.
auto back_it = dirty_it;
while (back_it != dirty_db.begin())
{
back_it--;
if (back_it->first.oid != v.oid)
{
break;
}
if (!IS_STABLE(back_it->second.state))
{
// There are preceding unstable versions, can't flush <v>
return;
}
}
while (true)
{
dirty_it++;
if (dirty_it == dirty_db.end() || dirty_it->first.oid != v.oid ||
!IS_SYNCED(dirty_it->second.state))
{
dirty_it--;
break;
}
v.version = dirty_it->first.version;
}
}
while (1)
{
bool was_stable = IS_STABLE(dirty_it->second.state);
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
// Allocations and deletions are counted when they're stabilized
if (IS_BIG_WRITE(dirty_it->second.state))
{
int exists = -1;
if (dirty_it != dirty_db.begin())
{
auto prev_it = dirty_it;
prev_it--;
if (prev_it->first.oid == v.oid)
{
exists = IS_DELETE(prev_it->second.state) ? 0 : 1;
}
}
if (exists == -1)
{
auto & clean_db = clean_db_shard(v.oid);
auto clean_it = clean_db.find(v.oid);
exists = clean_it != clean_db.end() ? 1 : 0;
}
if (!exists)
{
uint64_t space_id = dirty_it->first.oid.inode;
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
inode_space_stats[space_id] += dsk.data_block_size;
used_blocks++;
}
big_to_flush++;
}
else if (IS_DELETE(dirty_it->second.state))
{
uint64_t space_id = dirty_it->first.oid.inode;
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
auto & sp = inode_space_stats[space_id];
if (sp > dsk.data_block_size)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(space_id);
used_blocks--;
big_to_flush++;
}
}
else if (IS_IN_FLIGHT(dirty_it->second.state))
{
// mark_stable should never be called for in-flight or submitted writes
printf(
"BUG: Attempt to mark_stable object %jx:%jx v%ju state of which is %x\n",
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
dirty_it->second.state
);
exit(1);
}
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
IS_DELETE(dirty_it->second.state)))
{
// Big write overrides all previous dirty entries
auto erase_end = dirty_it;
while (dirty_it != dirty_db.begin())
{
dirty_it--;
if (dirty_it->first.oid != v.oid)
{
dirty_it++;
break;
}
}
auto & clean_db = clean_db_shard(v.oid);
auto clean_it = clean_db.find(v.oid);
uint64_t clean_loc = clean_it != clean_db.end()
? clean_it->second.location : UINT64_MAX;
erase_dirty(dirty_it, erase_end, clean_loc);
break;
}
if (was_stable || dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v.oid)
{
break;
}
}
flusher->enqueue_flush(v);
}
auto unstab_it = unstable_writes.find(v.oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v.version)
{
unstable_writes.erase(unstab_it);
}
}


@@ -3,231 +3,112 @@
#include "blockstore_impl.h"
#define SYNC_HAS_SMALL 1
#define SYNC_HAS_BIG 2
#define SYNC_DATA_SYNC_SENT 3
#define SYNC_DATA_SYNC_DONE 4
#define SYNC_JOURNAL_WRITE_SENT 5
#define SYNC_JOURNAL_WRITE_DONE 6
#define SYNC_JOURNAL_SYNC_SENT 7
#define SYNC_DONE 8
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{
if (immediate_commit == IMMEDIATE_ALL)
if (!PRIV(op)->op_state)
{
// We can return immediately because sync is only dequeued after all previous writes
op->retval = 0;
}
int res = do_sync(op, 0);
if (res == 2)
{
FINISH_OP(op);
return 2;
}
if (PRIV(op)->op_state == 0)
{
stop_sync_submitted = false;
unsynced_big_write_count -= unsynced_big_writes.size();
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
unsynced_big_writes.clear();
unsynced_small_writes.clear();
if (PRIV(op)->sync_big_writes.size() > 0)
PRIV(op)->op_state = SYNC_HAS_BIG;
else if (PRIV(op)->sync_small_writes.size() > 0)
PRIV(op)->op_state = SYNC_HAS_SMALL;
else
PRIV(op)->op_state = SYNC_DONE;
}
if (PRIV(op)->op_state == SYNC_HAS_SMALL)
{
// No big writes, just fsync the journal
if (journal.sector_info[journal.cur_sector].dirty)
{
// Write out the last journal sector if it happens to be dirty
BS_SUBMIT_CHECK_SQES(1);
prepare_journal_sector_write(journal.cur_sector, op);
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
}
}
if (PRIV(op)->op_state == SYNC_HAS_BIG)
{
// 1st step: fsync data
if (!disable_data_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
io_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
{
// 2nd step: Data device is synced, prepare & write journal entries
// Check space in the journal and journal memory buffers
blockstore_journal_check_t space_check(this);
if (dsk.csum_block_size)
{
// More complex check because all journal entries have different lengths
int left = PRIV(op)->sync_big_writes.size();
for (auto & sbw: PRIV(op)->sync_big_writes)
{
left--;
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, 0))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
{
return 0;
}
// Check SQEs. Don't bother about merging, submit each journal sector as a separate request
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
auto it = PRIV(op)->sync_big_writes.begin();
int s = 0;
while (it != PRIV(op)->sync_big_writes.end())
{
auto & dirty_entry = dirty_db.at(*it);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!journal.entry_fits(sizeof(journal_entry_big_write) + dyn_size) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write) + dyn_size
);
auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
assert(journal.next_free >= journal.used_start
? (jsec >= journal.used_start && jsec < journal.next_free)
: (jsec >= journal.used_start || jsec < journal.next_free));
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf(
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif
je->oid = it->oid;
je->version = it->version;
je->offset = dirty_entry.offset;
je->len = dirty_entry.len;
je->location = dirty_entry.location;
memcpy((void*)(je+1), (alloc_dyn_data
? (uint8_t*)dirty_entry.dyn_data+sizeof(int) : (uint8_t*)&dirty_entry.dyn_data), dyn_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
it++;
}
prepare_journal_sector_write(journal.cur_sector, op);
s++;
assert(s == space_check.sectors_to_write);
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1;
}
if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
{
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DONE)
{
ack_sync(op);
return 2;
}
return 1;
return res;
}
void blockstore_impl_t::ack_sync(blockstore_op_t *op)
bool blockstore_impl_t::submit_fsyncs(int & wait_count)
{
// Handle states
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
int n = ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync) +
(unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.journal_fd != dsk.meta_fd) +
(unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd);
if (ringloop->space_left() < n)
{
#ifdef BLOCKSTORE_DEBUG
printf("Ack sync big %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
auto dirty_it = dirty_db.find(*it);
dirty_it->second.state = ((dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED);
if (dirty_it->second.state & BS_ST_INSTANT)
{
mark_stable(dirty_it->first);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT;
}
dirty_it++;
}
return false;
}
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
if (!n)
{
#ifdef BLOCKSTORE_DEBUG
printf("Ack sync small %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
if (dirty_db[*it].state == (BS_ST_DELETE | BS_ST_WRITTEN))
{
dirty_db[*it].state = (BS_ST_DELETE | BS_ST_SYNCED);
// Deletions are treated as immediately stable
mark_stable(*it);
}
else /* (BS_ST_INSTANT?) | BS_ST_SMALL_WRITE | BS_ST_WRITTEN */
{
dirty_db[*it].state = (dirty_db[*it].state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED;
if (dirty_db[*it].state & BS_ST_INSTANT)
{
mark_stable(*it);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
}
return true;
}
op->retval = 0;
FINISH_OP(op);
auto cb = [this, & wait_count](ring_data_t *data)
{
if (data->res != 0)
disk_error_abort("sync meta", data->res, 0);
wait_count--;
assert(wait_count >= 0);
if (!wait_count)
ringloop->wakeup();
};
if ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync)
{
// fsync meta
io_uring_sqe *sqe = get_sqe();
assert(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
io_uring_prep_fsync(sqe, dsk.meta_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = cb;
wait_count++;
}
if (unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.meta_fd != dsk.journal_fd)
{
// fsync buffer
io_uring_sqe *sqe = get_sqe();
assert(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = cb;
wait_count++;
}
if (unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd)
{
// fsync data
io_uring_sqe *sqe = get_sqe();
assert(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
io_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = cb;
wait_count++;
}
unsynced_big_write_count = 0;
unsynced_small_write_count = 0;
unsynced_meta_write_count = 0;
return true;
}
int blockstore_impl_t::do_sync(blockstore_op_t *op, int base_state)
{
int op_state = PRIV(op)->op_state - base_state;
if (op_state == 1) goto resume_1;
if (op_state == 2) goto resume_2;
assert(!op_state);
if (flusher->get_syncing_buffer())
{
// Wait for flusher-initiated sync
return 0;
}
if (dsk.disable_journal_fsync && dsk.disable_meta_fsync && dsk.disable_data_fsync || !unsynced_big_write_count && !unsynced_small_write_count)
{
// We can return immediately because sync only syncs previous writes
unsynced_big_write_count = unsynced_small_write_count = unsynced_meta_write_count = 0;
return 2;
}
PRIV(op)->lsn = heap->get_completed_lsn();
if (!submit_fsyncs(PRIV(op)->pending_ops))
{
PRIV(op)->wait_detail = 1;
PRIV(op)->wait_for = WAIT_SQE;
return 0;
}
resume_1:
if (PRIV(op)->pending_ops > 0)
{
PRIV(op)->op_state = base_state+1;
return 1;
}
resume_2:
heap->mark_lsn_fsynced(PRIV(op)->lsn);
return 2;
}
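do_sync(), continue_stable() and continue_rollback() in this compare all share the same resumable state machine idiom: op_state records how far the operation got, and re-entry jumps straight to the matching resume label. A minimal self-contained sketch of the idiom (hypothetical names, not actual blockstore code):

#include <assert.h>
struct op_t { int state = 0; int pending_ops = 0; };

// Returns 0 = must wait, 1 = in progress (call again after completions), 2 = done
static int do_something(op_t *op)
{
    if (op->state == 1) goto resume_1;   // re-entered after async I/O completed
    assert(!op->state);
    op->pending_ops = 1;                 // pretend an async write was submitted; its callback would decrement this
    op->state = 1;
    return 1;
resume_1:
    if (op->pending_ops > 0)
        return 1;                        // completions not received yet
    return 2;                            // all steps finished, the caller dequeues the op
}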

File diff suppressed because it is too large


@@ -12,7 +12,7 @@
// [LD_PRELOAD=libasan.so.8] \
// fio -name=test -thread -ioengine=../build/src/blockstore/libfio_vitastor_blk.so \
// -bs=4k -direct=1 -rw=randwrite -iodepth=16 -size=900M -loops=10 \
// -bs_config='{"data_device":"./test_data.bin","meta_offset":0,"journal_offset":16777216,"data_offset":33554432,"disable_data_fsync":true,"immediate_commit":"all","journal_no_same_sector_overwrites":true}'
// -bs_config='{"data_device":"./test_data.bin","meta_offset":0,"journal_offset":16777216,"data_offset":33554432,"disable_data_fsync":true,"meta_format":3,"immediate_commit":"all","log_level":100,"journal_no_same_sector_overwrites":true,"journal_sector_buffer_count":1024}'
//
// Linear write:
//
@@ -183,7 +183,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
{
case DDIR_READ:
op->opcode = BS_OP_READ;
op->buf = io->xfer_buf;
op->buf = (uint8_t*)io->xfer_buf;
op->oid = {
.inode = 1,
.stripe = io->offset / bsd->bs->get_block_size(),
@@ -204,7 +204,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
break;
case DDIR_WRITE:
op->opcode = BS_OP_WRITE_STABLE;
op->buf = io->xfer_buf;
op->buf = (uint8_t*)io->xfer_buf;
op->oid = {
.inode = 1,
.stripe = io->offset / bsd->bs->get_block_size(),


@@ -1,12 +1,11 @@
// Old metadata format on-disk structures
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include "crc32c.h"
#include <set>
#define MIN_JOURNAL_SIZE 4*1024*1024
#define JOURNAL_MAGIC 0x4A33
#define JOURNAL_VERSION_V1 1
#define JOURNAL_VERSION_V2 2
@@ -145,74 +144,14 @@ inline uint32_t je_crc32(journal_entry *je)
return crc32c(0x48674bc7, ((uint8_t*)je)+4, je->size-4);
}
struct journal_sector_info_t
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
struct __attribute__((__packed__)) clean_disk_entry
{
uint64_t offset;
uint64_t flush_count;
bool written;
bool dirty;
uint64_t submit_id;
object_id oid;
uint64_t version;
uint8_t bitmap[];
// Two more fields come after bitmap in metadata version 2:
// uint32_t data_csum[];
// uint32_t entry_csum;
};
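The 32-byte figure in the comment above breaks down as follows, assuming the usual defaults of a 128 KiB data block and 4 KiB bitmap granularity (an assumption - the defaults themselves are not part of this compare):

object_id (inode + stripe)   = 8 + 8 = 16 bytes
version                      = 8 bytes            -> 24 bytes of fixed fields
block bitmap: 131072 / 4096  = 32 bits = 4 bytes
external attribute bitmap    = 4 bytes
total                        = 24 + 4 + 4 = 32 bytes per clean entry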
struct pending_journaling_t
{
int pending;
int sector;
blockstore_op_t *op;
};
struct journal_t
{
int fd;
bool inmemory = false;
bool flush_journal = false;
void *buffer = NULL;
uint64_t block_size;
uint64_t offset, len;
// Next free block offset
uint64_t next_free = 0;
// First occupied block offset
uint64_t used_start = 0;
// End of the last block not used for writing anymore
uint64_t dirty_start = 0;
uint32_t crc32_last = 0;
// Current sector(s) used for writing
void *sector_buf = NULL;
journal_sector_info_t *sector_info = NULL;
uint64_t sector_count;
bool no_same_sector_overwrites = false;
int cur_sector = 0;
int in_sector_pos = 0;
std::vector<int> submitting_sectors;
std::multimap<uint64_t, pending_journaling_t> flushing_ops;
uint64_t submit_id = 0;
// Used sector map
// May use ~ 80 MB per 1 GB of used journal space in the worst case
std::map<uint64_t, uint64_t> used_sectors;
~journal_t();
bool trim();
uint64_t get_trim_pos();
void dump_diagnostics();
inline bool entry_fits(int size)
{
return !(block_size - in_sector_pos < size ||
no_same_sector_overwrites && sector_info[cur_sector].written);
}
};
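The "~ 80 MB per 1 GB" estimate in the used_sectors comment above works out if one assumes the worst case of a 512-byte journal block size and roughly 40 bytes per std::map node (both assumptions, not stated in this compare):

1 GB / 512 bytes per sector = 2,097,152 map entries
2,097,152 * ~40 bytes (key + value + red-black tree node overhead) ≈ 80 MB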
struct blockstore_journal_check_t
{
blockstore_impl_t *bs;
uint64_t next_pos, next_sector, next_in_pos;
int sectors_to_write, first_sector;
bool right_dir; // writing to the end or the beginning of the ring buffer
blockstore_journal_check_t(blockstore_impl_t *bs);
int check_available(blockstore_op_t *op, int required, int size, int data_after);
};
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);


@@ -0,0 +1,338 @@
// Variable-length O(1) disk space allocator
// Copyright (c) Vitaliy Filippov, 2025+
// License: VNPL-1.1 (see README.md for details)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <set>
#include "multilist.h"
multilist_alloc_t::multilist_alloc_t(uint32_t count, uint32_t maxn):
count(count), maxn(maxn)
{
// not-so-memory-efficient: 16 MB memory per 1 GB buffer space, but buffer spaces are small, so OK
assert(count > 1 && count < 0x80000000);
sizes.resize(count);
nexts.resize(count); // nexts[i] = 0 -> area is used; nexts[i] = 1 -> no next; nexts[i] >= 2 -> next item
prevs.resize(count);
heads.resize(maxn); // heads[i] = 0 -> empty list; heads[i] >= 1 -> list head
sizes[0] = count;
sizes[count-1] = -count; // end
nexts[0] = 1;
heads[maxn-1] = 1;
#ifdef MULTILIST_TRACE
print();
#endif
}
bool multilist_alloc_t::is_free(uint32_t pos)
{
assert(pos < count);
if (sizes[pos] < 0)
pos += sizes[pos]+1;
while (pos > 0 && !sizes[pos])
pos--;
return nexts[pos] > 0;
}
uint32_t multilist_alloc_t::find(uint32_t size)
{
assert(size > 0);
assert(size <= maxn);
for (uint32_t i = size-1; i < maxn; i++)
{
if (heads[i])
{
return heads[i]-1;
}
}
return UINT32_MAX;
}
void multilist_alloc_t::verify()
{
std::set<uint32_t> reachable;
for (int i = 0; i < maxn; i++)
{
uint32_t cur = heads[i];
while (cur)
{
if (!nexts[cur-1])
{
fprintf(stderr, "ERROR: item %d from freelist %d is not free\n", cur-1, i);
print();
abort();
}
if (nexts[cur-1] >= count+2)
{
fprintf(stderr, "ERROR: next out of range at %d: %d\n", cur-1, nexts[cur-1]);
print();
abort();
}
if (!(i < maxn-1 ? sizes[cur-1] == i+1 : (sizes[cur-1] >= i+1)))
{
fprintf(stderr, "ERROR: item %d is in wrong freelist: expected size %d, but actual size is %d\n", cur-1, i+1, sizes[cur-1]);
print();
abort();
}
if (reachable.find(cur-1) != reachable.end())
{
fprintf(stderr, "ERROR: doubly-claimed item %d\n", cur-1);
print();
abort();
}
reachable.insert(cur-1);
cur = nexts[cur-1]-1;
}
}
for (int i = 0; i < count; )
{
if (sizes[i])
{
assert(i+sizes[i] <= count);
if (sizes[i] > 1 && sizes[i+sizes[i]-1] != -sizes[i])
{
fprintf(stderr, "ERROR: start/end mismatch at %d: sizes[%d] should be %d, but is %d\n", i, i+sizes[i]-1, -sizes[i], sizes[i+sizes[i]-1]);
print();
abort();
}
for (int j = i+1; j < i+sizes[i]-1; j++)
{
if (sizes[j])
{
fprintf(stderr, "ERROR: internal non-zero at %d: %d\n", j, sizes[j]);
print();
abort();
}
}
if (nexts[i] && reachable.find(i) == reachable.end())
{
fprintf(stderr, "ERROR: %d is unreachable from heads\n", i);
print();
abort();
}
if (nexts[i] >= 2)
{
if (nexts[i] >= 2+count)
{
fprintf(stderr, "ERROR: next out of range at %d: %d\n", i, nexts[i]);
print();
abort();
}
if (prevs[nexts[i]-2] != i+1)
{
fprintf(stderr, "ERROR: prev[next] (%d) != this (%d) at %d", prevs[nexts[i]-2], i+1, i);
print();
abort();
}
}
i += (sizes[i] > 1 ? sizes[i] : 1);
}
else
i++;
}
}
void multilist_alloc_t::print()
{
printf("heads:");
for (int i = 0; i < maxn; i++)
if (heads[i])
printf(" %u=%u", i, heads[i]);
printf("\n");
printf("sizes:");
for (int i = 0; i < count; i++)
if (sizes[i])
printf(" %d=%d", i, sizes[i]);
printf("\n");
printf("prevs:");
for (int i = 0; i < count; i++)
if (prevs[i])
printf(" %d=%d", i, prevs[i]);
printf("\n");
printf("nexts:");
for (int i = 0; i < count; i++)
if (nexts[i])
printf(" %d=%d", i, nexts[i]);
printf("\n");
printf("items:");
for (int i = 0; i < count; )
{
if (sizes[i])
{
printf(" %u=(s:%d,n:%u,p:%u)", i, sizes[i], nexts[i], prevs[i]);
assert(i+sizes[i] <= count);
i += (sizes[i] > 1 ? sizes[i] : 1);
}
else
i++;
}
printf("\n");
}
void multilist_alloc_t::use(uint32_t pos, uint32_t size)
{
assert(pos+size <= count && size > 0);
if (sizes[pos] <= 0)
{
uint32_t start = pos;
if (sizes[start] < 0)
start += sizes[start]+1;
else
while (start > 0 && !sizes[start])
start--;
assert(sizes[start] >= size);
use_full(start);
uint32_t full = sizes[start];
sizes[pos-1] = -pos+start;
sizes[start] = pos-start;
free(start);
sizes[pos+size-1] = -size;
sizes[pos] = size;
if (pos+size < start+full)
{
sizes[start+full-1] = -(start+full-pos-size);
sizes[pos+size] = start+full-pos-size;
free(pos+size);
}
}
else
{
assert(sizes[pos] >= size);
use_full(pos);
if (sizes[pos] > size)
{
uint32_t full = sizes[pos];
sizes[pos+size-1] = -size;
sizes[pos] = size;
sizes[pos+full-1] = -full+size;
sizes[pos+size] = full-size;
free(pos+size);
}
}
#ifdef MULTILIST_TRACE
print();
#endif
}
void multilist_alloc_t::use_full(uint32_t pos)
{
uint32_t prevsize = sizes[pos];
assert(prevsize);
assert(nexts[pos]);
uint32_t pi = (prevsize < maxn ? prevsize : maxn)-1;
if (heads[pi] == pos+1)
heads[pi] = nexts[pos]-1;
if (prevs[pos])
nexts[prevs[pos]-1] = nexts[pos];
if (nexts[pos] >= 2)
prevs[nexts[pos]-2] = prevs[pos];
prevs[pos] = 0;
nexts[pos] = 0;
}
void multilist_alloc_t::free(uint32_t pos)
{
do_free(pos);
#ifdef MULTILIST_TRACE
print();
#endif
}
void multilist_alloc_t::do_free(uint32_t pos)
{
assert(!nexts[pos]);
uint32_t size = sizes[pos];
assert(size > 0);
// merge with previous?
if (pos > 0 && nexts[pos+(sizes[pos-1] == 1 ? -1 : sizes[pos-1])] > 0)
{
assert(sizes[pos-1] < 0 || sizes[pos-1] == 1);
uint32_t prevsize = sizes[pos-1] < 0 ? -sizes[pos-1] : 1;
use_full(pos-prevsize);
sizes[pos] = 0;
sizes[pos-1] = 0;
size += prevsize;
pos -= prevsize;
sizes[pos+size-1] = -size;
sizes[pos] = size;
}
// merge with next?
if (pos+size < count && nexts[pos+size] >= 1)
{
uint32_t nextsize = sizes[pos+size];
use_full(pos+size);
sizes[pos+size] = 0;
sizes[pos+size-1] = 0;
size += nextsize;
sizes[pos+size-1] = -size;
sizes[pos] = size;
}
uint32_t ni = (size < maxn ? size : maxn)-1;
nexts[pos] = heads[ni]+1;
prevs[pos] = 0;
if (heads[ni])
prevs[heads[ni]-1] = pos+1;
heads[ni] = pos+1;
}
multilist_index_t::multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used):
count(count), max_used(max_used)
{
assert(init_used < max_used);
nexts.resize(count, UINT32_MAX);
prevs.resize(count, UINT32_MAX);
heads.resize(max_used, UINT32_MAX);
for (size_t i = 0; i < count-1; i++)
{
nexts[i] = i+1;
prevs[i+1] = i;
}
prevs[0] = UINT32_MAX;
nexts[count-1] = UINT32_MAX;
heads[init_used] = 0;
}
uint32_t multilist_index_t::find(uint32_t wanted_used)
{
assert(wanted_used < max_used);
return heads[wanted_used];
}
void multilist_index_t::change(uint32_t pos, uint32_t old_used, uint32_t new_used)
{
if (new_used == old_used)
return;
assert(old_used < max_used && new_used < max_used);
if (prevs[pos] != UINT32_MAX)
nexts[prevs[pos]] = nexts[pos];
if (nexts[pos] != UINT32_MAX)
prevs[nexts[pos]] = prevs[pos];
if (heads[old_used] == pos)
heads[old_used] = nexts[pos];
prevs[pos] = UINT32_MAX;
if (heads[new_used] != UINT32_MAX)
prevs[heads[new_used]] = pos;
nexts[pos] = heads[new_used];
heads[new_used] = pos;
}
void multilist_index_t::print()
{
printf("heads:");
for (int i = 0; i < max_used; i++)
if (heads[i] != UINT32_MAX)
printf(" %u=%u", i, heads[i]);
printf("\n");
printf("prevs:");
for (int i = 0; i < count; i++)
if (prevs[i] != UINT32_MAX)
printf(" %d=%d", i, prevs[i]);
printf("\n");
printf("nexts:");
for (int i = 0; i < count; i++)
if (nexts[i] != UINT32_MAX)
printf(" %d=%d", i, nexts[i]);
printf("\n");
}


@@ -0,0 +1,37 @@
// Variable-length O(1) disk space allocator
// Copyright (c) Vitaliy Filippov, 2025+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include <stdint.h>
#include <vector>
struct multilist_alloc_t
{
const uint32_t count, maxn;
std::vector<int32_t> sizes;
std::vector<uint32_t> nexts, prevs, heads;
multilist_alloc_t(uint32_t count, uint32_t maxn);
bool is_free(uint32_t pos);
uint32_t find(uint32_t size);
void use_full(uint32_t pos);
void use(uint32_t pos, uint32_t size);
void do_free(uint32_t pos);
void free(uint32_t pos);
void verify();
void print();
};
struct multilist_index_t
{
const uint32_t count, max_used;
std::vector<uint32_t> nexts, prevs, heads;
// used should always be < max_used
multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used);
uint32_t find(uint32_t wanted_used);
void change(uint32_t pos, uint32_t old_used, uint32_t new_used);
void print();
};
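A similar editorial sketch for multilist_alloc_t, based only on the declarations above and the implementation earlier in the diff; the return value convention of find() on failure is an assumption:
multilist_alloc_t alloc(1024, 64);     // 1024 allocation units, free lists bucketed up to size 64
uint32_t pos = alloc.find(10);         // look for a free extent of at least 10 units
if (pos != UINT32_MAX)                 // assumed "not found" value
{
    alloc.use(pos, 10);                // carve 10 units out of that extent
    // ... the space is in use here ...
    alloc.free(pos);                   // release it; neighbouring free extents are merged
}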

View File

@@ -1167,7 +1167,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
if (!cur_primary || !value["state"].is_array() || !state ||
(state & PG_OFFLINE) && state != PG_OFFLINE ||
(state & PG_PEERING) && state != PG_PEERING ||
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE && state != (PG_INCOMPLETE|PG_HAS_INVALID))
{
fprintf(stderr, "Unexpected pool %u PG %u state in etcd: primary=%ju, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
return;

View File

@@ -6,8 +6,20 @@
#include <stdint.h>
#include <functional>
#define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2
#define POOL_SCHEME_EC 3
#define POOL_ID_MAX 0x10000
#define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
#define INODE_NO_POOL(inode) (inode_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
typedef uint64_t inode_t;
// Pool ID is 16 bits long
typedef uint32_t pool_id_t;
// 16 bytes per object/stripe id
// stripe = (start of the parity stripe + peer role)
// i.e. for example (256KB + one of 0,1,2)
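A quick editorial illustration of the pool/inode packing macros above:
inode_t packed = INODE_WITH_POOL(5, 42);   // pool 5 in the top 16 bits, inode 42 below
// packed == ((uint64_t)5 << 48) | 42 == 0x000500000000002A
assert(INODE_POOL(packed) == 5);
assert(INODE_NO_POOL(packed) == 42);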

View File

@@ -44,11 +44,6 @@
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
// Constants for osd_reply_describe_item_t.loc_bad
#define LOC_OUTDATED 1
#define LOC_CORRUPTED 2

View File

@@ -7,7 +7,8 @@ add_executable(vitastor-disk
disk_tool.cpp disk_simple_offsets.cpp
disk_tool_discard.cpp disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp
disk_tool_resize_auto.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp
../blockstore/blockstore_disk.cpp ../blockstore/blockstore_heap.cpp ../blockstore/multilist.cpp
)
target_link_libraries(vitastor-disk
tcmalloc_minimal

View File

@@ -11,6 +11,7 @@
#include "str_util.h"
#include "blockstore.h"
#include "blockstore_disk.h"
#include "blockstore_heap.h"
// Calculate offsets for a block device and print OSD command line parameters
void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
@@ -23,6 +24,9 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
uint64_t journal_offset = parse_size(cfg["journal_offset"].string_value());
uint64_t device_size = parse_size(cfg["device_size"].string_value());
uint32_t csum_block_size = parse_size(cfg["csum_block_size"].string_value());
uint32_t meta_format = cfg["meta_format"].uint64_value();
if (!meta_format)
meta_format = BLOCKSTORE_META_FORMAT_HEAP;
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
if (cfg["data_csum_type"] == "crc32c")
data_csum_type = BLOCKSTORE_CSUM_CRC32C;
@@ -123,10 +127,39 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
uint64_t meta_offset = journal_offset + ((journal_size+device_block_size-1)/device_block_size)*device_block_size;
uint64_t data_csum_size = (data_csum_type ? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
uint64_t clean_entry_bitmap_size = data_block_size/bitmap_granularity/8;
uint64_t clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size + data_csum_size + 4 /*entry_csum*/;
uint64_t entries_per_block = device_block_size / clean_entry_size;
uint64_t object_count = ((device_size-meta_offset)/data_block_size);
uint64_t meta_size = (1 + (object_count+entries_per_block-1)/entries_per_block) * device_block_size;
uint64_t meta_size;
if (meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
uint32_t min_object_size = sizeof(heap_object_t)+sizeof(heap_write_t)+data_csum_size+2*clean_entry_bitmap_size;
uint32_t meta_block_target_free_space = cfg["meta_block_target_free_space"].uint64_value();
if (!meta_block_target_free_space || meta_block_target_free_space > device_block_size-min_object_size)
meta_block_target_free_space = 800;
double meta_reserve = cfg["meta_reserve"].number_value();
if (!meta_reserve)
meta_reserve = 1.5;
else if (meta_reserve < 1)
meta_reserve = 1;
uint32_t entries_per_block = (device_block_size-meta_block_target_free_space) / min_object_size;
meta_size = device_block_size * (uint64_t)((object_count+entries_per_block-1) / entries_per_block * meta_reserve);
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V2)
{
uint64_t clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size + data_csum_size + 4 /*entry_csum*/;
uint64_t entries_per_block = device_block_size / clean_entry_size;
meta_size = (1 + (object_count+entries_per_block-1)/entries_per_block) * device_block_size;
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V1)
{
uint64_t clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size;
uint64_t entries_per_block = device_block_size / clean_entry_size;
meta_size = (1 + (object_count+entries_per_block-1)/entries_per_block) * device_block_size;
}
else
{
fprintf(stderr, "meta_format %u is not supported\n", meta_format);
exit(1);
}
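// Editorial worked example for the "heap" branch above (illustrative numbers only):
// with device_block_size = 4096, meta_block_target_free_space = 800 and, say,
// min_object_size = 160 bytes, entries_per_block = (4096-800)/160 = 20; for
// object_count = 1000000 and the default meta_reserve = 1.5 this gives
// meta_size = 4096 * (uint64_t)(ceil(1000000/20) * 1.5) = 4096 * 75000 ≈ 293 MiB.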
uint64_t data_offset = meta_offset + meta_size;
if (format == "json")
{

View File

@@ -48,6 +48,8 @@ static const char *help_text =
" --max_other 10%\n"
" Use disks for OSD data even if they already have non-Vitastor partitions,\n"
" but only if these take up no more than this percent of disk space.\n"
" --dry-run\n"
" Check and print new OSD count for each disk but do not actually create them.\n"
" \n"
" Options (single-device mode):\n"
" --data_device <DEV> Use partition <DEV> for data\n"
@@ -179,8 +181,8 @@ static const char *help_text =
" Options:\n"
" --all Scan the whole journal area for entries and dump them, even outdated ones\n"
" --json Dump journal in JSON format\n"
" --format entries (Default) Dump actual journal entries as an array, without data\n"
" --format data Same as \"entries\", but also include small write data\n"
" --format data (Default) Dump journal entries as an array, with small write data\n"
" --format entries Dump actual journal entries as an array, without data\n"
" --format blocks Dump as an array of journal blocks each containing array of entries\n"
"\n"
"vitastor-disk write-journal <osd_device>\n"
@@ -190,12 +192,12 @@ static const char *help_text =
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n"
"vitastor-disk dump-meta <osd_device>\n"
"vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>\n"
"vitastor-disk dump-meta [osd_options...]\n"
" Dump metadata in JSON format.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n"
"vitastor-disk write-meta <osd_device>\n"
"vitastor-disk write-meta <meta_file> <offset> <size>\n"
"vitastor-disk write-meta [osd_options...]\n"
" Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n"
@@ -362,47 +364,63 @@ int main(int argc, char *argv[])
}
else if (!strcmp(cmd[0], "dump-meta"))
{
if (cmd.size() != 2 && cmd.size() < 5)
{
print_help(help_text, "vitastor-disk", cmd[0], false);
return 1;
}
self.dsk.meta_device = cmd[1];
if (cmd.size() > 2)
if (cmd.size() == 5)
{
// Old format
self.dsk.meta_device = cmd[1];
self.dsk.meta_block_size = strtoul(cmd[2], NULL, 10);
self.dsk.meta_offset = strtoull(cmd[3], NULL, 10);
self.dsk.meta_len = strtoull(cmd[4], NULL, 10);
self.dsk.meta_area_size = strtoull(cmd[4], NULL, 10);
}
else if (cmd.size() == 2)
{
// First argument is an OSD device - take metadata layout parameters from it
self.dsk.meta_device = cmd[1];
if (self.dump_load_check_superblock(self.dsk.meta_device))
return 1;
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.dsk.meta_device))
return 1;
// Parse all OSD options from cmdline
self.dsk.parse_config(self.options);
if (self.options["io"] != "")
self.dsk.data_io = self.dsk.meta_io = self.dsk.journal_io = self.options["io"];
// FIXME: This is a really repeated pattern, move it somewhere
self.dsk.open_data();
self.dsk.open_meta();
self.dsk.open_journal();
self.dsk.calc_lengths();
self.dsk.close_all();
}
return self.dump_meta();
}
else if (!strcmp(cmd[0], "write-meta"))
{
if (cmd.size() != 2 && cmd.size() < 4)
{
print_help(help_text, "vitastor-disk", cmd[0], false);
return 1;
}
self.new_meta_device = cmd[1];
if (cmd.size() > 2)
if (cmd.size() == 4)
{
self.new_meta_device = cmd[1];
self.new_meta_offset = strtoull(cmd[2], NULL, 10);
self.new_meta_len = strtoull(cmd[3], NULL, 10);
}
else
else if (cmd.size() == 2)
{
// First argument is an OSD device - take metadata layout parameters from it
self.new_meta_device = cmd[1];
if (self.dump_load_check_superblock(self.new_meta_device))
return 1;
self.new_meta_device = self.dsk.meta_device;
self.new_meta_offset = self.dsk.meta_offset;
self.new_meta_len = self.dsk.meta_len;
self.new_meta_len = self.dsk.meta_area_size;
}
else
{
// Parse all OSD options from cmdline
self.dsk.parse_config(self.options);
self.dsk.open_data();
self.dsk.open_meta();
self.dsk.open_journal();
self.dsk.calc_lengths();
self.dsk.close_all();
}
std::string json_err;
json11::Json meta = json11::Json::parse(read_all_fd(0), json_err);

View File

@@ -15,7 +15,9 @@
#include "json11/json11.hpp"
#include "blockstore_disk.h"
#include "blockstore_impl.h"
#include "meta_v1.h"
#include "crc32c.h"
#include "allocator.h"
// vITADisk
#define VITASTOR_DISK_MAGIC 0x6b73694441544976
@@ -43,7 +45,9 @@ struct disk_tool_t
std::map<std::string, std::string> options;
bool test_mode = false;
bool all = false, json = false, now = false;
bool dump_with_blocks, dump_with_data;
bool dump_with_blocks = false, dump_with_data = false;
bool dump_as_old = false;
int log_level = 1;
blockstore_disk_t dsk;
// resize data and/or move metadata and journal
@@ -58,25 +62,30 @@ struct disk_tool_t
uint64_t meta_pos;
uint64_t journal_pos, journal_calc_data_pos;
uint8_t *buffer_area = NULL;
bool first_block, first_entry;
allocator_t *data_alloc;
allocator_t *data_alloc = NULL;
std::map<uint64_t, uint64_t> data_remap;
std::map<uint64_t, uint64_t>::iterator remap_it;
ring_loop_t *ringloop;
ring_loop_t *ringloop = NULL;
ring_consumer_t ring_consumer;
int remap_active;
journal_entry_start je_start;
uint8_t *new_journal_buf, *new_meta_buf, *new_journal_ptr, *new_journal_data;
uint8_t *new_journal_buf = NULL, *new_meta_buf = NULL, *new_journal_ptr = NULL, *new_journal_data = NULL;
blockstore_meta_header_v3_t *new_meta_hdr = NULL;
blockstore_disk_t new_dsk;
blockstore_heap_t *new_heap = NULL;
uint64_t new_journal_in_pos;
int64_t data_idx_diff;
uint64_t total_blocks, free_first, free_last;
uint64_t new_clean_entry_bitmap_size, new_data_csum_size, new_clean_entry_size, new_entries_per_block;
int new_journal_fd, new_meta_fd;
resizer_data_moving_t *moving_blocks;
uint32_t new_meta_format = 0;
int new_journal_fd = -1, new_meta_fd = -1;
resizer_data_moving_t *moving_blocks = NULL;
bool started;
void *small_write_data;
void *small_write_data = NULL;
uint32_t data_crc32;
bool data_csum_valid;
uint32_t crc32_last;
@@ -88,17 +97,24 @@ struct disk_tool_t
void dump_journal_entry(int num, journal_entry *je, bool json);
int process_journal(std::function<int(void*)> block_fn, bool do_open = true);
int process_journal_block(void *buf, std::function<void(int, journal_entry*)> iter_fn);
int process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn, bool do_open = true);
int process_meta(std::function<void(blockstore_meta_header_v3_t *)> hdr_fn,
std::function<void(blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)> obj_fn,
std::function<void(uint64_t block_num, clean_disk_entry *entry_v1, uint8_t *bitmap)> record_fn,
bool with_data, bool do_open);
int dump_meta();
void dump_meta_header(blockstore_meta_header_v2_t *hdr);
void dump_meta_header(blockstore_meta_header_v3_t *hdr);
void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap);
void dump_heap_entry_as_old(blockstore_heap_t *heap, heap_object_t *obj);
void dump_heap_entry(blockstore_heap_t *heap, heap_object_t *obj);
int dump_load_check_superblock(const std::string & device);
int write_json_journal(json11::Json entries);
int write_json_meta(json11::Json meta);
int write_json_heap(json11::Json meta, json11::Json journal);
int index_journal_by_object(json11::Json journal,
std::map<object_id, std::vector<json11::Json::object>> & journal_by_object);
int resize_data(std::string device);
int resize_parse_move_journal(std::map<std::string, std::string> & move_options, bool dry_run);
@@ -106,13 +122,17 @@ struct disk_tool_t
int raw_resize();
int resize_parse_params();
void resize_init(blockstore_meta_header_v2_t *hdr);
void resize_init(blockstore_meta_header_v3_t *hdr);
int resize_remap_blocks();
int resize_copy_data();
int resize_rewrite_journal();
void resize_alloc_journal();
void build_journal_start();
void choose_journal_block(uint32_t je_size);
int resize_rebuild_journal();
int resize_write_new_journal();
int resize_rewrite_meta();
int resize_rebuild_meta();
int resize_write_new_meta();
void free_new_meta();
int udev_import(std::string device);
int read_sb(std::string device);
@@ -134,7 +154,8 @@ struct disk_tool_t
int prepare(std::vector<std::string> devices);
std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices);
json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes);
std::vector<std::string> get_new_data_parts(vitastor_dev_info_t & dev, uint64_t osd_per_disk, uint64_t max_other_percent);
std::vector<std::string> get_new_data_parts(vitastor_dev_info_t & dev,
uint64_t osd_per_disk, uint64_t max_other_percent, uint64_t *check_new_count);
int get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std::map<std::string, std::string> & options);
int upgrade_simple_unit(std::string unit);

View File

@@ -54,12 +54,22 @@ int disk_tool_t::trim_data(std::string device)
fprintf(stderr, "Reading metadata\n");
data_alloc = new allocator_t(dsk.block_count);
r = process_meta(
[this](blockstore_meta_header_v2_t *hdr) {},
[this](blockstore_meta_header_v3_t *hdr) {},
[this](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
{
for (auto wr = obj->get_writes(); wr; wr = wr->next())
{
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
{
data_alloc->set(wr->big_location(heap) / dsk.data_block_size, true);
}
}
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
data_alloc->set(block_num, true);
},
false
false, false
);
if (r != 0)
{
@@ -83,7 +93,7 @@ int disk_tool_t::trim_data(std::string device)
return r;
}
// Trim
r = dsk.trim_data(data_alloc);
r = dsk.trim_data([&](uint64_t block_num) { return data_alloc->get(block_num); });
dsk.close_all();
return r == 0;
}

View File

@@ -5,8 +5,8 @@
int disk_tool_t::dump_journal()
{
dump_with_blocks = options["format"] == "blocks";
dump_with_data = options["format"] == "data" || options["format"] == "blocks,data";
dump_with_blocks = options["format"] == "blocks" || options["format"] == "blocks,data";
dump_with_data = options["format"] == "data" || options["format"] == "blocks,data" || options["format"] == "";
if (dsk.journal_block_size < DIRECT_IO_ALIGNMENT || (dsk.journal_block_size % DIRECT_IO_ALIGNMENT) ||
dsk.journal_block_size > 128*1024)
{
@@ -525,7 +525,9 @@ int disk_tool_t::write_json_journal(json11::Json entries)
.data_offset = (uint64_t)(new_journal_data-new_journal_buf),
.crc32_data = !dsk.data_csum_type ? 0 : (uint32_t)sscanf_json("%x", rec["data_crc32"]),
};
uint32_t data_csum_size = !dsk.data_csum_type ? 0 : ne->small_write.len/dsk.csum_block_size*(dsk.data_csum_type & 0xFF);
uint32_t data_csum_blocks = !dsk.data_csum_type ? 0 :
((ne->small_write.offset+ne->small_write.len+dsk.csum_block_size-1)/dsk.csum_block_size - ne->small_write.offset/dsk.csum_block_size);
uint32_t data_csum_size = data_csum_blocks*(dsk.data_csum_type & 0xFF);
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write) + data_csum_size);
fromhexstr(rec["data"].string_value(), ne->small_write.len, new_journal_data);
if (ne->small_write.len > 0 && !rec["data"].is_string())
@@ -534,17 +536,21 @@ int disk_tool_t::write_json_journal(json11::Json entries)
free(new_journal_buf);
return 1;
}
if (dsk.data_csum_type)
fromhexstr(rec["block_csums"].string_value(), data_csum_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write));
if (rec["data"].is_string())
if (ne->small_write.len > 0)
{
if (!dsk.data_csum_type)
ne->small_write.crc32_data = crc32c(0, new_journal_data, ne->small_write.len);
else if (dsk.data_csum_type == BLOCKSTORE_CSUM_CRC32C)
{
uint32_t *block_csums = (uint32_t*)(((uint8_t*)ne) + sizeof(journal_entry_small_write));
for (uint32_t i = 0; i < ne->small_write.len; i += dsk.csum_block_size, block_csums++)
*block_csums = crc32c(0, new_journal_data+i, dsk.csum_block_size);
for (uint32_t i = 0; i < data_csum_blocks; i++)
{
uint32_t block_begin = (ne->small_write.offset/dsk.csum_block_size + i) * dsk.csum_block_size;
uint32_t block_end = (ne->small_write.offset/dsk.csum_block_size + (i+1)) * dsk.csum_block_size;
block_begin = block_begin < ne->small_write.offset ? ne->small_write.offset : block_begin;
block_end = block_end > ne->small_write.offset+ne->small_write.len ? ne->small_write.offset+ne->small_write.len : block_end;
block_csums[i] = crc32c(0, new_journal_data+block_begin-ne->small_write.offset, block_end-block_begin);
}
}
}
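// Editorial example of the per-block checksum coverage above (uses the corrected
// data_csum_blocks formula): offset = 1000, len = 6000, csum_block_size = 4096
// gives data_csum_blocks = ceil(7000/4096) - floor(1000/4096) = 2; the first
// checksum then covers object bytes 1000..4095 (3096 bytes of journal data),
// the second covers bytes 4096..6999 (2904 bytes).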
new_journal_data += ne->small_write.len;
@@ -565,7 +571,9 @@ int disk_tool_t::write_json_journal(json11::Json entries)
.len = (uint32_t)rec["len"].uint64_value(),
.location = sscanf_json(NULL, rec["loc"]),
};
uint32_t data_csum_size = !dsk.data_csum_type ? 0 : ne->big_write.len/dsk.csum_block_size*(dsk.data_csum_type & 0xFF);
uint32_t data_csum_blocks = !dsk.data_csum_type ? 0 :
((ne->big_write.offset+ne->big_write.len+dsk.csum_block_size-1)/dsk.csum_block_size - ne->big_write.offset/dsk.csum_block_size);
uint32_t data_csum_size = data_csum_blocks*(dsk.data_csum_type & 0xFF);
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_big_write) + data_csum_size);
if (dsk.data_csum_type)
fromhexstr(rec["block_csums"].string_value(), data_csum_size, ((uint8_t*)ne) + sizeof(journal_entry_big_write));

View File

@@ -6,40 +6,120 @@
#include "osd_id.h"
#include "json_util.h"
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn, bool do_open)
#define FREE_SPACE_BIT 0x8000
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v3_t *)> hdr_fn,
std::function<void(blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)> obj_fn,
std::function<void(uint64_t block_num, clean_disk_entry *entry_v1, uint8_t *bitmap)> record_fn,
bool with_data, bool do_open)
{
int r = 0;
if (dsk.meta_block_size % DIRECT_IO_ALIGNMENT)
{
fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
return 1;
}
int buf_size = 1024*1024;
if (buf_size % dsk.meta_block_size)
buf_size = 8*dsk.meta_block_size;
uint8_t *data = NULL;
data = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, buf_size);
blockstore_meta_header_v3_t *hdr = (blockstore_meta_header_v3_t *)data;
if (do_open)
{
if (dsk.meta_fd >= 0)
{
fprintf(stderr, "Bug: Metadata device is already opened\n");
return 1;
close_error:
r = 1;
goto close_free;
}
dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.meta_fd < 0)
{
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
return 1;
goto close_error;
}
}
int buf_size = 1024*1024;
if (buf_size % dsk.meta_block_size)
buf_size = 8*dsk.meta_block_size;
if (buf_size > dsk.meta_len)
buf_size = dsk.meta_len;
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
// Check superblock
blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)data;
if (hdr->zero == 0 && hdr->magic == BLOCKSTORE_META_MAGIC_V1)
else if (dsk.meta_fd < 0)
{
fprintf(stderr, "Bug: Metadata device is not opened\n");
goto close_error;
}
// Check superblock
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
read_blocking(dsk.meta_fd, hdr, dsk.meta_block_size);
if (hdr->zero == 0 && hdr->magic == BLOCKSTORE_META_MAGIC_V1 && hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
{
if (hdr->data_csum_type != 0 &&
hdr->data_csum_type != BLOCKSTORE_CSUM_CRC32C)
{
goto csum_unknown;
}
if (with_data && !dsk.journal_len)
{
fprintf(stderr, "Buffer area (former journal) location must be specified to dump \"heap\" with data\n");
goto close_error;
}
// Load buffer_area
if (with_data)
{
buffer_area = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.journal_len);
if (dsk.journal_device == dsk.meta_device || dsk.journal_device == "")
{
dsk.journal_fd = dsk.meta_fd;
}
else if (do_open)
{
if (dsk.journal_fd >= 0)
{
fprintf(stderr, "Bug: Metadata device is already opened\n");
goto close_error;
}
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.journal_fd < 0)
{
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
goto close_error;
}
}
else if (dsk.journal_fd < 0)
{
fprintf(stderr, "Bug: journal device is not opened\n");
goto close_error;
}
uint64_t journal_pos = 0;
lseek64(dsk.journal_fd, dsk.journal_offset+journal_pos, 0);
while (journal_pos < dsk.journal_len)
{
uint64_t read_len = buf_size < dsk.journal_len-journal_pos ? buf_size : dsk.journal_len-journal_pos;
read_blocking(dsk.journal_fd, buffer_area+journal_pos, read_len);
journal_pos += read_len;
}
}
blockstore_heap_t *heap = new blockstore_heap_t(&dsk, buffer_area, log_level);
// Load heap and just iterate it in memory
hdr_fn(hdr);
hdr = NULL;
meta_pos = dsk.meta_block_size;
lseek64(dsk.meta_fd, dsk.meta_offset+meta_pos, 0);
while (meta_pos < dsk.meta_area_size)
{
uint64_t read_len = buf_size < dsk.meta_area_size-meta_pos ? buf_size : dsk.meta_area_size-meta_pos;
read_blocking(dsk.meta_fd, data, read_len);
heap->read_blocks(meta_pos-dsk.meta_block_size, read_len, data, [&](heap_object_t *obj)
{
obj_fn(heap, obj, ((uint8_t*)obj-data+meta_pos)/dsk.meta_block_size);
}, [](uint32_t, uint32_t, uint8_t*){});
meta_pos += read_len;
}
delete heap;
}
else if (hdr->zero == 0 && hdr->magic == BLOCKSTORE_META_MAGIC_V1)
{
dsk.meta_format = hdr->version;
dsk.calc_lengths();
dsk.check_lengths();
if (hdr->version == BLOCKSTORE_META_FORMAT_V1)
{
// Vitastor 0.6-0.8 - static array of clean_disk_entry with bitmaps
@@ -53,41 +133,21 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
if (hdr->data_csum_type != 0 &&
hdr->data_csum_type != BLOCKSTORE_CSUM_CRC32C)
{
csum_unknown:
fprintf(stderr, "I don't know checksum format %u, the only supported format is crc32c = %u.\n", hdr->data_csum_type, BLOCKSTORE_CSUM_CRC32C);
free(data);
if (do_open)
{
close(dsk.meta_fd);
dsk.meta_fd = -1;
}
return 1;
goto close_error;
}
}
else
{
// Unsupported version
fprintf(stderr, "Metadata format is too new for me (stored version is %ju, max supported %u).\n", hdr->version, BLOCKSTORE_META_FORMAT_V2);
free(data);
if (do_open)
{
close(dsk.meta_fd);
dsk.meta_fd = -1;
}
return 1;
goto close_error;
}
if (hdr->meta_block_size != dsk.meta_block_size)
{
fprintf(stderr, "Using block size of %u bytes based on information from the superblock\n", hdr->meta_block_size);
dsk.meta_block_size = hdr->meta_block_size;
if (buf_size % dsk.meta_block_size)
{
buf_size = 8*dsk.meta_block_size;
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
memcpy(new_data, data, dsk.meta_block_size);
free(data);
data = new_data;
hdr = (blockstore_meta_header_v2_t *)data;
}
}
dsk.meta_format = hdr->version;
dsk.data_block_size = hdr->data_block_size;
@@ -101,14 +161,15 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
*(hdr->data_csum_type & 0xff))
: 0)
+ (dsk.meta_format == BLOCKSTORE_META_FORMAT_V2 ? 4 /*entry_csum*/ : 0);
// Read
uint64_t block_num = 0;
hdr_fn(hdr);
hdr = NULL;
meta_pos = dsk.meta_block_size;
lseek64(dsk.meta_fd, dsk.meta_offset+meta_pos, 0);
while (meta_pos < dsk.meta_len)
while (meta_pos < dsk.min_meta_len)
{
uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos;
uint64_t read_len = buf_size < dsk.min_meta_len-meta_pos ? buf_size : dsk.min_meta_len-meta_pos;
read_blocking(dsk.meta_fd, data, read_len);
meta_pos += read_len;
for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size)
@@ -123,7 +184,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + dsk.clean_entry_size - 4);
if (*entry_csum != crc32c(0, entry, dsk.clean_entry_size - 4))
{
fprintf(stderr, "Metadata entry %ju is corrupt (checksum mismatch), skipping\n", block_num);
fprintf(stderr, "Metadata entry %lu is corrupt (checksum mismatch), skipping\n", block_num);
continue;
}
}
@@ -135,14 +196,15 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
}
else
{
// Vitastor 0.4-0.5 - static array of clean_disk_entry
// Vitastor 0.4-0.5 - static array of clean_disk_entry without header
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
dsk.clean_entry_bitmap_size = 0;
dsk.clean_entry_size = sizeof(clean_disk_entry);
uint64_t block_num = 0;
hdr_fn(NULL);
while (meta_pos < dsk.meta_len)
while (meta_pos < dsk.meta_area_size)
{
uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos;
uint64_t read_len = buf_size < dsk.meta_area_size-meta_pos ? buf_size : dsk.meta_area_size-meta_pos;
read_blocking(dsk.meta_fd, data, read_len);
meta_pos += read_len;
for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size)
@@ -158,13 +220,25 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
}
}
}
close_free:
free(data);
if (buffer_area)
{
free(buffer_area);
buffer_area = NULL;
}
if (do_open)
{
close(dsk.meta_fd);
dsk.meta_fd = -1;
if (dsk.journal_fd >= 0)
{
if (dsk.journal_fd != dsk.meta_fd)
close(dsk.journal_fd);
dsk.journal_fd = -1;
}
}
return 0;
return r;
}
int disk_tool_t::dump_load_check_superblock(const std::string & device)
@@ -180,7 +254,7 @@ int disk_tool_t::dump_load_check_superblock(const std::string & device)
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
dsk.calc_lengths();
}
catch (std::exception & e)
{
@@ -195,15 +269,33 @@ int disk_tool_t::dump_load_check_superblock(const std::string & device)
int disk_tool_t::dump_meta()
{
int r = process_meta(
[this](blockstore_meta_header_v2_t *hdr) { dump_meta_header(hdr); },
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { dump_meta_entry(block_num, entry, bitmap); }
[this](blockstore_meta_header_v3_t *hdr)
{
if (dump_as_old)
{
hdr->version = BLOCKSTORE_META_FORMAT_V2;
hdr->compacted_lsn = 0;
hdr->header_csum = 0;
hdr->header_csum = crc32c(0, hdr, sizeof(blockstore_meta_header_v2_t));
}
dump_meta_header(hdr);
},
[this](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
{
if (dump_as_old)
dump_heap_entry_as_old(heap, obj);
else
dump_heap_entry(heap, obj);
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { dump_meta_entry(block_num, entry, bitmap); },
true, true
);
if (r == 0)
printf("\n]}\n");
return r;
}
void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
void disk_tool_t::dump_meta_header(blockstore_meta_header_v3_t *hdr)
{
if (hdr)
{
@@ -224,14 +316,151 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size
);
}
else if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
{
printf(
"{\"version\":\"3.0\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,"
"\"data_csum_type\":\"%s\",\"csum_block_size\":%u,\"entries\":[\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size
);
}
}
else
{
printf("{\"version\":\"0.5\",\"meta_block_size\":%ju,\"entries\":[\n", dsk.meta_block_size);
printf("{\"version\":\"0.5\",\"meta_block_size\":%u,\"entries\":[\n", dsk.meta_block_size);
}
first_entry = true;
}
void disk_tool_t::dump_heap_entry_as_old(blockstore_heap_t *heap, heap_object_t *obj)
{
heap_write_t *wr = NULL;
for (wr = obj->get_writes(); wr && wr->entry_type != (BS_HEAP_BIG_WRITE|BS_HEAP_STABLE) &&
wr->entry_type != (BS_HEAP_TOMBSTONE|BS_HEAP_STABLE); wr = wr->next())
{
}
if (!wr || wr->entry_type != (BS_HEAP_BIG_WRITE|BS_HEAP_STABLE))
{
return;
}
printf(
#define ENTRY_FMT "{\"block\":%u,\"pool\":%u,\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"version\":%ju"
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
#undef ENTRY_FMT
wr->big().block_num, INODE_POOL(obj->inode), INODE_NO_POOL(obj->inode),
obj->stripe, wr->version
);
printf(",\"bitmap\":\"");
uint8_t* bitmap = wr->get_int_bitmap(heap);
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
{
printf("%02x", bitmap[i]);
}
bitmap = wr->get_ext_bitmap(heap);
printf("\",\"ext_bitmap\":\"");
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
{
printf("%02x", bitmap[i]);
}
uint8_t *csums = wr->get_checksums(heap);
uint32_t csum_size = wr->get_csum_size(heap);
if (csums)
{
printf("\",\"block_csums\":\"");
for (uint32_t i = 0; i < csum_size; i++)
{
printf("%02x", csums[i]);
}
}
if (wr->get_checksum(heap))
{
printf("\",\"crc32c\":\"%08x", *wr->get_checksum(heap));
}
printf("\"}");
first_entry = false;
}
void disk_tool_t::dump_heap_entry(blockstore_heap_t *heap, heap_object_t *obj)
{
printf(
#define ENTRY_FMT "{\"pool\":%u,\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"writes\":["
(first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
#undef ENTRY_FMT
INODE_POOL(obj->inode), INODE_NO_POOL(obj->inode), obj->stripe
);
heap_write_t *wr = NULL;
bool first_wr = true;
for (wr = obj->get_writes(); wr; wr = wr->next())
{
printf(
#define ENTRY_FMT "{\"lsn\":%ju,\"version\":%ju,\"type\":\"%s\",\"stable\":%s"
(first_wr ? ENTRY_FMT : ("," ENTRY_FMT)),
#undef ENTRY_FMT
wr->lsn, wr->version, (wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE ? "small" : (
(wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE ? "big" : (
(wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_INTENT_WRITE ? "intent" : (
(wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_TOMBSTONE ? "tombstone" : "unknown"))),
(wr->entry_type & BS_HEAP_STABLE) ? "true" : "false"
);
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
{
printf(",\"location\":%ju", wr->big_location(heap));
}
else if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_INTENT_WRITE)
{
printf(",\"offset\":%u,\"len\":%u", wr->small().offset, wr->small().len);
}
else if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE)
{
if (!dump_with_data)
{
printf(",\"offset\":%u,\"len\":%u,\"location\":%ju", wr->small().offset, wr->small().len, wr->small().location);
}
else
{
printf(",\"data\":\"");
for (uint32_t i = 0; i < wr->small().len; i++)
printf("%02x", buffer_area[wr->small().location + i]);
printf("\"");
}
}
uint8_t* bitmap = wr->get_int_bitmap(heap);
if (bitmap)
{
printf(",\"bitmap\":\"");
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
printf("%02x", bitmap[i]);
printf("\"");
}
bitmap = wr->get_ext_bitmap(heap);
if (bitmap)
{
printf(",\"ext_bitmap\":\"");
for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
printf("%02x", bitmap[i]);
printf("\"");
}
uint8_t *csums = wr->get_checksums(heap);
if (csums)
{
printf(",\"block_csums\":\"");
uint32_t csum_size = wr->get_csum_size(heap);
for (uint32_t i = 0; i < csum_size; i++)
printf("%02x", csums[i]);
printf("\"");
}
if (wr->get_checksum(heap))
{
printf(",\"data_crc32c\":\"%08x\"", *wr->get_checksum(heap));
}
printf("}");
first_wr = false;
}
printf("]}");
first_entry = false;
}
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
printf(
@@ -294,7 +523,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
? BLOCKSTORE_CSUM_CRC32C
: BLOCKSTORE_CSUM_NONE);
new_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(blockstore_meta_header_v2_t));
}
uint32_t new_clean_entry_header_size = (new_hdr->version == BLOCKSTORE_META_FORMAT_V1
? sizeof(clean_disk_entry) : sizeof(clean_disk_entry) + 4 /*entry_csum*/);
@@ -304,6 +533,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
: 0);
new_clean_entry_size = new_clean_entry_header_size + 2*new_clean_entry_bitmap_size + new_data_csum_size;
new_entries_per_block = new_hdr->meta_block_size / new_clean_entry_size;
// FIXME: Use a streaming json parser
for (const auto & e: meta["entries"].array_items())
{
uint64_t data_block = e["block"].uint64_value();
@@ -337,7 +567,379 @@ int disk_tool_t::write_json_meta(json11::Json meta)
}
}
int r = resize_write_new_meta();
free(new_meta_buf);
new_meta_buf = NULL;
free_new_meta();
return r;
}
int disk_tool_t::write_json_heap(json11::Json meta, json11::Json journal)
{
new_meta_hdr->zero = 0;
new_meta_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_meta_hdr->version = BLOCKSTORE_META_FORMAT_HEAP;
new_meta_hdr->meta_block_size = meta["meta_block_size"].uint64_value()
? meta["meta_block_size"].uint64_value() : 4096;
new_meta_hdr->data_block_size = meta["data_block_size"].uint64_value()
? meta["data_block_size"].uint64_value() : 131072;
new_meta_hdr->bitmap_granularity = meta["bitmap_granularity"].uint64_value()
? meta["bitmap_granularity"].uint64_value() : 4096;
new_meta_hdr->data_csum_type = meta["data_csum_type"].is_number()
? meta["data_csum_type"].uint64_value()
: (meta["data_csum_type"].string_value() == "crc32c"
? BLOCKSTORE_CSUM_CRC32C
: BLOCKSTORE_CSUM_NONE);
new_meta_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
new_meta_hdr->header_csum = crc32c(0, new_meta_hdr, sizeof(blockstore_meta_header_v3_t));
new_clean_entry_bitmap_size = (new_meta_hdr->data_block_size / new_meta_hdr->bitmap_granularity + 7) / 8;
new_clean_entry_size = 0;
new_entries_per_block = 0;
new_data_csum_size = (new_meta_hdr->data_csum_type
? ((new_meta_hdr->data_block_size+new_meta_hdr->csum_block_size-1)/new_meta_hdr->csum_block_size*(new_meta_hdr->data_csum_type & 0xFF))
: 0);
new_journal_buf = new_journal_len ? (uint8_t*)memalign(MEM_ALIGNMENT, new_journal_len) : NULL;
if (new_journal_len)
{
memset(new_journal_buf, 0, new_journal_len);
}
uint64_t total_used_space = 0;
uint32_t used_space = 0;
// FIXME: Use a streaming json parser
if (meta["version"] == "3.0")
{
// New format
std::vector<uint8_t> object_buf;
new_heap = new blockstore_heap_t(&dsk, new_journal_buf, 0);
for (const auto & meta_entry: meta["entries"].array_items())
{
bool invalid = false;
object_id oid = {
.inode = (sscanf_json(NULL, meta_entry["pool"]) << (64-POOL_ID_BITS)) | sscanf_json(NULL, meta_entry["inode"]),
.stripe = sscanf_json(NULL, meta_entry["stripe"]),
};
object_buf.clear();
object_buf.resize(sizeof(heap_object_t));
heap_object_t *obj = (heap_object_t*)object_buf.data();
obj->size = sizeof(heap_object_t);
obj->write_pos = meta_entry["writes"].array_items().size() ? sizeof(heap_object_t) : 0;
obj->entry_type = BS_HEAP_OBJECT;
obj->inode = oid.inode;
obj->stripe = oid.stripe;
size_t pos = sizeof(heap_object_t);
heap_write_t *last_wr = NULL;
for (auto & write_entry: meta_entry["writes"].array_items())
{
object_buf.resize(object_buf.size() + new_heap->get_max_write_entry_size());
heap_write_t *wr = (heap_write_t*)(object_buf.data() + pos);
last_wr = wr;
uint8_t wr_type = 0;
if (write_entry["type"] == "small")
wr_type = BS_HEAP_SMALL_WRITE;
else if (write_entry["type"] == "intent")
wr_type = BS_HEAP_INTENT_WRITE;
else if (write_entry["type"] == "big")
wr_type = BS_HEAP_BIG_WRITE;
else if (write_entry["type"] == "tombstone")
wr_type = BS_HEAP_TOMBSTONE;
else
{
fprintf(stderr, "Write entry in %s has invalid type: %s, skipping object\n", meta_entry.dump().c_str(), write_entry["type"].dump().c_str());
invalid = true;
break;
}
wr->entry_type = wr_type | (write_entry["stable"].bool_value() ? BS_HEAP_STABLE : 0);
wr->lsn = write_entry["lsn"].uint64_value();
wr->version = write_entry["version"].uint64_value();
wr->size = wr->get_size(new_heap);
wr->next_pos = wr->size;
if (wr_type == BS_HEAP_SMALL_WRITE || wr_type == BS_HEAP_INTENT_WRITE)
{
wr->small().offset = write_entry["offset"].uint64_value();
wr->small().len = write_entry["len"].uint64_value();
wr->small().location = write_entry["location"].uint64_value();
if (wr_type == BS_HEAP_SMALL_WRITE && write_entry["data"].is_string() && wr->small().len > 0)
{
if (!new_journal_buf)
{
fprintf(stderr, "Loading small write data requires overwriting buffer area\n");
free_new_meta();
return 1;
}
wr->small().location = new_heap->find_free_buffer_area(wr->small().len);
fromhexstr(write_entry["data"].string_value(), wr->small().len, new_journal_buf + wr->small().location);
}
}
else if (wr_type == BS_HEAP_BIG_WRITE)
{
uint64_t loc = write_entry["location"].uint64_value();
assert(!(loc % dsk.data_block_size));
assert((loc / dsk.data_block_size) < 0xFFFF0000);
wr->set_big_location(new_heap, loc);
}
if (write_entry["bitmap"].is_string() && wr->get_int_bitmap(new_heap))
{
fromhexstr(write_entry["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_int_bitmap(new_heap));
}
if (write_entry["ext_bitmap"].is_string() && wr->get_ext_bitmap(new_heap))
{
fromhexstr(write_entry["ext_bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(new_heap));
}
if (write_entry["block_csums"].is_string() && wr->get_checksums(new_heap))
{
fromhexstr(write_entry["block_csums"].string_value(), wr->get_csum_size(new_heap), wr->get_ext_bitmap(new_heap));
}
if (write_entry["data_crc32c"].is_string() && wr->get_checksum(new_heap))
{
*wr->get_checksum(new_heap) = sscanf_json("%jx", write_entry["data_crc32c"]);
}
}
if (invalid)
{
continue;
}
last_wr->next_pos = 0;
new_heap->copy_object(obj, NULL);
}
}
else
{
if (!journal.is_array())
{
fprintf(stderr, "Metadata should include journal in you want to convert it to the \"heap\" format\n");
close_err:
free(new_meta_buf);
new_meta_buf = NULL;
return 1;
}
std::map<object_id, std::vector<json11::Json::object>> journal_by_object;
if (index_journal_by_object(journal, journal_by_object) != 0)
{
goto close_err;
}
journal = json11::Json();
// Convert old format to the new format
uint64_t next_lsn = 0;
uint64_t meta_offset = 0;
const uint32_t space_per_object = sizeof(heap_object_t) + sizeof(heap_write_t) +
new_clean_entry_bitmap_size*2 + new_data_csum_size;
uint64_t buffer_pos = 0;
// FIXME: Rather ugly. Remove the dependency on dsk from heap?
blockstore_disk_t dsk;
dsk.bitmap_granularity = new_meta_hdr->bitmap_granularity;
dsk.block_count = 16;
dsk.data_block_size = new_meta_hdr->data_block_size;
dsk.clean_entry_bitmap_size = new_clean_entry_bitmap_size;
dsk.csum_block_size = new_meta_hdr->csum_block_size;
dsk.data_csum_type = new_meta_hdr->data_csum_type;
dsk.journal_len = 4096;
dsk.meta_area_size = new_meta_len;
dsk.meta_block_size = new_meta_hdr->meta_block_size;
dsk.meta_block_target_free_space = 800;
blockstore_heap_t heap(&dsk, NULL, 0);
for (const auto & meta_entry: meta["entries"].array_items())
{
object_id oid = {
.inode = (sscanf_json(NULL, meta_entry["pool"]) << (64-POOL_ID_BITS)) | sscanf_json(NULL, meta_entry["inode"]),
.stripe = sscanf_json(NULL, meta_entry["stripe"]),
};
uint32_t space_for_this = space_per_object;
auto j_it = journal_by_object.find(oid);
if (j_it != journal_by_object.end())
{
for (auto & rec: j_it->second)
{
if (rec["type"] == "small_write" || rec["type"] == "small_write_instant")
{
uint64_t off = rec["offset"].uint64_value();
uint64_t len = rec["len"].uint64_value();
if (off+len > new_meta_hdr->data_block_size)
{
fprintf(stderr, "Journal entry has too large offset or length: %s\n", json11::Json(rec).dump().c_str());
goto close_err;
}
space_for_this += sizeof(heap_write_t) + new_clean_entry_bitmap_size +
((off+len+new_meta_hdr->csum_block_size-1)/new_meta_hdr->csum_block_size - off/new_meta_hdr->csum_block_size) * (new_meta_hdr->data_csum_type & 0xFF);
}
else /*if (rec["type"] == "big_write" || rec["type"] == "big_write_instant")*/
{
space_for_this += sizeof(heap_write_t) + 2*new_clean_entry_bitmap_size + new_data_csum_size;
}
}
}
if (space_for_this > new_meta_hdr->meta_block_size)
{
fprintf(stderr, "Object doesn't fit in a single metadata block. Object meta: %s, object journal: %s\n",
meta_entry.dump().c_str(), json11::Json(j_it->second).dump().c_str());
goto close_err;
}
if (used_space + space_for_this > new_meta_hdr->meta_block_size-dsk.meta_block_target_free_space)
{
if (used_space < new_meta_hdr->meta_block_size-2)
{
*((uint16_t*)(new_meta_buf + meta_offset + used_space)) = FREE_SPACE_BIT | (uint16_t)(new_meta_hdr->meta_block_size-used_space);
}
meta_offset += new_meta_hdr->meta_block_size;
used_space = 0;
if (meta_offset >= new_meta_len)
{
fprintf(stderr, "Metadata doesn't fit into the new area (total used space: %ju, minimum free space in block: %u/%u)\n",
total_used_space, dsk.meta_block_target_free_space, new_meta_hdr->meta_block_size);
goto close_err;
}
}
heap_object_t *obj = (heap_object_t*)(new_meta_buf + meta_offset + used_space);
obj->size = sizeof(heap_object_t);
obj->write_pos = sizeof(heap_object_t);
obj->entry_type = BS_HEAP_OBJECT;
obj->inode = oid.inode;
obj->stripe = oid.stripe;
heap_write_t *wr = obj->get_writes();
wr->next_pos = 0;
wr->entry_type = BS_HEAP_BIG_WRITE|BS_HEAP_STABLE;
wr->lsn = ++next_lsn;
wr->version = sscanf_json(NULL, meta_entry["version"]);
wr->set_big_location(&heap, meta_entry["block"].uint64_value() * new_meta_hdr->data_block_size);
wr->size = wr->get_size(&heap);
fromhexstr(meta_entry["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_int_bitmap(&heap));
fromhexstr(meta_entry["ext_bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(&heap));
if (new_meta_hdr->data_csum_type != 0)
fromhexstr(meta_entry["data_csum"].string_value(), new_data_csum_size, wr->get_checksums(&heap));
if (j_it != journal_by_object.end())
{
for (auto & rec: j_it->second)
{
wr->next_pos = wr->get_size(&heap);
wr = wr->next();
wr->next_pos = 0;
wr->lsn = ++next_lsn;
wr->version = rec["ver"].uint64_value();
uint64_t wr_offset = rec["offset"].uint64_value();
uint64_t wr_len = rec["len"].uint64_value();
if (rec["type"] == "small_write" || rec["type"] == "small_write_instant")
{
if (wr_len > 0 && !rec["data"].is_string())
{
fprintf(stderr, "Error: entry data is missing, please generate the dump with --json --format data\n");
goto close_err;
}
wr->entry_type = BS_HEAP_SMALL_WRITE | (rec["type"] == "small_write_instant" ? BS_HEAP_STABLE : 0);
wr->small().offset = wr_offset;
wr->small().len = wr_len;
wr->small().location = buffer_pos;
fromhexstr(rec["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(&heap));
fromhexstr(rec["data"].string_value(), wr_len, new_journal_buf+buffer_pos);
if (wr_len > 0)
{
if (!new_meta_hdr->data_csum_type)
*wr->get_checksum(&heap) = crc32c(0, new_journal_buf+buffer_pos, wr_len);
else
heap.calc_block_checksums((uint32_t*)wr->get_checksums(&heap), new_journal_buf+buffer_pos, NULL, wr_offset, wr_offset+wr_len, true, NULL);
}
buffer_pos += wr_len;
}
else if (rec["type"] == "big_write" || rec["type"] == "big_write_instant")
{
wr->entry_type = BS_HEAP_BIG_WRITE | (rec["type"] == "big_write_instant" ? BS_HEAP_STABLE : 0);
wr->set_big_location(&heap, sscanf_json(NULL, rec["loc"]));
bitmap_set(wr->get_int_bitmap(&heap), wr_offset, wr_len, new_meta_hdr->bitmap_granularity);
fromhexstr(rec["bitmap"].string_value(), new_clean_entry_bitmap_size, wr->get_ext_bitmap(&heap));
if (new_meta_hdr->data_csum_type != 0)
{
if ((wr_offset % new_meta_hdr->csum_block_size) || (wr_len % new_meta_hdr->csum_block_size))
{
fprintf(stderr,
"Error: big_write journal entries not aligned to csum_block_size can't be converted between v0.9 and v3.0 metadata\n"
"Stop writes and flush the journal or convert OSDs one by one without the journal if you still want to do it.\n");
goto close_err;
}
fromhexstr(rec["block_csums"].string_value(),
((wr_offset+wr_len+new_meta_hdr->csum_block_size-1)/new_meta_hdr->csum_block_size
- wr_offset/new_meta_hdr->csum_block_size) * (new_meta_hdr->data_csum_type & 0xFF),
wr->get_checksums(&heap));
}
}
else
{
assert(0);
}
wr->size = wr->get_size(&heap);
}
}
obj->crc32c = obj->calc_crc32c();
assert(((uint8_t*)wr + wr->size - (uint8_t*)obj) == space_for_this);
used_space += space_for_this;
total_used_space += space_for_this;
}
if (used_space > 0 && used_space < new_meta_hdr->meta_block_size-2)
{
*((uint16_t*)(new_meta_buf + meta_offset + used_space)) = FREE_SPACE_BIT | (uint16_t)(new_meta_hdr->meta_block_size-used_space);
}
}
int r = resize_write_new_meta();
if (r == 0)
{
r = resize_write_new_journal();
}
free_new_meta();
return r;
}
int disk_tool_t::index_journal_by_object(json11::Json journal,
std::map<object_id, std::vector<json11::Json::object>> & journal_by_object)
{
for (const auto & rec: journal.array_items())
{
object_id oid = {
.inode = sscanf_json(NULL, rec["inode"]),
.stripe = sscanf_json(NULL, rec["stripe"]),
};
auto & jbo = journal_by_object[oid];
if (rec["type"] == "small_write" || rec["type"] == "small_write_instant")
{
jbo.push_back(rec.object_items());
}
else if (rec["type"] == "big_write" || rec["type"] == "big_write_instant")
{
if (rec["type"] == "big_write_instant")
jbo.clear();
jbo.push_back(rec.object_items());
}
else if (rec["type"] == "delete")
{
jbo.clear();
}
else if (rec["type"] == "stable")
{
uint64_t commit_to = rec["version"].uint64_value();
for (size_t i = 0; i < jbo.size(); i++)
{
if (jbo[i]["version"].uint64_value() <= commit_to)
{
if (jbo[i]["type"] == "big_write")
{
jbo.erase(jbo.begin(), jbo.begin()+i);
i = 0;
jbo[i]["type"] = "big_write_instant";
}
else if (jbo[i]["type"] == "small_write")
{
jbo[i]["type"] = "small_write_instant";
}
}
}
}
else if (rec["type"] == "rollback")
{
uint64_t rollback_to = rec["version"].uint64_value();
for (size_t i = jbo.size(); i > 0; i--)
{
if (jbo[i-1]["version"].uint64_value() > rollback_to)
jbo.erase(jbo.begin()+i-1, jbo.begin()+i);
}
}
else
{
fprintf(stderr, "Unknown journal entry type: %s\n", rec.dump().c_str());
return -1;
}
}
return 0;
}
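To make the folding rules above concrete, an editorial example (hypothetical records, not taken from the document): for one object, a big_write of version 1, a small_write of version 2 and a stable record up to version 2 reduce to big_write_instant v1 followed by small_write_instant v2, because the stable record promotes both; a later rollback to version 1 would drop the promoted small write again, and a delete (or a newer big_write_instant) clears the accumulated list entirely.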

View File

@@ -9,6 +9,7 @@
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd, json11::Json::object & result)
{
static const char *allow_additional_params[] = {
"meta_format",
"data_csum_type",
"csum_block_size",
"autosync_writes",
@@ -72,6 +73,10 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
options["disable_"+dev+"_fsync"] = "1";
}
}
if (options["meta_format"] == "")
{
options["meta_format"] = std::to_string(BLOCKSTORE_META_FORMAT_HEAP);
}
if (options["meta_device"] == "" || options["meta_device"] == options["data_device"])
{
options["disable_meta_fsync"] = options["disable_data_fsync"];
@@ -108,35 +113,40 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
if (options.find("autosync_writes") == options.end())
options["autosync_writes"] = "512";
}
uint64_t new_meta_len = parse_size(options["meta_len"]);
json11::Json::object sb;
blockstore_disk_t dsk;
try
{
dsk.parse_config(options);
// Set all offsets to 4096 to calculate metadata size with excess
// Calculate metadata sizes (with excess)
dsk.journal_offset = 4096;
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.meta_offset = 4096 + (dsk.meta_device == dsk.journal_device ? dsk.cfg_journal_size : 0);
dsk.data_offset = 4096 + (dsk.data_device == dsk.meta_device && new_meta_len ? new_meta_len : 0) +
(dsk.data_device == dsk.journal_device ? dsk.cfg_journal_size : 0);
dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct");
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
dsk.calc_lengths();
if (dsk.data_device == dsk.meta_device && !new_meta_len)
dsk.data_offset += (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP ? dsk.min_meta_len*2 : dsk.min_meta_len);
dsk.meta_area_size = (dsk.data_device == dsk.meta_device ? dsk.data_offset : dsk.meta_device_size) - dsk.meta_offset;
sb = json11::Json::object {
{ "meta_format", options["meta_format"] },
{ "data_device", options["data_device"] },
{ "meta_device", options["meta_device"] },
{ "journal_device", options["journal_device"] },
{ "block_size", (uint64_t)dsk.data_block_size },
{ "meta_block_size", dsk.meta_block_size },
{ "journal_block_size", dsk.journal_block_size },
{ "meta_block_size", (uint64_t)dsk.meta_block_size },
{ "journal_block_size", (uint64_t)dsk.journal_block_size },
{ "data_size", dsk.cfg_data_size },
{ "disk_alignment", (uint64_t)dsk.disk_alignment },
{ "bitmap_granularity", dsk.bitmap_granularity },
{ "bitmap_granularity", (uint64_t)dsk.bitmap_granularity },
{ "disable_device_lock", dsk.disable_flock },
{ "journal_offset", 4096 },
{ "meta_offset", 4096 + (dsk.meta_device == dsk.journal_device ? dsk.journal_len : 0) },
{ "data_offset", 4096 + (dsk.data_device == dsk.meta_device ? dsk.meta_len : 0) +
(dsk.data_device == dsk.journal_device ? dsk.journal_len : 0) },
{ "meta_offset", dsk.meta_offset },
{ "data_offset", dsk.data_offset },
{ "journal_no_same_sector_overwrites", !is_hdd || is_hybrid },
{ "journal_sector_buffer_count", 1024 },
{ "disable_data_fsync", json_is_true(options["disable_data_fsync"]) },
@@ -180,7 +190,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
}
sb["osd_num"] = osd_num;
// Zero out metadata and journal
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_len) != 0 ||
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_area_size) != 0 ||
write_zero(dsk.journal_fd, sb["journal_offset"].uint64_value(), dsk.journal_len) != 0)
{
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
@@ -435,7 +445,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
}
std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & dev,
uint64_t osd_per_disk, uint64_t max_other_percent)
uint64_t osd_per_disk, uint64_t max_other_percent, uint64_t *check_new_count)
{
std::vector<std::string> use_parts;
uint64_t want_parts = 0;
@@ -457,7 +467,6 @@ std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & d
{
// Use this partition
use_parts.push_back(part["uuid"].string_value());
osds_exist++;
}
else
{
@@ -480,9 +489,21 @@ std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & d
}
// Still create OSD(s) if a disk has no more than (max_other_percent) other data
if (osds_exist >= osd_per_disk || (dev.free+osds_size) < dev.size*(100-max_other_percent)/100)
{
fprintf(stderr, "%s is already partitioned, skipping\n", dev.path.c_str());
use_parts.clear();
}
else
want_parts = osd_per_disk-osds_exist;
{
if (use_parts.size() >= osd_per_disk-osds_exist)
use_parts.resize(osd_per_disk-osds_exist);
want_parts = osd_per_disk-osds_exist-use_parts.size();
}
}
if (check_new_count)
{
*check_new_count = want_parts;
return use_parts;
}
if (want_parts > 0)
{
@@ -516,9 +537,9 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
dsk.calc_lengths();
dsk.close_all();
meta_size = dsk.meta_len;
meta_size = dsk.min_meta_len;
}
catch (std::exception & e)
{
@@ -684,10 +705,25 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
}
json11::Json::array all_results, errors;
auto journal_size = options["journal_size"];
if (options.find("dry_run") != options.end())
{
json11::Json::array results;
for (auto & dev: devinfo)
{
uint64_t new_part_count = 0;
auto existing_part_count = get_new_data_parts(dev, osd_per_disk, max_other_percent, &new_part_count).size();
results.push_back(json11::Json::object{ { "device_path", dev.path }, { "new_osd_count", existing_part_count+new_part_count } });
if (!json && new_part_count+existing_part_count > 0)
printf("Will initialize %ju OSD(s) on %s\n", existing_part_count+new_part_count, dev.path.c_str());
}
if (json)
printf("%s\n", json11::Json(json11::Json::object{{ "devices", results }}).dump().c_str());
return 0;
}
for (auto & dev: devinfo)
{
// Select new partitions and create an OSD on each of them
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent, NULL))
{
options["force"] = true;
options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);

View File

@@ -1,6 +1,9 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define _XOPEN_SOURCE
#include <limits.h>
#include "disk_tool.h"
#include "rw_blocking.h"
#include "str_util.h"
@@ -24,65 +27,87 @@ int disk_tool_t::raw_resize()
// Parse parameters
r = resize_parse_params();
if (r != 0)
return r;
goto ret;
// Fill allocator
fprintf(stderr, "Reading metadata\n");
data_alloc = new allocator_t((new_data_len < dsk.data_len ? dsk.data_len : new_data_len) / dsk.data_block_size);
r = process_meta(
[this](blockstore_meta_header_v2_t *hdr)
[this](blockstore_meta_header_v3_t *hdr)
{
resize_init(hdr);
},
[this](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
{
for (auto wr = obj->get_writes(); wr; wr = wr->next())
{
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
{
data_alloc->set(wr->big().block_num, true);
}
}
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
data_alloc->set(block_num, true);
}
},
true, true
);
if (r != 0)
return r;
fprintf(stderr, "Reading journal\n");
r = process_journal([this](void *buf)
goto ret;
if (dsk.meta_format != BLOCKSTORE_META_FORMAT_HEAP)
{
return process_journal_block(buf, [this](int num, journal_entry *je)
fprintf(stderr, "Reading journal\n");
r = process_journal([this](void *buf)
{
if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
return process_journal_block(buf, [this](int num, journal_entry *je)
{
data_alloc->set(je->big_write.location / dsk.data_block_size, true);
}
if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
{
data_alloc->set(je->big_write.location / dsk.data_block_size, true);
}
});
});
});
if (r != 0)
return r;
if (r != 0)
goto ret;
}
// Remap blocks
r = resize_remap_blocks();
if (r != 0)
return r;
goto ret;
// Copy data blocks into new places
fprintf(stderr, "Moving data blocks\n");
r = resize_copy_data();
if (r != 0)
return r;
// Rewrite journal
fprintf(stderr, "Rebuilding journal\n");
r = resize_rewrite_journal();
if (r != 0)
return r;
goto ret;
// Rewrite metadata
resize_alloc_journal();
fprintf(stderr, "Rebuilding metadata\n");
r = resize_rewrite_meta();
r = resize_rebuild_meta();
if (r != 0)
return r;
goto ret;
if (new_meta_format != BLOCKSTORE_META_FORMAT_HEAP)
{
// Rewrite journal
fprintf(stderr, "Rebuilding journal\n");
r = resize_rebuild_journal();
if (r != 0)
goto ret;
fprintf(stderr, "Writing new journal\n");
}
else
fprintf(stderr, "Writing new buffer area\n");
// Write new journal
fprintf(stderr, "Writing new journal\n");
r = resize_write_new_journal();
if (r != 0)
return r;
goto ret;
// Write new metadata
fprintf(stderr, "Writing new metadata\n");
r = resize_write_new_meta();
if (r != 0)
return r;
goto ret;
fprintf(stderr, "Done\n");
ret:
free_new_meta();
return r;
}
@@ -126,6 +151,8 @@ int disk_tool_t::resize_parse_params()
? parse_size(options["new_journal_offset"]) : dsk.journal_offset;
new_journal_len = options.find("new_journal_len") != options.end()
? parse_size(options["new_journal_len"]) : dsk.journal_len;
new_meta_format = options.find("new_meta_format") != options.end()
? stoull_full(options["new_meta_format"]) : 0;
if (new_data_len+new_data_offset > dsk.data_device_size)
new_data_len = dsk.data_device_size-new_data_offset;
if (new_meta_device == dsk.data_device && new_data_offset < new_meta_offset &&
@@ -139,7 +166,7 @@ int disk_tool_t::resize_parse_params()
new_data_offset == dsk.data_offset &&
new_data_len == dsk.data_len &&
new_meta_offset == dsk.meta_offset &&
(new_meta_len == dsk.meta_len || new_meta_len == 0) &&
(new_meta_len == dsk.meta_area_size || new_meta_len == 0) &&
new_journal_offset == dsk.journal_offset &&
new_journal_len == dsk.journal_len &&
options.find("force") == options.end())
@@ -151,7 +178,7 @@ int disk_tool_t::resize_parse_params()
return 0;
}
void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
void disk_tool_t::resize_init(blockstore_meta_header_v3_t *hdr)
{
if (hdr && dsk.data_block_size != hdr->data_block_size)
{
@@ -170,6 +197,15 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
dsk.data_csum_type = hdr->data_csum_type;
dsk.csum_block_size = hdr->csum_block_size;
}
if (hdr && dsk.meta_format != hdr->version)
{
dsk.meta_format = hdr->version;
}
if (new_meta_format == 0)
{
new_meta_format = hdr && hdr->version == BLOCKSTORE_META_FORMAT_HEAP ? BLOCKSTORE_META_FORMAT_HEAP : BLOCKSTORE_META_FORMAT_V2;
}
dsk.calc_lengths();
if (((new_data_offset-dsk.data_offset) % dsk.data_block_size))
{
fprintf(stderr, "Data alignment mismatch: old data offset is 0x%jx, new is 0x%jx, but alignment on %x should be equal\n",
@@ -359,15 +395,57 @@ int disk_tool_t::resize_copy_data()
return 0;
}
int disk_tool_t::resize_rewrite_journal()
void disk_tool_t::resize_alloc_journal()
{
// Simply overwriting on the fly may be impossible because old and new areas may overlap
// For now, just build new journal data in memory
new_journal_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_journal_len);
memset(new_journal_buf, 0, new_journal_len);
new_journal_ptr = new_journal_buf;
new_journal_data = new_journal_ptr + dsk.journal_block_size;
new_journal_in_pos = 0;
memset(new_journal_buf, 0, new_journal_len);
}
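// build_journal_start(): write a JE_START entry into the current journal block of the new in-memory journal and advance new_journal_ptr/new_journal_data to the following blocks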
void disk_tool_t::build_journal_start()
{
journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
*((journal_entry_start*)ne) = (journal_entry_start){
.magic = JOURNAL_MAGIC,
.type = JE_START,
.size = sizeof(journal_entry_start),
.journal_start = dsk.journal_block_size,
.version = JOURNAL_VERSION_V2,
.data_csum_type = dsk.data_csum_type,
.csum_block_size = dsk.csum_block_size,
};
ne->crc32 = je_crc32(ne);
new_journal_ptr += dsk.journal_block_size;
new_journal_data = new_journal_ptr+dsk.journal_block_size;
new_journal_in_pos = 0;
}
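// choose_journal_block(): move to the next journal block if the current one can't fit another je_size bytes; fails if the new journal is full or the entry is larger than a block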
void disk_tool_t::choose_journal_block(uint32_t je_size)
{
if (dsk.journal_block_size < new_journal_in_pos+je_size)
{
new_journal_ptr = new_journal_data;
if (new_journal_ptr-new_journal_buf >= new_journal_len)
{
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
exit(1);
}
new_journal_data = new_journal_ptr+dsk.journal_block_size;
new_journal_in_pos = 0;
if (dsk.journal_block_size < je_size)
{
fprintf(stderr, "Error: journal entry too large (%u bytes)\n", je_size);
exit(1);
}
}
}
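// resize_rebuild_journal(): re-read the old journal and copy its live entries into the in-memory image of the new journal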
int disk_tool_t::resize_rebuild_journal()
{
// Simply overwriting on the fly may be impossible because old and new areas may overlap
// For now, just build new journal data in memory
process_journal([this](void *buf)
{
return process_journal_block(buf, [this](int num, journal_entry *je)
@@ -384,39 +462,11 @@ int disk_tool_t::resize_rewrite_journal()
);
exit(1);
}
journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
*((journal_entry_start*)ne) = (journal_entry_start){
.magic = JOURNAL_MAGIC,
.type = JE_START,
.size = sizeof(journal_entry_start),
.journal_start = dsk.journal_block_size,
.version = JOURNAL_VERSION_V2,
.data_csum_type = dsk.data_csum_type,
.csum_block_size = dsk.csum_block_size,
};
ne->crc32 = je_crc32(ne);
new_journal_ptr += dsk.journal_block_size;
new_journal_data = new_journal_ptr+dsk.journal_block_size;
new_journal_in_pos = 0;
build_journal_start();
}
else
{
if (dsk.journal_block_size < new_journal_in_pos+je->size)
{
new_journal_ptr = new_journal_data;
if (new_journal_ptr-new_journal_buf >= new_journal_len)
{
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
exit(1);
}
new_journal_data = new_journal_ptr+dsk.journal_block_size;
new_journal_in_pos = 0;
if (dsk.journal_block_size < je->size)
{
fprintf(stderr, "Error: journal entry too large (%u bytes)\n", je->size);
exit(1);
}
}
choose_journal_block(je->size);
journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
memcpy(ne, je, je->size);
ne->crc32_prev = new_crc32_prev;
@@ -463,30 +513,170 @@ int disk_tool_t::resize_write_new_journal()
fsync(new_journal_fd);
close(new_journal_fd);
new_journal_fd = -1;
free(new_journal_buf);
new_journal_buf = NULL;
return 0;
}
int disk_tool_t::resize_rewrite_meta()
int disk_tool_t::resize_rebuild_meta()
{
new_meta_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_meta_len);
memset(new_meta_buf, 0, new_meta_len);
if (new_meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
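// Target is the heap metadata format: metadata is rebuilt through an in-memory blockstore_heap_t plus a separate header block instead of a flat clean-entry buffer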
new_dsk = dsk;
new_dsk.data_offset = new_data_offset;
new_dsk.data_len = new_data_len;
new_dsk.block_count = new_data_len / dsk.data_block_size;
new_dsk.journal_device = new_journal_device;
new_dsk.journal_offset = new_journal_offset;
new_dsk.journal_len = new_journal_len;
new_dsk.meta_device = new_meta_device;
new_dsk.meta_offset = new_meta_offset;
new_dsk.meta_area_size = new_meta_len;
new_dsk.meta_format = new_meta_format;
new_heap = new blockstore_heap_t(&new_dsk, NULL, 0);
new_meta_hdr = (blockstore_meta_header_v3_t *)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
memset(new_meta_hdr, 0, dsk.meta_block_size);
}
else
{
new_meta_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_meta_len);
memset(new_meta_buf, 0, new_meta_len);
new_meta_hdr = (blockstore_meta_header_v3_t *)new_meta_buf;
}
std::vector<heap_write_t*> writes;
int r = process_meta(
[this](blockstore_meta_header_v2_t *hdr)
[&](blockstore_meta_header_v3_t *hdr)
{
blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf;
new_hdr->zero = 0;
new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_hdr->version = BLOCKSTORE_META_FORMAT_V2;
new_hdr->meta_block_size = dsk.meta_block_size;
new_hdr->data_block_size = dsk.data_block_size;
new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
new_hdr->data_csum_type = dsk.data_csum_type;
new_hdr->csum_block_size = dsk.csum_block_size;
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
new_meta_hdr->zero = 0;
new_meta_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_meta_hdr->version = new_meta_format == 0 ? BLOCKSTORE_META_FORMAT_HEAP : new_meta_format;
new_meta_hdr->meta_block_size = dsk.meta_block_size;
new_meta_hdr->data_block_size = dsk.data_block_size;
new_meta_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
new_meta_hdr->data_csum_type = dsk.data_csum_type;
new_meta_hdr->csum_block_size = dsk.csum_block_size;
new_meta_hdr->compacted_lsn = hdr->compacted_lsn;
new_meta_hdr->header_csum = 0;
new_meta_hdr->header_csum = crc32c(0, new_meta_hdr, new_meta_hdr->version == BLOCKSTORE_META_FORMAT_HEAP
? sizeof(blockstore_meta_header_v3_t) : sizeof(blockstore_meta_header_v2_t));
if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP && new_meta_format != BLOCKSTORE_META_FORMAT_HEAP)
{
build_journal_start();
}
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
[&](blockstore_heap_t *heap, heap_object_t *obj, uint32_t meta_block_num)
{
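// For every object read from the old heap metadata: remap big-write block numbers, relocate small-write data, then either copy the object into the new heap or emit old-format journal and clean entries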
for (auto wr = obj->get_writes(); wr; wr = wr->next())
{
if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_BIG_WRITE)
{
uint64_t block_num = wr->big().block_num;
auto remap_it = data_remap.find(block_num);
if (remap_it != data_remap.end())
block_num = remap_it->second;
if (block_num < free_first || block_num >= total_blocks-free_last)
{
fprintf(stderr, "BUG: remapped block %ju not in range %ju..%ju\n", block_num, free_first, total_blocks-free_last);
exit(1);
}
block_num += data_idx_diff;
wr->big().block_num = block_num;
}
else if ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE)
{
if (new_heap && wr->small().len > 0)
{
if (new_journal_ptr-new_journal_buf+wr->small().len > new_journal_len)
{
fprintf(stderr, "Small write data doesn't fit into the new buffer area\n");
exit(1);
}
memcpy(new_journal_ptr, buffer_area+wr->small().location, wr->small().len);
wr->small().location = new_journal_ptr-new_journal_buf;
new_journal_ptr += wr->small().len;
}
}
else if (!new_heap)
{
fprintf(stderr, "Object %jx:%jx can't be converted to the old format because it contains %s\n",
obj->inode, obj->stripe, (wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_TOMBSTONE
? "a tombstone" : ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_INTENT_WRITE ? "an intent_write entry" : "an unknown entry"));
exit(1);
}
}
if (new_heap)
{
// New -> New
new_heap->copy_object(obj, NULL);
}
else
{
// Fill journal
writes.clear();
for (auto wr = obj->get_writes(); wr; wr = wr->next())
{
writes.push_back(wr);
}
for (ssize_t i = writes.size()-2; i >= 0; i--)
{
auto wr = writes[i];
assert((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE || wr->entry_type == BS_HEAP_BIG_WRITE);
uint32_t je_size = ((wr->entry_type & BS_HEAP_TYPE) == BS_HEAP_SMALL_WRITE
? sizeof(journal_entry_small_write) + dsk.dirty_dyn_size(wr->small().offset, wr->small().len)
: sizeof(journal_entry_big_write) + dsk.dirty_dyn_size(0, dsk.data_block_size));
choose_journal_block(je_size);
journal_entry *je = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
je->magic = JOURNAL_MAGIC;
je->type = (wr->entry_type & BS_HEAP_STABLE) ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE;
je->size = je_size;
je->crc32_prev = new_crc32_prev;
je->small_write.oid = (object_id){ .inode = obj->inode, .stripe = obj->stripe };
je->small_write.version = wr->version;
if (wr->type() == BS_HEAP_SMALL_WRITE)
{
je->small_write.offset = wr->small().offset;
je->small_write.len = wr->small().len;
je->small_write.data_offset = new_journal_data-new_journal_buf;
if (je->small_write.data_offset + je->small_write.len > new_journal_len)
{
fprintf(stderr, "Error: live entries don't fit to the new journal\n");
exit(1);
}
memcpy(new_journal_data, buffer_area+wr->small().location, je->small_write.len);
new_journal_data += je->small_write.len;
if (dsk.data_csum_type == 0 && wr->get_checksum(heap))
je->small_write.crc32_data = *wr->get_checksum(heap);
}
else
{
je->big_write.location = wr->big_location(heap);
}
memcpy((uint8_t*)je + je->size, wr->get_ext_bitmap(heap), new_clean_entry_bitmap_size);
if (dsk.data_csum_type != 0 && wr->get_checksums(heap))
{
memcpy((uint8_t*)je + je->size + new_clean_entry_bitmap_size, wr->get_checksums(heap), wr->get_csum_size(heap));
}
je->crc32 = je_crc32(je);
new_journal_in_pos += je->size;
new_crc32_prev = je->crc32;
}
// New -> Old
if (writes[writes.size()-1]->entry_type == (BS_HEAP_BIG_WRITE|BS_HEAP_STABLE))
{
auto big_wr = writes[writes.size()-1];
uint64_t block_num = big_wr->big().block_num;
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size +
dsk.meta_block_size*(block_num / new_entries_per_block) +
new_clean_entry_size*(block_num % new_entries_per_block));
new_entry->oid = (object_id){ .inode = obj->inode, .stripe = obj->stripe };
new_entry->version = big_wr->version;
memcpy(new_entry->bitmap, big_wr->get_ext_bitmap(heap), new_clean_entry_bitmap_size);
memcpy(new_entry->bitmap + new_clean_entry_bitmap_size, big_wr->get_int_bitmap(heap), new_clean_entry_bitmap_size);
memcpy(new_entry->bitmap + 2*new_clean_entry_bitmap_size, big_wr->get_checksums(heap), new_data_csum_size);
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
}
}
},
[&](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
auto remap_it = data_remap.find(block_num);
if (remap_it != data_remap.end())
@@ -497,26 +687,42 @@ int disk_tool_t::resize_rewrite_meta()
exit(1);
}
block_num += data_idx_diff;
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size +
dsk.meta_block_size*(block_num / new_entries_per_block) +
new_clean_entry_size*(block_num % new_entries_per_block));
new_entry->oid = entry->oid;
new_entry->version = entry->version;
if (bitmap)
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
if (new_heap)
{
// Old -> New
uint8_t wr_buf[new_heap->get_max_write_entry_size()];
heap_write_t *wr = (heap_write_t*)wr_buf;
wr->entry_type = BS_HEAP_BIG_WRITE|BS_HEAP_STABLE;
wr->big().block_num = block_num;
wr->next_pos = 0;
wr->size = wr->get_size(new_heap);
if (bitmap)
{
memcpy(wr->get_ext_bitmap(new_heap), bitmap, new_clean_entry_bitmap_size);
memcpy(wr->get_int_bitmap(new_heap), bitmap+new_clean_entry_bitmap_size, new_clean_entry_bitmap_size);
memcpy(wr->get_checksums(new_heap), bitmap+2*new_clean_entry_bitmap_size, new_data_csum_size);
}
new_heap->post_write(entry->oid, wr, NULL, NULL);
}
else
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
}
{
// Old -> Old
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size +
dsk.meta_block_size*(block_num / new_entries_per_block) +
new_clean_entry_size*(block_num % new_entries_per_block));
new_entry->oid = entry->oid;
new_entry->version = entry->version;
if (bitmap)
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
else
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
}
},
true, true
);
if (r != 0)
{
free(new_meta_buf);
new_meta_buf = NULL;
return r;
}
return 0;
return r;
}
int disk_tool_t::resize_write_new_meta()
@@ -528,11 +734,60 @@ int disk_tool_t::resize_write_new_meta()
return 1;
}
lseek64(new_meta_fd, new_meta_offset, 0);
write_blocking(new_meta_fd, new_meta_buf, new_meta_len);
if (new_meta_buf)
{
write_blocking(new_meta_fd, new_meta_buf, new_meta_len);
}
else
{
assert(new_heap);
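// Heap format: write the header block followed by every heap metadata block (zero-filled where the heap has none), batching writev() calls at IOV_MAX iovecs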
uint32_t new_meta_blocks = new_meta_len / dsk.meta_block_size - 1;
uint8_t *zero_block = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
memset(zero_block, 0, dsk.meta_block_size);
std::vector<iovec> iov;
iov.reserve(IOV_MAX);
iov.push_back((iovec){ .iov_base = new_meta_hdr, .iov_len = dsk.meta_block_size });
for (uint32_t i = 0; i < new_meta_blocks; i++)
{
uint8_t *data = new_heap->get_meta_block(i);
iov.push_back((iovec){ .iov_base = data ? data : zero_block, .iov_len = dsk.meta_block_size });
if (iov.size() >= IOV_MAX)
{
writev_blocking(new_meta_fd, iov.data(), iov.size());
iov.clear();
}
}
if (iov.size() > 0)
writev_blocking(new_meta_fd, iov.data(), iov.size());
free(zero_block);
zero_block = NULL;
}
fsync(new_meta_fd);
close(new_meta_fd);
new_meta_fd = -1;
free(new_meta_buf);
new_meta_buf = NULL;
return 0;
}
void disk_tool_t::free_new_meta()
{
if (new_heap)
{
delete new_heap;
new_heap = NULL;
}
if ((uint8_t*)new_meta_hdr != new_meta_buf)
{
free(new_meta_hdr);
new_meta_hdr = NULL;
}
if (new_meta_buf)
{
free(new_meta_buf);
new_meta_buf = NULL;
}
if (new_journal_buf)
{
free(new_journal_buf);
new_journal_buf = NULL;
}
}

View File

@@ -29,7 +29,7 @@ int disk_tool_t::resize_data(std::string device)
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
dsk.calc_lengths();
}
catch (std::exception & e)
{
@@ -61,7 +61,7 @@ int disk_tool_t::resize_data(std::string device)
dsk.journal_fd = old_journal_fd;
dsk.meta_fd = old_meta_fd;
dsk.data_fd = old_data_fd;
dsk.calc_lengths(true);
dsk.calc_lengths();
dsk.journal_fd = -1;
dsk.meta_fd = -1;
dsk.data_fd = -1;
@@ -82,8 +82,10 @@ int disk_tool_t::resize_data(std::string device)
auto new_meta_device = move_options.find("new_meta_device") != move_options.end()
? move_options["new_meta_device"] : dsk.meta_device;
// Calculate new data & meta offsets
if (!new_meta_len)
new_meta_len = (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP ? dsk.min_meta_len*2 : dsk.min_meta_len);
new_data_offset = 4096 + (new_journal_device == dsk.data_device ? new_journal_len : 0) +
(new_meta_device == dsk.data_device ? dsk.meta_len : 0);
(new_meta_device == dsk.data_device ? new_meta_len : 0);
new_data_offset += ((dsk.data_offset-new_data_offset) % dsk.data_block_size);
if (new_data_offset != dsk.data_offset)
move_options["new_data_offset"] = std::to_string(new_data_offset);
@@ -236,7 +238,7 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
auto new_journal_device = move_options.find("new_journal_device") != move_options.end()
? move_options["new_journal_device"] : dsk.journal_device;
move_options["new_meta_device"] = dsk.data_device;
move_options["new_meta_len"] = std::to_string(dsk.meta_len);
move_options["new_meta_len"] = std::to_string(new_meta_len);
}
else
{
@@ -246,7 +248,6 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
std::string parent_dev = get_parent_device(real_dev);
if (parent_dev == "")
return 1;
uint64_t new_meta_len = 0;
if (parent_dev == real_dev)
{
// whole disk - create partition
@@ -260,7 +261,7 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
fprintf(stderr, "metadata is already on a partition of %s\n", options["move_meta"].c_str());
return 0;
}
new_meta_len = ((dsk.meta_len+1024*1024-1)/1024/1024)*1024*1024;
new_meta_len = ((dsk.meta_area_size+1024*1024-1)/1024/1024)*1024*1024;
if (!dry_run)
{
auto devinfos = collect_devices({ real_dev });

View File

@@ -107,7 +107,7 @@ int disk_tool_t::upgrade_simple_unit(std::string unit)
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
dsk.calc_lengths();
dsk.close_all();
}
catch (std::exception & e)
@@ -116,9 +116,8 @@ int disk_tool_t::upgrade_simple_unit(std::string unit)
fprintf(stderr, "Error: %s\n", e.what());
return 1;
}
options.erase("meta_format");
if (m_is_d && m_o < d_o && d_o-m_o < dsk.meta_len)
d_o += ((dsk.meta_len - (d_o-m_o) + blk-1) / blk) * blk;
if (m_is_d && m_o < d_o && d_o-m_o < dsk.min_meta_len)
d_o += ((dsk.min_meta_len - (d_o-m_o) + blk-1) / blk) * blk;
}
resize["new_data_offset"] = d_o;
resize["new_meta_offset"] = m_o;

View File

@@ -292,7 +292,7 @@ void osd_t::parse_config(bool init)
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
scrub_list_limit = config["scrub_list_limit"].uint64_value();
if (!scrub_list_limit)
scrub_list_limit = 1000;
scrub_list_limit = 262144;
if (!old_auto_scrub && auto_scrub)
{
// Schedule scrubbing

View File

@@ -98,6 +98,12 @@ struct osd_pg_lock_t
uint64_t state = 0;
};
struct osd_unstable_wr_t
{
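// Per-object tracking of unstable writes: the latest unsynced version and the number of unsynced versions accumulated so far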
uint64_t latest_ver = 0;
uint64_t ver_count = 0;
};
class osd_t
{
// config
@@ -123,6 +129,7 @@ class osd_t
int slow_log_interval = 10;
int immediate_commit = IMMEDIATE_NONE;
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
int autosync_dirty_per_object = 16;
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
uint64_t recovery_queue_depth = 1;
uint64_t recovery_sleep_us = 0;
@@ -142,7 +149,7 @@ class osd_t
uint64_t global_scrub_interval = 30*86400;
uint64_t scrub_queue_depth = 1;
uint64_t scrub_sleep_ms = 0;
uint32_t scrub_list_limit = 1000;
uint32_t scrub_list_limit = 262144;
bool scrub_find_best = true;
uint64_t scrub_ec_max_bruteforce = 100;
bool enable_pg_locks = false;
@@ -195,7 +202,8 @@ class osd_t
// Unstable writes
uint64_t unstable_write_count = 0;
std::map<osd_object_id_t, uint64_t> unstable_writes;
uint64_t unstable_per_object = 0;
std::map<osd_object_id_t, osd_unstable_wr_t> unstable_writes;
std::deque<osd_op_t*> syncs_in_progress;
// client & peer I/O
@@ -265,7 +273,6 @@ class osd_t
void report_statistics();
void report_pg_state(pg_t & pg);
void report_pg_states();
void apply_no_inode_stats();
void apply_pg_count();
void apply_pg_config();

View File

@@ -274,19 +274,27 @@ void osd_t::report_statistics()
json11::Json::object last_stat;
pool_id_t last_pool = 0;
std::map<uint64_t, uint64_t> bs_empty_space;
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
for (auto kv: bs_inode_space)
const auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
for (auto it = bs_inode_space.begin(); it != bs_inode_space.end(); )
{
pool_id_t pool_id = INODE_POOL(kv.first);
uint64_t only_inode_num = INODE_NO_POOL(kv.first);
pool_id_t pool_id = INODE_POOL(it->first);
uint64_t only_inode_num = INODE_NO_POOL(it->first);
if (!last_pool || pool_id != last_pool)
{
auto pool_it = st_cli.pool_config.find(pool_id);
if (pool_it != st_cli.pool_config.end() && !pool_it->second.used_for_app.empty())
{
// Skip pool
it = bs_inode_space.lower_bound(INODE_WITH_POOL(pool_id+1, 0));
continue;
}
if (last_pool)
inode_space[std::to_string(last_pool)] = last_stat;
last_stat = json11::Json::object();
last_pool = pool_id;
}
last_stat[std::to_string(only_inode_num)] = kv.second;
last_stat[std::to_string(only_inode_num)] = it->second;
it++;
}
if (last_pool)
inode_space[std::to_string(last_pool)] = last_stat;
@@ -461,10 +469,6 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
parse_config(false);
}
bool pools = changes.find(st_cli.etcd_prefix+"/config/pools") != changes.end();
if (pools)
{
apply_no_inode_stats();
}
if (run_primary)
{
bool pgs = changes.find(st_cli.etcd_prefix+"/pg/config") != changes.end();
@@ -495,8 +499,6 @@ void osd_t::on_reload_config_hook(json11::Json::object & global_config)
// Acquire lease
void osd_t::acquire_lease()
{
// Apply no_inode_stats before the first statistics report
apply_no_inode_stats();
// Maximum lease TTL is (report interval) + retries * (timeout + repeat interval)
st_cli.etcd_call("/lease/grant", json11::Json::object {
{ "TTL", etcd_report_interval+(st_cli.max_etcd_attempts*(2*st_cli.etcd_quick_timeout)+999)/1000 }
@@ -685,7 +687,6 @@ void osd_t::on_load_pgs_hook(bool success)
else
{
peering_state &= ~OSD_LOADING_PGS;
apply_no_inode_stats();
if (run_primary)
{
apply_pg_count();
@@ -694,23 +695,6 @@ void osd_t::on_load_pgs_hook(bool success)
}
}
void osd_t::apply_no_inode_stats()
{
if (!bs)
{
return;
}
std::vector<uint64_t> no_inode_stats;
for (auto & pool_item: st_cli.pool_config)
{
if (!pool_item.second.used_for_app.empty())
{
no_inode_stats.push_back(pool_item.first);
}
}
bs->set_no_inode_stats(no_inode_stats);
}
void osd_t::apply_pg_count()
{
for (auto & pool_item: st_cli.pool_config)

View File

@@ -5,18 +5,6 @@
#include "object_id.h"
#define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2
#define POOL_SCHEME_EC 3
#define POOL_ID_MAX 0x10000
#define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
#define INODE_NO_POOL(inode) (inode_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
#define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
// Pool ID is 16 bits long
typedef uint32_t pool_id_t;
typedef uint64_t osd_num_t;
typedef uint32_t pg_num_t;

View File

@@ -128,6 +128,8 @@ void pg_obj_state_check_t::handle_version()
n_copies++;
if (replicated && replica > 0 || replica >= pg->pg_size)
{
printf("Object %jx:%jx has invalid chunk number: %u > %u\n", list[list_pos].oid.inode,
list[list_pos].oid.stripe, replica, replicated ? 0 : pg->pg_size);
n_invalid++;
}
else

View File

@@ -790,9 +790,9 @@ resume_5:
if (immediate_commit == IMMEDIATE_NONE)
{
unstable_write_count++;
if (unstable_write_count >= autosync_writes)
if (unstable_write_count >= autosync_writes ||
unstable_per_object >= autosync_dirty_per_object)
{
unstable_write_count = 0;
autosync();
}
}

View File

@@ -9,7 +9,6 @@
#define SUBMIT_READ 0
#define SUBMIT_RMW_READ 1
#define SUBMIT_WRITE 2
#define SUBMIT_SCRUB_READ 3
struct unstable_osd_num_t
{
@@ -44,6 +43,7 @@ struct osd_primary_op_data_t
osd_num_t *dirty_osds;
int dirty_osd_count;
obj_ver_id *unstable_writes;
uint64_t *unstable_ver_counts;
obj_ver_osd_t *copies_to_delete;
int copies_to_delete_count;
};

View File

@@ -130,7 +130,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
n_subops++;
}
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep))
if (zero_read >= 0 && !n_subops && (submit_type == SUBMIT_RMW_READ || rep))
n_subops = 1;
else
zero_read = -1;
@@ -153,13 +153,13 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
for (int role = 0; role < (op_data->pg ? op_data->pg->pg_size : 1); role++)
{
// We always submit zero-length writes to all replicas, even if the stripe is not modified
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
{
continue;
}
osd_num_t role_osd_num = osd_set[role];
int stripe_num = rep ? 0 : role;
osd_rmw_stripe_t *si = stripes + (submit_type == SUBMIT_SCRUB_READ ? role : stripe_num);
osd_rmw_stripe_t *si = stripes + stripe_num;
if (role_osd_num != 0)
{
si->osd_num = role_osd_num;

View File

@@ -45,7 +45,10 @@ resume_2:
if (unstable_writes.size() > 0)
{
op_data->unstable_write_osds = new std::vector<unstable_osd_num_t>();
op_data->unstable_writes = new obj_ver_id[this->unstable_writes.size()];
op_data->unstable_writes = (obj_ver_id*)malloc_or_die(
(sizeof(obj_ver_id) + sizeof(uint64_t)) * this->unstable_writes.size());
op_data->unstable_ver_counts = (uint64_t*)((uint8_t*)op_data->unstable_writes +
sizeof(obj_ver_id) * this->unstable_writes.size());
osd_num_t last_osd = 0;
int last_start = 0, last_end = 0;
for (auto it = this->unstable_writes.begin(); it != this->unstable_writes.end(); it++)
@@ -65,8 +68,9 @@ resume_2:
}
op_data->unstable_writes[last_end] = (obj_ver_id){
.oid = it->first.oid,
.version = it->second,
.version = it->second.latest_ver,
};
op_data->unstable_ver_counts[last_end] = it->second.ver_count;
last_end++;
}
if (last_osd != 0)
@@ -78,6 +82,8 @@ resume_2:
});
}
this->unstable_writes.clear();
this->unstable_write_count = 0;
this->unstable_per_object = 0;
}
{
op_data->dirty_pg_count = dirty_pgs.size();
@@ -175,11 +181,12 @@ resume_6:
};
if (pgs.at(wpg).state & PG_ACTIVE)
{
uint64_t & dest = this->unstable_writes[(osd_object_id_t){
auto & dest = this->unstable_writes[(osd_object_id_t){
.osd_num = unstable_osd.osd_num,
.oid = w.oid,
}];
dest = dest < w.version ? w.version : dest;
dest.latest_ver = dest.latest_ver < w.version ? w.version : dest.latest_ver;
dest.ver_count += op_data->unstable_ver_counts[unstable_osd.start + i];
dirty_pgs.insert(wpg);
}
}
@@ -236,7 +243,7 @@ resume_8:
if (op_data->unstable_writes)
{
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
free(op_data->unstable_writes);
op_data->unstable_writes = NULL;
op_data->unstable_write_osds = NULL;
}

View File

@@ -409,9 +409,9 @@ continue_others:
}
// finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
finish_op(cur_op, cur_op->reply.hdr.retval);
if (unstable_write_count >= autosync_writes)
if (unstable_write_count >= autosync_writes ||
unstable_per_object >= autosync_dirty_per_object)
{
unstable_write_count = 0;
autosync();
}
if (next_op)
@@ -544,13 +544,17 @@ lazy:
for (auto & chunk: loc_set)
{
this->dirty_osds.insert(chunk.osd_num);
this->unstable_writes[(osd_object_id_t){
auto & unst = this->unstable_writes[(osd_object_id_t){
.osd_num = chunk.osd_num,
.oid = {
.inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | chunk.role,
},
}] = op_data->fact_ver;
}];
unst.latest_ver = op_data->fact_ver;
unst.ver_count++;
if (unstable_per_object < unst.ver_count)
unstable_per_object = unst.ver_count;
}
}
else

View File

@@ -34,6 +34,25 @@ add_executable(test_allocator EXCLUDE_FROM_ALL test_allocator.cpp ../util/alloca
add_dependencies(build_tests test_allocator)
add_test(NAME test_allocator COMMAND test_allocator)
# test_heap
add_executable(test_heap
EXCLUDE_FROM_ALL
test_heap.cpp
../blockstore/multilist.cpp
../blockstore/blockstore_heap.cpp
../util/crc32c.c
../util/allocator.cpp
../blockstore/blockstore_disk.cpp
../util/str_util.cpp
)
target_link_libraries(test_heap
${ISAL_LIBRARIES}
)
add_dependencies(build_tests test_heap)
add_test(NAME test_heap COMMAND test_heap)
target_compile_options(test_heap PRIVATE -coverage)
target_link_options(test_heap PRIVATE -coverage)
# test_cas
add_executable(test_cas
test_cas.cpp
@@ -47,11 +66,15 @@ add_executable(test_crc32
test_crc32.cpp
)
target_link_libraries(test_crc32
vitastor_blk
vitastor_blk ${ISAL_LIBRARIES}
)
## test_blockstore, test_shit
#add_executable(test_blockstore test_blockstore.cpp)
#target_link_libraries(test_blockstore blockstore)
# test_blockstore
add_executable(test_blockstore EXCLUDE_FROM_ALL test_blockstore.cpp ringloop_mock.cpp)
add_dependencies(build_tests test_blockstore)
target_link_libraries(test_blockstore vitastor_blk vitastor_common ${ISAL_LIBRARIES})
add_test(NAME test_blockstore COMMAND test_blockstore)
## test_shit
#add_executable(test_shit test_shit.cpp osd_peering_pg.cpp)
#target_link_libraries(test_shit ${LIBURING_LIBRARIES} m)

View File

@@ -16,6 +16,7 @@
#include <stdexcept>
#include "malloc_or_die.h"
#include "addr_util.h"
#include "osd_ops.h"
#include "rw_blocking.h"
@@ -194,7 +195,7 @@ uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ver
op.sec_rw.version = version;
op.sec_rw.offset = offset;
op.sec_rw.len = len;
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
void *data = memalign_or_die(MEM_ALIGNMENT, op.sec_rw.len);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, op.sec_rw.len))
@@ -221,7 +222,7 @@ uint64_t test_read(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ver
{
return 0;
}
data = memalign(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
data = memalign_or_die(MEM_ALIGNMENT, sizeof(obj_ver_id)*reply.hdr.retval);
r = read_blocking(connect_fd, data, sizeof(obj_ver_id)*reply.hdr.retval);
if (r != sizeof(obj_ver_id)*reply.hdr.retval)
{
@@ -254,7 +255,7 @@ uint64_t test_write(int connect_fd, uint64_t inode, uint64_t stripe, uint64_t ve
op.sec_rw.version = version;
op.sec_rw.offset = 0;
op.sec_rw.len = 128*1024;
void *data = memalign(MEM_ALIGNMENT, op.sec_rw.len);
void *data = memalign_or_die(MEM_ALIGNMENT, op.sec_rw.len);
for (int i = 0; i < (op.sec_rw.len)/sizeof(uint64_t); i++)
((uint64_t*)data)[i] = pattern;
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
@@ -289,7 +290,7 @@ void* test_primary_read(int connect_fd, uint64_t inode, uint64_t offset, uint64_
op.rw.inode = inode;
op.rw.offset = offset;
op.rw.len = len;
void *data = memalign(MEM_ALIGNMENT, len);
void *data = memalign_or_die(MEM_ALIGNMENT, len);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
int r = read_blocking(connect_fd, reply.buf, OSD_PACKET_SIZE);
if (!check_reply(r, op, reply, len))
@@ -317,7 +318,7 @@ void test_primary_write(int connect_fd, uint64_t inode, uint64_t offset, uint64_
op.rw.inode = inode;
op.rw.offset = offset;
op.rw.len = len;
void *data = memalign(MEM_ALIGNMENT, len);
void *data = memalign_or_die(MEM_ALIGNMENT, len);
set_pattern(data, len, pattern);
write_blocking(connect_fd, op.buf, OSD_PACKET_SIZE);
write_blocking(connect_fd, data, len);
@@ -363,7 +364,7 @@ void test_list_stab(int connect_fd)
assert(check_reply(r, op, reply, -1));
int total_count = reply.hdr.retval;
int stable_count = reply.sec_list.stable_count;
obj_ver_id *data = (obj_ver_id*)malloc(total_count * sizeof(obj_ver_id));
obj_ver_id *data = (obj_ver_id*)malloc_or_die(total_count * sizeof(obj_ver_id));
assert(data);
assert(read_blocking(connect_fd, data, total_count * sizeof(obj_ver_id)) == (total_count * sizeof(obj_ver_id)));
int last_start = stable_count;
@@ -381,7 +382,7 @@ void test_list_stab(int connect_fd)
last_start = i;
}
}
obj_ver_id *data2 = (obj_ver_id*)malloc(sizeof(obj_ver_id) * 32);
obj_ver_id *data2 = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * 32);
assert(data2);
free(data2);
free(data);

395
src/test/ringloop_mock.cpp Normal file
View File

@@ -0,0 +1,395 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <random>
#include "ringloop_mock.h"
#include "malloc_or_die.h"
ring_loop_mock_t::ring_loop_mock_t(int qd, std::function<void(io_uring_sqe *)> submit_cb)
{
this->submit_cb = std::move(submit_cb);
sqes.resize(qd);
ring_datas.resize(qd);
free_ring_datas.reserve(qd);
submit_ring_datas.reserve(qd);
completed_ring_datas.reserve(qd);
for (size_t i = 0; i < ring_datas.size(); i++)
{
free_ring_datas.push_back(ring_datas.data() + i);
}
in_loop = false;
}
void ring_loop_mock_t::register_consumer(ring_consumer_t *consumer)
{
unregister_consumer(consumer);
consumers.push_back(consumer);
}
void ring_loop_mock_t::unregister_consumer(ring_consumer_t *consumer)
{
for (int i = 0; i < consumers.size(); i++)
{
if (consumers[i] == consumer)
{
consumers.erase(consumers.begin()+i, consumers.begin()+i+1);
break;
}
}
}
void ring_loop_mock_t::wakeup()
{
loop_again = true;
}
void ring_loop_mock_t::set_immediate(const std::function<void()> & cb)
{
immediate_queue.push_back(cb);
wakeup();
}
unsigned ring_loop_mock_t::space_left()
{
return free_ring_datas.size();
}
bool ring_loop_mock_t::has_work()
{
return loop_again;
}
bool ring_loop_mock_t::has_sendmsg_zc()
{
return false;
}
int ring_loop_mock_t::register_eventfd()
{
return -1;
}
io_uring_sqe* ring_loop_mock_t::get_sqe()
{
if (free_ring_datas.size() == 0)
{
return NULL;
}
ring_data_t *d = free_ring_datas.back();
free_ring_datas.pop_back();
submit_ring_datas.push_back(d);
io_uring_sqe *sqe = &sqes[d - ring_datas.data()];
*sqe = { 0 };
io_uring_sqe_set_data(sqe, d);
return sqe;
}
int ring_loop_mock_t::submit()
{
for (size_t i = 0; i < submit_ring_datas.size(); i++)
{
submit_cb(&sqes[submit_ring_datas[i] - ring_datas.data()]);
}
submit_ring_datas.clear();
return 0;
}
int ring_loop_mock_t::wait()
{
return 0;
}
unsigned ring_loop_mock_t::save()
{
return submit_ring_datas.size();
}
void ring_loop_mock_t::restore(unsigned sqe_tail)
{
while (submit_ring_datas.size() > sqe_tail)
{
free_ring_datas.push_back(submit_ring_datas.back());
submit_ring_datas.pop_back();
}
}
void ring_loop_mock_t::loop()
{
if (in_loop)
{
return;
}
in_loop = true;
submit();
while (completed_ring_datas.size())
{
ring_data_t *d = completed_ring_datas.back();
completed_ring_datas.pop_back();
if (d->callback)
{
struct ring_data_t dl;
dl.iov = d->iov;
dl.res = d->res;
dl.more = dl.prev = false;
dl.callback.swap(d->callback);
free_ring_datas.push_back(d);
dl.callback(&dl);
}
else
{
fprintf(stderr, "Warning: empty callback in SQE\n");
free_ring_datas.push_back(d);
}
}
do
{
loop_again = false;
for (int i = 0; i < consumers.size(); i++)
{
consumers[i]->loop();
if (immediate_queue.size())
{
immediate_queue2.swap(immediate_queue);
for (auto & cb: immediate_queue2)
cb();
immediate_queue2.clear();
}
}
} while (loop_again);
in_loop = false;
}
void ring_loop_mock_t::mark_completed(ring_data_t *data)
{
completed_ring_datas.push_back(data);
wakeup();
}
disk_mock_t::disk_mock_t(size_t size, bool buffered)
{
this->size = size;
this->data = (uint8_t*)malloc_or_die(size);
this->buffered = buffered;
}
disk_mock_t::~disk_mock_t()
{
discard_buffers(true, 0);
free(data);
}
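// Remove the parts of pending write-back buffers that overlap [begin, end), trimming buffers that only partially overlap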
void disk_mock_t::erase_buffers(uint64_t begin, uint64_t end)
{
for (auto it = buffers.upper_bound(begin); it != buffers.end(); )
{
const uint64_t bs = it->first - it->second.iov_len;
const uint64_t be = it->first;
if (bs >= end)
{
break;
}
if (bs >= begin && be <= end)
{
// Remove the whole buffer
buffers.erase(it++);
}
else if (bs < begin && be > end)
{
// Cut beginning & end & stop
uint8_t *ce = (uint8_t*)malloc_or_die(be-end);
memcpy(ce, it->second.iov_base + (end-bs), be-end);
uint8_t *cs = (uint8_t*)realloc(it->second.iov_base, begin-bs);
if (!cs)
throw std::bad_alloc();
buffers[begin] = (iovec){ .iov_base = cs, .iov_len = begin-bs };
buffers[be] = (iovec){ .iov_base = ce, .iov_len = be-end };
break;
}
else if (bs < begin)
{
// Cut beginning
uint8_t *cs = (uint8_t*)realloc(it->second.iov_base, begin-bs);
if (!cs)
throw std::bad_alloc();
buffers[begin] = (iovec){ .iov_base = cs, .iov_len = begin-bs };
buffers.erase(it++);
}
else
{
// Cut end & stop
assert(be > end);
uint8_t *ce = (uint8_t*)malloc_or_die(be-end);
memcpy(ce, it->second.iov_base + (end-bs), be-end);
buffers[be] = (iovec){ .iov_base = ce, .iov_len = be-end };
buffers.erase(it);
break;
}
}
}
void disk_mock_t::clear(size_t offset, size_t len)
{
if (offset < size)
{
memset(data+offset, 0, len < size-offset ? len : size-offset);
}
}
void disk_mock_t::discard_buffers(bool all, uint32_t seed)
{
if (trace)
printf("disk: discard buffers all=%d seed=%u\n", all, seed);
if (all)
{
for (auto & b: buffers)
free(b.second.iov_base);
buffers.clear();
}
else
{
std::mt19937 rnd(seed);
for (auto it = buffers.begin(); it != buffers.end(); )
{
if (rnd() < 0x80000000)
{
free(it->second.iov_base);
buffers.erase(it++);
}
else
it++;
}
}
}
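// Copy the iovecs of a WRITEV SQE into 'to' (which maps the disk starting at base_offset); returns the number of bytes accepted or -EINVAL if the write starts beyond the end of the disk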
ssize_t disk_mock_t::copy_from_sqe(io_uring_sqe *sqe, uint8_t *to, uint64_t base_offset)
{
size_t off = sqe->off;
iovec *v = (iovec*)sqe->addr;
size_t n = sqe->len;
for (size_t i = 0; i < n; i++)
{
if (off >= size)
{
off = sqe->off - EINVAL; // :D
break;
}
size_t cur = (off + v[i].iov_len > size ? size-off : v[i].iov_len);
if (trace)
printf("disk: write %zu+%zu from %jx\n", off, cur, (uint64_t)v[i].iov_base);
memcpy(to + off - base_offset, v[i].iov_base, cur);
off += v[i].iov_len;
}
return off - sqe->off;
}
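// Read [offset, offset+len) into 'to', taking ranges covered by pending write-back buffers from those buffers and the rest from the flushed disk image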
void disk_mock_t::read_item(uint8_t *to, uint64_t offset, uint64_t len)
{
uint64_t last = offset;
for (auto it = buffers.upper_bound(offset); it != buffers.end(); it++)
{
const uint64_t bs = it->first - it->second.iov_len;
const uint64_t be = it->first;
if (bs >= offset+len)
{
break;
}
if (last < bs)
{
// Fill the gap between buffers
memcpy(to+last-offset, data+last, bs-last);
last = bs;
}
if (last < offset)
{
last = offset;
}
uint64_t cur_end = be < offset+len ? be : offset+len;
memcpy(to+last-offset, it->second.iov_base+last-bs, cur_end-last);
last = be;
}
if (last < offset+len)
{
// Fill the gap in the end
memcpy(to+last-offset, data+last, offset+len-last);
}
}
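// Execute one SQE against the in-memory disk: READV merges buffered and flushed data, WRITEV updates the image directly (or a write-back buffer in buffered mode), FSYNC flushes all buffers; returns false for unsupported opcodes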
bool disk_mock_t::submit(io_uring_sqe *sqe)
{
ring_data_t *userdata = (ring_data_t*)sqe->user_data;
if (sqe->opcode == IORING_OP_READV)
{
size_t off = sqe->off;
iovec *v = (iovec*)sqe->addr;
size_t n = sqe->len;
for (size_t i = 0; i < n; i++)
{
if (off < size)
{
size_t cur = (off + v[i].iov_len > size ? size-off : v[i].iov_len);
if (trace)
printf("disk: read %zu+%zu to %jx\n", off, cur, (uint64_t)v[i].iov_base);
if (buffers.size())
read_item((uint8_t*)v[i].iov_base, off, cur);
else
memcpy(v[i].iov_base, data + off, cur);
}
off += v[i].iov_len;
}
userdata->res = off - sqe->off;
}
else if (sqe->opcode == IORING_OP_WRITEV)
{
uint64_t end = 0;
if (buffered)
{
// Remove overwritten parts of buffers
end = sqe->off;
for (uint32_t i = 0; i < sqe->len; i++)
{
end += ((iovec*)sqe->addr)[i].iov_len;
}
erase_buffers(sqe->off, end);
}
if (!buffered || (sqe->rw_flags & RWF_DSYNC))
{
// Simple "immediate" mode
userdata->res = copy_from_sqe(sqe, data, 0);
}
else
{
// Buffered mode
uint8_t *buf = (uint8_t*)malloc_or_die(end - sqe->off);
userdata->res = copy_from_sqe(sqe, buf, sqe->off);
if (userdata->res == -EINVAL)
free(buf);
else
buffers[end] = (iovec){ .iov_base = buf, .iov_len = end-sqe->off };
}
}
else if (sqe->opcode == IORING_OP_FSYNC)
{
if (trace)
printf("disk: fsync\n");
if (buffers.size())
{
for (auto & b: buffers)
{
memcpy(data + b.first - b.second.iov_len, b.second.iov_base, b.second.iov_len);
free(b.second.iov_base);
}
buffers.clear();
}
userdata->res = 0;
}
else
{
return false;
}
// Execution variability should also be introduced:
// 1) reads submitted in parallel to writes (not after completing the write) should return old or new data randomly
// 2) parallel operation completions should be delivered in random order
// 3) when fsync is enabled, write cache should be sometimes lost during a simulated power outage
return true;
}

61
src/test/ringloop_mock.h Normal file
View File

@@ -0,0 +1,61 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include "ringloop.h"
class ring_loop_mock_t: public ring_loop_i
{
std::vector<std::function<void()>> immediate_queue, immediate_queue2;
std::vector<ring_consumer_t*> consumers;
std::vector<io_uring_sqe> sqes;
std::vector<ring_data_t> ring_datas;
std::vector<ring_data_t *> free_ring_datas;
std::vector<ring_data_t *> submit_ring_datas;
std::vector<ring_data_t *> completed_ring_datas;
std::function<void(io_uring_sqe *)> submit_cb;
bool in_loop;
bool loop_again;
bool support_zc = false;
public:
ring_loop_mock_t(int qd, std::function<void(io_uring_sqe *)> submit_cb);
void register_consumer(ring_consumer_t *consumer);
void unregister_consumer(ring_consumer_t *consumer);
void wakeup();
void set_immediate(const std::function<void()> & cb);
unsigned space_left();
bool has_work();
bool has_sendmsg_zc();
int register_eventfd();
io_uring_sqe* get_sqe();
int submit();
int wait();
void loop();
unsigned save();
void restore(unsigned sqe_tail);
void mark_completed(ring_data_t *data);
};
class disk_mock_t
{
uint8_t *data = NULL;
std::map<uint64_t, iovec> buffers;
size_t size = 0;
bool buffered = false;
void erase_buffers(uint64_t begin, uint64_t end);
ssize_t copy_from_sqe(io_uring_sqe *sqe, uint8_t *to, uint64_t base_offset);
void read_item(uint8_t *to, uint64_t offset, uint64_t len);
public:
bool trace = false;
disk_mock_t(size_t size, bool buffered);
~disk_mock_t();
void clear(size_t offset, size_t len);
void discard_buffers(bool all, uint32_t seed);
bool submit(io_uring_sqe *sqe);
};

View File

@@ -2,127 +2,544 @@
// License: VNPL-1.1 (see README.md for details)
#include <malloc.h>
#include "blockstore.h"
#include "epoll_manager.h"
#include "str_util.h"
#include "ringloop_mock.h"
#include "blockstore_impl.h"
struct bs_test_t
{
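// Test fixture: runs a blockstore_impl_t on top of disk_mock_t devices and a ring_loop_mock_t, so the tests need no real disks or io_uring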
blockstore_config_t config;
disk_mock_t *data_disk = NULL;
disk_mock_t *meta_disk = NULL;
std::function<bool(io_uring_sqe*)> sqe_handler;
ring_loop_mock_t *ringloop = NULL;
timerfd_manager_t *tfd = NULL;
blockstore_impl_t *bs = NULL;
~bs_test_t()
{
destroy();
}
void destroy_bs()
{
if (bs)
{
delete bs;
bs = NULL;
}
}
void destroy()
{
while (bs && !bs->is_safe_to_stop())
ringloop->loop();
destroy_bs();
if (tfd)
{
delete tfd;
tfd = NULL;
}
if (meta_disk)
{
delete meta_disk;
meta_disk = NULL;
}
if (data_disk)
{
delete data_disk;
data_disk = NULL;
}
if (ringloop)
{
delete ringloop;
ringloop = NULL;
}
}
void default_cfg()
{
config["data_device"] = "./test_data.bin";
config["data_device_size"] = "1073741824";
config["data_device_sect"] = "4096";
config["meta_offset"] = "0";
config["journal_offset"] = "16777216";
config["data_offset"] = "33554432";
config["disable_data_fsync"] = "1";
config["immediate_commit"] = "all";
config["log_level"] = "10";
config["data_csum_type"] = "crc32c";
config["csum_block_size"] = "4096";
}
void init()
{
if (!ringloop)
{
ringloop = new ring_loop_mock_t(RINGLOOP_DEFAULT_SIZE, [&](io_uring_sqe *sqe)
{
if (sqe_handler && sqe_handler(sqe))
{
}
else if (sqe->fd == MOCK_DATA_FD)
{
bool ok = data_disk->submit(sqe);
assert(ok);
ringloop->mark_completed((ring_data_t*)sqe->user_data);
}
else if (sqe->fd == MOCK_META_FD)
{
bool ok = meta_disk->submit(sqe);
assert(ok);
ringloop->mark_completed((ring_data_t*)sqe->user_data);
}
else
{
assert(0);
}
});
}
if (!tfd)
{
tfd = new timerfd_manager_t(nullptr);
}
if (!data_disk)
{
data_disk = new disk_mock_t(parse_size(config["data_device_size"]), config["disable_data_fsync"] != "1");
data_disk->clear(0, parse_size(config["data_offset"]));
}
uint64_t meta_size = parse_size(config["meta_device_size"]);
if (meta_size && !meta_disk)
{
meta_disk = new disk_mock_t(meta_size, config["disable_meta_fsync"] != "1");
meta_disk->clear(0, meta_size);
}
if (!bs)
{
bs = new blockstore_impl_t(config, ringloop, tfd, true);
while (!bs->is_started())
ringloop->loop();
printf("blockstore initialized\n");
}
}
void exec_op(blockstore_op_t *op)
{
bool done = false;
op->callback = [&](blockstore_op_t *op)
{
printf("op opcode=%lu completed retval=%d\n", op->opcode, op->retval);
done = true;
};
bs->enqueue_op(op);
while (!done)
ringloop->loop();
op->callback = nullptr;
}
};
static bool memcheck(uint8_t *buf, uint8_t byte, size_t len)
{
for (size_t i = 0; i < len; i++)
if (buf[i] != byte)
return false;
return true;
}
static void test_simple()
{
printf("\n-- test_simple\n");
bs_test_t test;
test.default_cfg();
test.init();
// Write
blockstore_op_t op;
uint64_t version = 0;
op.opcode = BS_OP_WRITE;
op.oid = { .inode = 1, .stripe = 0 };
op.version = 1;
op.offset = 16384;
op.len = 4096;
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
memset(op.buf, 0xaa, 4096);
test.exec_op(&op);
assert(op.retval == op.len);
// Sync
printf("version %ju written, syncing\n", op.version);
version = op.version;
op.opcode = BS_OP_SYNC;
test.exec_op(&op);
assert(op.retval == 0);
// Commit
printf("commit version %ju\n", version);
op.opcode = BS_OP_STABLE;
op.len = 1;
*((obj_ver_id*)op.buf) = {
.oid = { .inode = 1, .stripe = 0 },
.version = version,
};
test.exec_op(&op);
assert(op.retval == 0);
// Read
printf("reading 0-128K\n");
op.opcode = BS_OP_READ;
op.oid = { .inode = 1, .stripe = 0 };
op.version = UINT64_MAX;
op.offset = 0;
op.len = 128*1024;
test.exec_op(&op);
assert(op.retval == op.len);
assert(op.version == 1);
uint8_t *cmp = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
memset(cmp, 0, 128*1024);
memset(cmp+16384, 0xaa, 4096);
if (memcmp(op.buf, cmp, 128*1024) == 0)
printf("read successful\n");
else
{
printf("read returned incorrect data\n");
abort();
}
// Zero-length read
printf("reading 0-0\n");
op.version = UINT64_MAX;
op.offset = 0;
op.len = 0;
test.exec_op(&op);
assert(op.retval == op.len);
assert(op.version == 1);
// Small read
printf("reading 16K-24K\n");
op.version = UINT64_MAX;
op.offset = 16*1024;
op.len = 8*1024;
test.exec_op(&op);
assert(op.retval == op.len);
assert(!memcmp(op.buf, cmp+16*1024, 8*1024));
free(cmp);
free(op.buf);
}
static void test_fsync(bool separate_meta)
{
printf("\n-- test_fsync%s\n", separate_meta ? " separate_meta" : "");
bs_test_t test;
test.default_cfg();
test.config["disable_data_fsync"] = "0";
test.config["immediate_commit"] = "none";
if (separate_meta)
{
test.config["meta_device"] = "./test_meta.bin";
test.config["disable_meta_fsync"] = "1";
test.config["meta_device_size"] = "33554432";
test.config["meta_device_sect"] = "4096";
test.config["data_offset"] = "0";
}
test.init();
// Write
printf("writing\n");
blockstore_op_t op;
op.opcode = BS_OP_WRITE;
op.oid = { .inode = 1, .stripe = 0 };
op.version = 1;
op.offset = 16384;
op.len = 4096;
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
memset(op.buf, 0xaa, 4096);
test.exec_op(&op);
assert(op.retval == op.len);
// Destroy and restart without sync
printf("destroying\n");
test.destroy_bs();
test.data_disk->discard_buffers(true, 0);
test.init();
// Check ENOENT
printf("checking for ENOENT\n");
blockstore_op_t op2;
op2.opcode = BS_OP_READ;
op2.oid = { .inode = 1, .stripe = 0 };
op2.version = UINT64_MAX;
op2.offset = 0;
op2.len = 128*1024;
op2.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
test.exec_op(&op2);
assert(op2.retval == -ENOENT);
// Write again
printf("writing again\n");
test.exec_op(&op);
assert(op.retval == op.len);
// Sync
printf("version %ju written, syncing\n", op.version);
op.opcode = BS_OP_SYNC;
test.exec_op(&op);
assert(op.retval == 0);
// Discard and restart again
printf("destroying again\n");
test.destroy_bs();
test.data_disk->discard_buffers(true, 0);
test.init();
// Check that it's present now
printf("checking for OK\n");
op2.version = UINT64_MAX;
test.exec_op(&op2);
assert(op2.retval == op2.len);
assert(is_zero(op2.buf, 16*1024));
assert(memcmp(op2.buf+16*1024, op.buf, 4*1024) == 0);
assert(is_zero(op2.buf+20*1024, 108*1024));
free(op.buf);
free(op2.buf);
}
static void test_intent_over_unstable()
{
printf("\n-- test_intent_over_unstable\n");
bs_test_t test;
test.default_cfg();
test.init();
// Write
printf("writing\n");
blockstore_op_t op;
op.opcode = BS_OP_WRITE;
op.oid = { .inode = 1, .stripe = 0 };
op.version = 1;
op.offset = 20480;
op.len = 4096;
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
memset(op.buf, 0xaa, 4096);
test.exec_op(&op);
assert(op.retval == op.len);
// Write again
printf("writing again\n");
op.version = 2;
op.offset = 28*1024;
test.exec_op(&op);
assert(op.retval == op.len);
free(op.buf);
}
static void test_padded_csum_intent(bool perfect)
{
printf("\n-- test_padded_csum_intent%s\n", perfect ? " perfect_csum_update" : "");
bs_test_t test;
test.default_cfg();
test.config["csum_block_size"] = "16384";
if (perfect)
test.config["perfect_csum_update"] = "1";
test.init();
// Write
printf("writing\n");
blockstore_op_t op;
op.opcode = BS_OP_WRITE;
op.oid = { .inode = 1, .stripe = 0 };
op.version = 1;
op.offset = 8192;
op.len = 4096;
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
memset(op.buf, 0xaa, 4096);
test.exec_op(&op);
assert(op.retval == op.len);
// Read
printf("reading\n");
blockstore_op_t op2;
op2.opcode = BS_OP_READ;
op2.oid = { .inode = 1, .stripe = 0 };
op2.version = UINT64_MAX;
op2.offset = 0;
op2.len = 128*1024;
op2.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
test.exec_op(&op2);
assert(op2.retval == op2.len);
assert(is_zero(op2.buf, 8*1024));
assert(memcmp(op2.buf+8*1024, op.buf, 4*1024) == 0);
assert(is_zero(op2.buf+12*1024, 116*1024));
// Write again (intent if not "perfect")
printf("writing (%s)\n", perfect ? "small" : "intent");
op.version = 2;
op.offset = 28*1024;
memset(op.buf, 0xbb, 4096);
test.exec_op(&op);
assert(op.retval == op.len);
// Write again (small because uncompactable)
printf("writing (small)\n");
op.version = 3;
op.offset = 60*1024;
memset(op.buf, 0xcc, 4096);
test.exec_op(&op);
assert(op.retval == op.len);
// Check that these are really big+intent+small writes
// (intent is not collapsible because of csum_block_size > bitmap_granularity)
heap_object_t *obj = test.bs->heap->read_entry((object_id){ .inode = 1, .stripe = 0 }, NULL);
assert(obj);
assert(obj->get_writes()->next());
assert(obj->get_writes()->next()->next());
assert(!obj->get_writes()->next()->next()->next());
assert(obj->get_writes()->entry_type == BS_HEAP_SMALL_WRITE);
assert(obj->get_writes()->next()->entry_type == (perfect ? BS_HEAP_SMALL_WRITE : BS_HEAP_INTENT_WRITE));
assert(obj->get_writes()->next()->next()->entry_type == BS_HEAP_BIG_WRITE);
// Commit
printf("commit version 3\n");
op.opcode = BS_OP_STABLE;
op.len = 1;
*((obj_ver_id*)op.buf) = {
.oid = { .inode = 1, .stripe = 0 },
.version = 3,
};
test.exec_op(&op);
assert(op.retval == 0);
assert(test.bs->heap->get_compact_queue_size());
// Trigger & wait compaction
test.bs->flusher->request_trim();
while (test.bs->heap->get_compact_queue_size())
test.ringloop->loop();
while (test.bs->flusher->is_active())
test.ringloop->loop();
test.bs->flusher->release_trim();
// Check that compaction succeeded
assert(!test.bs->heap->get_to_compact_count());
// Read again and check
printf("reading compacted\n");
op2.version = UINT64_MAX;
test.exec_op(&op2);
assert(op2.retval == op2.len);
assert(memcheck(op2.buf, 0, 8*1024));
assert(memcheck(op2.buf+8*1024, 0xaa, 4*1024));
assert(memcheck(op2.buf+12*1024, 0, 16*1024));
assert(memcheck(op2.buf+28*1024, 0xbb, 4*1024));
assert(memcheck(op2.buf+32*1024, 0, 28*1024));
assert(memcheck(op2.buf+60*1024, 0xcc, 4*1024));
assert(memcheck(op2.buf+64*1024, 0, 64*1024));
obj = test.bs->heap->read_entry((object_id){ .inode = 1, .stripe = 0 }, NULL);
assert(!obj->get_writes()->next());
free(op.buf);
free(op2.buf);
}
static void test_padded_csum_parallel_read(bool perfect, uint32_t offset)
{
printf("\n-- test_padded_csum_parallel_read%s offset=%u\n", perfect ? " perfect_csum_update" : "", offset);
bs_test_t test;
test.default_cfg();
test.config["csum_block_size"] = "16384";
test.config["atomic_write_size"] = "0";
if (perfect)
test.config["perfect_csum_update"] = "1";
test.init();
// Write
printf("writing (initial)\n");
blockstore_op_t op;
op.opcode = BS_OP_WRITE_STABLE;
op.oid = { .inode = 1, .stripe = 0 };
op.version = 1;
op.offset = 8192;
op.len = 16384;
op.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 16384);
memset(op.buf, 0xaa, 16384);
test.exec_op(&op);
assert(op.retval == op.len);
// Write 2
printf("writing (%u+%u)\n", offset, 4096);
op.version = 2;
op.offset = offset;
op.len = 4096;
memset(op.buf, 0xbb, 4096);
test.exec_op(&op);
assert(op.retval == op.len);
// Trigger & wait compaction
test.bs->flusher->request_trim();
std::vector<ring_data_t*> flush_writes;
test.sqe_handler = [&](io_uring_sqe *sqe)
{
if (sqe->fd == MOCK_DATA_FD && sqe->opcode == IORING_OP_WRITEV &&
sqe->off >= test.bs->dsk.data_offset)
{
bool ok = test.data_disk->submit(sqe);
assert(ok);
flush_writes.push_back((ring_data_t*)sqe->user_data);
return true;
}
return false;
};
// Wait for the first flusher write: it is executed against the mock disk, but its completion is held back
while (test.bs->heap->get_compact_queue_size() && flush_writes.size() < 1)
test.ringloop->loop();
while (test.bs->flusher->is_active() && flush_writes.size() < 1)
test.ringloop->loop();
// Run a read operation in parallel - it shouldn't complain about checksum errors
printf("reading in parallel\n");
blockstore_op_t op2;
op2.opcode = BS_OP_READ;
op2.oid = { .inode = 1, .stripe = 0 };
op2.version = 1;
op2.offset = 0;
op2.len = 128*1024;
op2.buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 128*1024);
test.exec_op(&op2);
assert(op2.retval == op2.len);
// Continue flushing
test.sqe_handler = NULL;
for (auto & w: flush_writes)
test.ringloop->mark_completed(w);
flush_writes.clear();
while (test.bs->heap->get_compact_queue_size() && flush_writes.size() < 2)
test.ringloop->loop();
while (test.bs->flusher->is_active() && flush_writes.size() < 2)
test.ringloop->loop();
test.bs->flusher->release_trim();
// Check that compaction succeeded
assert(!test.bs->heap->get_to_compact_count());
free(op.buf);
free(op2.buf);
}
int main(int narg, char *args[])
{
blockstore_config_t config;
config["meta_device"] = "./test_meta.bin";
config["journal_device"] = "./test_journal.bin";
config["data_device"] = "./test_data.bin";
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);
blockstore_op_t op;
int main_state = 0;
uint64_t version = 0;
ring_consumer_t main_cons;
op.callback = [&](blockstore_op_t *op)
{
printf("op completed %d\n", op->retval);
if (main_state == 1)
main_state = 2;
else if (main_state == 3)
main_state = 4;
else if (main_state == 5)
main_state = 6;
else if (main_state == 7)
main_state = 8;
else if (main_state == 9)
main_state = 10;
};
main_cons.loop = [&]()
{
if (main_state == 0)
{
if (bs->is_started())
{
printf("init completed\n");
op.opcode = BS_OP_WRITE;
op.oid = { .inode = 1, .stripe = 0 };
op.version = 0;
op.offset = 16384;
op.len = 4096;
op.buf = (uint8_t*)memalign(512, 128*1024);
memset(op.buf, 0xaa, 4096);
bs->enqueue_op(&op);
main_state = 1;
}
}
else if (main_state == 2)
{
printf("version %ju written, syncing\n", op.version);
version = op.version;
op.opcode = BS_OP_SYNC;
bs->enqueue_op(&op);
main_state = 3;
}
else if (main_state == 4)
{
printf("stabilizing version %ju\n", version);
op.opcode = BS_OP_STABLE;
op.len = 1;
*((obj_ver_id*)op.buf) = {
.oid = { .inode = 1, .stripe = 0 },
.version = version,
};
bs->enqueue_op(&op);
main_state = 5;
}
else if (main_state == 6)
{
printf("stabilizing version %ju\n", version);
op.opcode = BS_OP_STABLE;
op.len = 1;
*((obj_ver_id*)op.buf) = {
.oid = { .inode = 1, .stripe = 0 },
.version = version,
};
bs->enqueue_op(&op);
main_state = 7;
}
else if (main_state == 8)
{
printf("reading 0-128K\n");
op.opcode = BS_OP_READ;
op.oid = { .inode = 1, .stripe = 0 };
op.version = UINT64_MAX;
op.offset = 0;
op.len = 128*1024;
bs->enqueue_op(&op);
main_state = 9;
}
else if (main_state == 10)
{
void *cmp = memalign(512, 128*1024);
memset(cmp, 0, 128*1024);
memset(cmp+16384, 0xaa, 4096);
int ok = 1;
for (int i = 0; i < 128*1024; i += 4096)
{
if (memcmp(cmp+i, op.buf+i, 4096) != 0)
{
printf("bitmap works incorrectly, bytes %d - %d differ (%02x, should be %02x)\n", i, i+4096, ((uint8_t*)op.buf)[i], ((uint8_t*)cmp)[i]);
ok = 0;
}
}
if (ok)
printf("bitmap works correctly\n");
free(cmp);
main_state = 11;
}
};
ringloop->register_consumer(&main_cons);
while (1)
{
ringloop->loop();
ringloop->wait();
}
delete bs;
delete epmgr;
delete ringloop;
test_simple();
test_fsync(false);
test_fsync(true);
test_intent_over_unstable();
test_padded_csum_intent(false);
test_padded_csum_intent(true);
test_padded_csum_parallel_read(false, 8192);
test_padded_csum_parallel_read(true, 8192);
test_padded_csum_parallel_read(false, 16384);
test_padded_csum_parallel_read(true, 16384);
return 0;
}

src/test/test_heap.cpp (Normal file, 1676 lines)
File diff suppressed because it is too large

View File

@@ -8,6 +8,11 @@
#pragma GCC visibility push(default)
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
inline void* memalign_or_die(size_t alignment, size_t size)
{
void *buf = memalign(alignment, size);
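The hunk above is cut off right after the allocation call; a minimal sketch of how such a die-on-failure wrapper typically completes (the error message and abort path are assumptions, not necessarily the committed code):

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

inline void* memalign_or_die(size_t alignment, size_t size)
{
    void *buf = memalign(alignment, size);
    if (!buf)
    {
        // Assumed behaviour: callers such as memalign_or_die(MEM_ALIGNMENT, 128*1024)
        // never check for NULL, so allocation failure has to be fatal here
        fprintf(stderr, "Failed to allocate %zu bytes\n", size);
        abort();
    }
    return buf;
}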

View File

@@ -192,7 +192,7 @@ void ring_loop_t::restore(unsigned sqe_tail)
ring.sq.sqe_tail = sqe_tail;
}
int ring_loop_t::sqes_left()
unsigned ring_loop_t::space_left()
{
struct io_uring_sq *sq = &ring.sq;
unsigned int head = io_uring_smp_load_acquire(sq->khead);

View File

@@ -14,6 +14,7 @@
#include <string>
#include <functional>
#include <vector>
#include <map>
#include <mutex>
#define RINGLOOP_DEFAULT_SIZE 1024
@@ -32,7 +33,27 @@ struct ring_consumer_t
std::function<void(void)> loop;
};
class __attribute__((visibility("default"))) ring_loop_t
class __attribute__((visibility("default"))) ring_loop_i
{
public:
virtual ~ring_loop_i() = default;
virtual void register_consumer(ring_consumer_t *consumer) = 0;
virtual void unregister_consumer(ring_consumer_t *consumer) = 0;
virtual int register_eventfd() = 0;
virtual io_uring_sqe* get_sqe() = 0;
virtual void set_immediate(const std::function<void()> & cb) = 0;
virtual int submit() = 0;
virtual int wait() = 0;
virtual unsigned space_left() = 0;
virtual bool has_work() = 0;
virtual bool has_sendmsg_zc() = 0;
virtual void loop() = 0;
virtual void wakeup() = 0;
virtual unsigned save() = 0;
virtual void restore(unsigned sqe_tail) = 0;
};
class __attribute__((visibility("default"))) ring_loop_t: public ring_loop_i
{
std::vector<std::function<void()>> immediate_queue, immediate_queue2;
std::vector<ring_consumer_t*> consumers;
@@ -54,7 +75,7 @@ public:
int register_eventfd();
io_uring_sqe* get_sqe();
inline void set_immediate(const std::function<void()> cb)
inline void set_immediate(const std::function<void()> & cb)
{
immediate_queue.push_back(cb);
wakeup();
@@ -68,11 +89,7 @@ public:
struct io_uring_cqe *cqe;
return io_uring_wait_cqe(&ring, &cqe);
}
int sqes_left();
inline unsigned space_left()
{
return free_ring_data_ptr;
}
unsigned space_left();
inline bool has_work()
{
return loop_again;
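Extracting ring_loop_i as a pure-virtual interface makes the event loop swappable, which the test harness earlier in this diff appears to rely on (its mock intercepts SQEs instead of submitting them to the kernel). A rough sketch of such a stub; the class name, header path and behaviour are illustrative assumptions, not the project's actual mock:

#include <liburing.h>
#include <deque>
#include <vector>
#include <algorithm>
#include <functional>
#include "ringloop.h" // assumed header providing ring_loop_i and ring_consumer_t

// Hypothetical in-memory stand-in for ring_loop_t, usable in unit tests
class stub_ring_loop_t: public ring_loop_i
{
    std::deque<io_uring_sqe> sqes; // deque keeps references stable across push_back
    std::vector<ring_consumer_t*> consumers;
    std::vector<std::function<void()>> immediates;
public:
    void register_consumer(ring_consumer_t *consumer) override { consumers.push_back(consumer); }
    void unregister_consumer(ring_consumer_t *consumer) override
    {
        consumers.erase(std::remove(consumers.begin(), consumers.end(), consumer), consumers.end());
    }
    int register_eventfd() override { return -1; }
    io_uring_sqe* get_sqe() override { sqes.push_back(io_uring_sqe{}); return &sqes.back(); }
    void set_immediate(const std::function<void()> & cb) override { immediates.push_back(cb); }
    int submit() override { int n = (int)sqes.size(); sqes.clear(); return n; }
    int wait() override { return 0; }
    unsigned space_left() override { return 1024; }
    bool has_work() override { return !immediates.empty(); }
    bool has_sendmsg_zc() override { return false; }
    void loop() override
    {
        std::vector<std::function<void()>> q;
        q.swap(immediates);
        for (auto & cb: q)
            cb();
        for (auto consumer: consumers)
            consumer->loop();
    }
    void wakeup() override {}
    unsigned save() override { return (unsigned)sqes.size(); }
    void restore(unsigned sqe_tail) override { sqes.resize(sqe_tail); } // drop SQEs queued after save()
};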

src/util/robin_hood.h (Normal file, 2544 lines)
File diff suppressed because it is too large

View File

@@ -30,7 +30,7 @@ std::string base64_encode(const std::string &in)
return out;
}
static char T[256] = { 0 };
static int T[256] = { 0 };
std::string base64_decode(const std::string &in)
{

View File

@@ -4,6 +4,7 @@
#include <sys/timerfd.h>
#include <sys/poll.h>
#include <sys/epoll.h>
#include <assert.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
@@ -15,21 +16,27 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
{
this->set_fd_handler = set_fd_handler;
wait_state = 0;
timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
if (timerfd < 0)
if (set_fd_handler)
{
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
if (timerfd < 0)
{
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
}
set_fd_handler(timerfd, false, [this](int fd, int events)
{
handle_readable();
});
}
set_fd_handler(timerfd, false, [this](int fd, int events)
{
handle_readable();
});
}
timerfd_manager_t::~timerfd_manager_t()
{
set_fd_handler(timerfd, false, NULL);
close(timerfd);
if (timerfd >= 0)
{
set_fd_handler(timerfd, false, NULL);
close(timerfd);
}
}
void timerfd_manager_t::inc_timer(timerfd_timer_t & t)
@@ -52,7 +59,14 @@ int timerfd_manager_t::set_timer_us(uint64_t micros, bool repeat, std::function<
{
int timer_id = id++;
timespec start;
clock_gettime(CLOCK_MONOTONIC, &start);
if (timerfd >= 0)
{
clock_gettime(CLOCK_MONOTONIC, &start);
}
else
{
start = cur;
}
timers.push_back({
.id = timer_id,
.micros = micros,
@@ -101,7 +115,7 @@ again:
{
nearest = -1;
itimerspec exp = {};
if (timerfd_settime(timerfd, 0, &exp, NULL))
if (timerfd >= 0 && timerfd_settime(timerfd, 0, &exp, NULL))
{
throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
}
@@ -120,7 +134,14 @@ again:
}
}
timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
if (timerfd >= 0)
{
clock_gettime(CLOCK_MONOTONIC, &now);
}
else
{
now = cur;
}
itimerspec exp = {
.it_interval = { 0 },
.it_value = timers[nearest].next,
@@ -142,7 +163,7 @@ again:
}
exp.it_value = { .tv_sec = 0, .tv_nsec = 1 };
}
if (timerfd_settime(timerfd, 0, &exp, NULL))
if (timerfd >= 0 && timerfd_settime(timerfd, 0, &exp, NULL))
{
throw std::runtime_error(std::string("timerfd_settime: ") + strerror(errno));
}
@@ -178,3 +199,13 @@ void timerfd_manager_t::trigger_nearest()
nearest = -1;
cb(nearest_id);
}
void timerfd_manager_t::tick(timespec passed)
{
assert(timerfd == -1);
cur.tv_sec += passed.tv_sec;
cur.tv_nsec += passed.tv_nsec;
cur.tv_sec += (cur.tv_nsec / 1000000000);
cur.tv_nsec = (cur.tv_nsec % 1000000000);
set_nearest(true);
}

View File

@@ -19,11 +19,12 @@ struct timerfd_timer_t
class __attribute__((visibility("default"))) timerfd_manager_t
{
int wait_state = 0;
int timerfd;
int timerfd = -1;
int nearest = -1;
int id = 1;
int onstack = 0;
std::vector<timerfd_timer_t> timers;
timespec cur = {};
void inc_timer(timerfd_timer_t & t);
void set_nearest(bool trigger_inline);
@@ -37,4 +38,5 @@ public:
int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
int set_timer_us(uint64_t micros, bool repeat, std::function<void(int)> callback);
void clear_timer(int timer_id);
void tick(timespec passed);
};
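Taken together, the timerfd = -1 default, the guarded timerfd_settime() calls and the new tick() method let timerfd_manager_t run on purely virtual time when it is constructed without an fd handler, which is convenient for deterministic tests. A hedged usage sketch; the header name and the exact moment the callback fires are assumptions based on the diff above:

#include <assert.h>
#include <time.h>
#include "timerfd_manager.h" // assumed header name

void simulate_virtual_timers()
{
    // No set_fd_handler => no kernel timerfd is created, time only advances via tick()
    timerfd_manager_t tfd(nullptr);
    bool fired = false;
    tfd.set_timer(100 /* ms */, false, [&](int timer_id) { fired = true; });
    // Advance virtual time by 150 ms; the pending 100 ms timer should now expire
    timespec step = {};
    step.tv_nsec = 150*1000*1000;
    tfd.tick(step);
    assert(fired);
}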

src/util/wyhash.h (Normal file, 237 lines)

@@ -0,0 +1,237 @@
// Copied from https://github.com/martinus/unordered_dense, version 4.5.0
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// Copyright (c) 2022-2024 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
#pragma once
#include <cstdint> // for uint64_t, uint32_t, uint8_t, UINT64_C
#include <cstring> // for size_t, memcpy, memset
#include <functional> // for equal_to, hash
#include <memory> // for allocator, allocator_traits, shared_ptr
#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
# define ANKERL_UNORDERED_DENSE_LIKELY(x) __builtin_expect(x, 1) // NOLINT(cppcoreguidelines-macro-usage)
# define ANKERL_UNORDERED_DENSE_UNLIKELY(x) __builtin_expect(x, 0) // NOLINT(cppcoreguidelines-macro-usage)
#else
# define ANKERL_UNORDERED_DENSE_LIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage)
# define ANKERL_UNORDERED_DENSE_UNLIKELY(x) (x) // NOLINT(cppcoreguidelines-macro-usage)
#endif
// This is a stripped-down implementation of wyhash: https://github.com/wangyi-fudan/wyhash
// No big-endian support (because different values on different machines don't matter),
// hardcodes seed and the secret, reformats the code, and clang-tidy fixes.
namespace wyhash {
namespace detail {
inline void mum(uint64_t* a, uint64_t* b) {
# if defined(__SIZEOF_INT128__)
__uint128_t r = *a;
r *= *b;
*a = static_cast<uint64_t>(r);
*b = static_cast<uint64_t>(r >> 64U);
# elif defined(_MSC_VER) && defined(_M_X64)
*a = _umul128(*a, *b, b);
# else
uint64_t ha = *a >> 32U;
uint64_t hb = *b >> 32U;
uint64_t la = static_cast<uint32_t>(*a);
uint64_t lb = static_cast<uint32_t>(*b);
uint64_t hi{};
uint64_t lo{};
uint64_t rh = ha * hb;
uint64_t rm0 = ha * lb;
uint64_t rm1 = hb * la;
uint64_t rl = la * lb;
uint64_t t = rl + (rm0 << 32U);
auto c = static_cast<uint64_t>(t < rl);
lo = t + (rm1 << 32U);
c += static_cast<uint64_t>(lo < t);
hi = rh + (rm0 >> 32U) + (rm1 >> 32U) + c;
*a = lo;
*b = hi;
# endif
}
// multiply and xor mix function, aka MUM
inline auto mix(uint64_t a, uint64_t b) -> uint64_t {
mum(&a, &b);
return a ^ b;
}
// read functions. WARNING: we don't care about endianness, so results are different on big endian!
inline auto r8(const uint8_t* p) -> uint64_t {
uint64_t v{};
std::memcpy(&v, p, 8U);
return v;
}
inline auto r4(const uint8_t* p) -> uint64_t {
uint32_t v{};
std::memcpy(&v, p, 4);
return v;
}
// reads 1, 2, or 3 bytes
inline auto r3(const uint8_t* p, size_t k) -> uint64_t {
return (static_cast<uint64_t>(p[0]) << 16U) | (static_cast<uint64_t>(p[k >> 1U]) << 8U) | p[k - 1];
}
inline auto hash(void const* key, size_t len) -> uint64_t {
static uint64_t secret[4] = {UINT64_C(0xa0761d6478bd642f),
UINT64_C(0xe7037ed1a0b428db),
UINT64_C(0x8ebc6af09c88c6e3),
UINT64_C(0x589965cc75374cc3)};
auto const* p = static_cast<uint8_t const*>(key);
uint64_t seed = secret[0];
uint64_t a{};
uint64_t b{};
if (ANKERL_UNORDERED_DENSE_LIKELY(len <= 16)) {
if (ANKERL_UNORDERED_DENSE_LIKELY(len >= 4)) {
a = (r4(p) << 32U) | r4(p + ((len >> 3U) << 2U));
b = (r4(p + len - 4) << 32U) | r4(p + len - 4 - ((len >> 3U) << 2U));
} else if (ANKERL_UNORDERED_DENSE_LIKELY(len > 0)) {
a = r3(p, len);
b = 0;
} else {
a = 0;
b = 0;
}
} else {
size_t i = len;
if (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 48)) {
uint64_t see1 = seed;
uint64_t see2 = seed;
do {
seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
see1 = mix(r8(p + 16) ^ secret[2], r8(p + 24) ^ see1);
see2 = mix(r8(p + 32) ^ secret[3], r8(p + 40) ^ see2);
p += 48;
i -= 48;
} while (ANKERL_UNORDERED_DENSE_LIKELY(i > 48));
seed ^= see1 ^ see2;
}
while (ANKERL_UNORDERED_DENSE_UNLIKELY(i > 16)) {
seed = mix(r8(p) ^ secret[1], r8(p + 8) ^ seed);
i -= 16;
p += 16;
}
a = r8(p + i - 16);
b = r8(p + i - 8);
}
return mix(secret[1] ^ len, mix(a ^ secret[1], b ^ seed));
}
inline auto hash(uint64_t x) -> uint64_t {
return mix(x, UINT64_C(0x9E3779B97F4A7C15));
}
} // namespace detail
template <typename T, typename Enable = void>
struct hash {
auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))
-> uint64_t {
return std::hash<T>{}(obj);
}
};
template <typename T>
struct hash<T, typename std::hash<T>::is_avalanching> {
using is_avalanching = void;
auto operator()(T const& obj) const noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>())))
-> uint64_t {
return std::hash<T>{}(obj);
}
};
template <typename CharT>
struct hash<std::basic_string<CharT>> {
using is_avalanching = void;
auto operator()(std::basic_string<CharT> const& str) const noexcept -> uint64_t {
return detail::hash(str.data(), sizeof(CharT) * str.size());
}
};
template <class T>
struct hash<T*> {
using is_avalanching = void;
auto operator()(T* ptr) const noexcept -> uint64_t {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
return detail::hash(reinterpret_cast<uintptr_t>(ptr));
}
};
template <class T>
struct hash<std::unique_ptr<T>> {
using is_avalanching = void;
auto operator()(std::unique_ptr<T> const& ptr) const noexcept -> uint64_t {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
return detail::hash(reinterpret_cast<uintptr_t>(ptr.get()));
}
};
template <class T>
struct hash<std::shared_ptr<T>> {
using is_avalanching = void;
auto operator()(std::shared_ptr<T> const& ptr) const noexcept -> uint64_t {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
return detail::hash(reinterpret_cast<uintptr_t>(ptr.get()));
}
};
template <typename Enum>
struct hash<Enum, typename std::enable_if<std::is_enum<Enum>::value>::type> {
using is_avalanching = void;
auto operator()(Enum e) const noexcept -> uint64_t {
using underlying = typename std::underlying_type_t<Enum>;
return detail::hash(static_cast<underlying>(e));
}
};
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
# define ANKERL_UNORDERED_DENSE_HASH_STATICCAST(T) \
template <> \
struct hash<T> { \
using is_avalanching = void; \
auto operator()(T const& obj) const noexcept -> uint64_t { \
return detail::hash(static_cast<uint64_t>(obj)); \
} \
}
# if defined(__GNUC__) && !defined(__clang__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wuseless-cast"
# endif
// see https://en.cppreference.com/w/cpp/utility/hash
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(bool);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(signed char);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned char);
# if ANKERL_UNORDERED_DENSE_CPP_VERSION >= 202002L && defined(__cpp_char8_t)
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char8_t);
# endif
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char16_t);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(char32_t);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(wchar_t);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(short);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned short);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(int);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned int);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(long long);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long);
ANKERL_UNORDERED_DENSE_HASH_STATICCAST(unsigned long long);
# undef ANKERL_UNORDERED_DENSE_HASH_STATICCAST
# undef ANKERL_UNORDERED_DENSE_LIKELY
# undef ANKERL_UNORDERED_DENSE_UNLIKELY
# if defined(__GNUC__) && !defined(__clang__)
# pragma GCC diagnostic pop
# endif
} // namespace wyhash
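For reference, a small usage sketch of the vendored hasher (the include path is an assumption; this snippet is not part of the commit):

#include <stdio.h>
#include <string>
#include "wyhash.h" // assumed include path (src/util/wyhash.h)

int main()
{
    std::string key = "inode_1:stripe_0";
    // The std::basic_string specialization hashes the raw character bytes
    uint64_t h1 = wyhash::hash<std::string>{}(key);
    // The integer overload mixes the value with a fixed 64-bit constant
    uint64_t h2 = wyhash::detail::hash((uint64_t)42);
    printf("%016llx %016llx\n", (unsigned long long)h1, (unsigned long long)h2);
    return 0;
}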

View File

@@ -70,6 +70,8 @@ TEST_NAME=local_read POOLCFG='"local_reads":"random",' ./test_heal.sh
SCHEME=ec ./test_heal.sh
ANTIETCD=1 ./test_heal.sh
./test_reweight_half.sh
TEST_NAME=csum_32k_dmj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_32k_dj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_32k OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh

View File

@@ -25,26 +25,33 @@ done
for i in $(seq 1 $OSD_COUNT); do
offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin)
opts=$(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin)
meta_offset=$(echo $offsets | jq -r .meta_offset)
data_offset=$(echo $offsets | jq -r .data_offset)
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
#build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
#build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached $opts >./testdata/meta_before_resize.json
new_data_offset=$((128*1024*1024+data_offset%131072))
build/src/disk_tool/vitastor-disk raw-resize --io cached \
$(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \
$opts \
--new_meta_offset 0 \
--new_meta_len $((1024*1024)) \
--new_journal_offset $((1024*1024)) \
--new_data_offset $((128*1024*1024+32768))
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
--new_data_offset $new_data_offset
#build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached $opts \
--meta_offset 0 \
--meta_len $((1024*1024)) \
--journal_offset $((1024*1024)) \
--data_offset $new_data_offset >./testdata/meta_after_resize.json
if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \
jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
jq -e -s 'map([ .entries[] | del(.block, .writes[].location) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
format_error "OSD $i metadata corrupted after resizing"
fi
if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
format_error "OSD $i journal corrupted after resizing"
fi
#if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
# jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
# format_error "OSD $i journal corrupted after resizing"
#fi
done
$ETCDCTL del --prefix /vitastor/osd/state/
@@ -54,7 +61,7 @@ for i in $(seq 1 $OSD_COUNT); do
--data_device ./testdata/bin/test_osd$i.bin \
--meta_offset 0 \
--journal_offset $((1024*1024)) \
--data_offset $((128*1024*1024+32768)) >>./testdata/osd$i.log 2>&1 &
--data_offset $new_data_offset >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!
done

View File

@@ -15,7 +15,7 @@ trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1 $LOOP2"' || true' EXIT
# also test prepare --hybrid :)
# non-vitastor random type UUID to prevent udev activation
mount | grep '/dev type devtmpfs' || sudo mount udev /dev/ -t devtmpfs
sudo build/src/disk_tool/vitastor-disk-test prepare --no_init 1 --meta_reserve 1x,1M \
sudo build/src/disk_tool/vitastor-disk-test prepare --meta_format 2 --no_init 1 --meta_reserve 1x,1M \
--block_size 131072 --osd_num 987654 --part_type_uuid 0df42ae0-3695-4395-a957-7d5ff3645c56 \
--hybrid --fast-devices $LOOP2 $LOOP1

tests/test_reweight_half.sh (Executable file, 41 lines)

@@ -0,0 +1,41 @@
#!/bin/bash -ex
. `dirname $0`/common.sh
node mon/mon-main.js $MON_PARAMS --etcd_address $ETCD_URL --etcd_prefix "/vitastor" >>./testdata/mon.log 2>&1 &
MON_PID=$!
wait_etcd
TIME=$(date '+%s')
$ETCDCTL put /vitastor/osd/stats/1 '{"host":"host1","size":1073741824,"time":"'$TIME'"}'
$ETCDCTL put /vitastor/osd/stats/2 '{"host":"host1","size":1073741824,"time":"'$TIME'"}'
$ETCDCTL put /vitastor/osd/stats/3 '{"host":"host2","size":1073741824,"time":"'$TIME'"}'
$ETCDCTL put /vitastor/osd/stats/4 '{"host":"host2","size":1073741824,"time":"'$TIME'"}'
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL create-pool testpool -s 2 -n 16 --force
sleep 2
# check that all OSDs have 8 PGs
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 1 or .[1] == 1) ] | length) == 8'
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 2 or .[1] == 2) ] | length) == 8'
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 3 or .[1] == 3) ] | length) == 8'
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 4 or .[1] == 4) ] | length) == 8'
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL modify-osd --reweight 0.5 3
sleep 2
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 1 or .[1] == 1) ] | length) == 8'
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 2 or .[1] == 2) ] | length) == 8'
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 3 or .[1] == 3) ] | length) <= 6'
$ETCDCTL get /vitastor/pg/config --print-value-only | \
jq -s -e '([ .[0].items["1"] | .[].osd_set | map_values(. | tonumber) | select(.[0] == 4 or .[1] == 4) ] | length) >= 10'
format_green OK

View File

@@ -7,6 +7,8 @@ if [[ ("$SCHEME" = "" || "$SCHEME" = "replicated") && ("$PG_SIZE" = "" || "$PG_S
OSD_COUNT=2
fi
OSD_ARGS="--scrub_list_limit 1000 $OSD_ARGS"
. `dirname $0`/run_3osds.sh
check_qemu