144 changed files with 2294 additions and 6917 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,19 +0,0 @@
-.git
-build
-packages
-mon/node_modules
-*.o
-*.so
-osd
-stub_osd
-stub_uring_osd
-stub_bench
-osd_test
-dump_journal
-nbd_proxy
-rm_inode
-fio
-qemu
-rpm/*.Dockerfile
-debian/*.Dockerfile
-Dockerfile
--- a/.gitignore
+++ b/.gitignore
@@ -1,18 +0,0 @@
-*.o
-*.so
-package-lock.json
-fio
-qemu
-osd
-stub_osd
-stub_uring_osd
-stub_bench
-osd_test
-osd_peering_pg_test
-dump_journal
-nbd_proxy
-rm_inode
-test_allocator
-test_blockstore
-test_shit
-osd_rmw_test
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +0,0 @@
-cmake_minimum_required(VERSION 2.8)
-
-project(vitastor)
-
-add_subdirectory(src)
--- a/LICENSE
+++ b/LICENSE
@@ -1,27 +0,0 @@
-Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
-
-All server-side code (OSD, Monitor and so on) is licensed under the terms of
-Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
-GNU GPLv3.0 with the additional "Network Interaction" clause which requires
-opensourcing all programs directly or indirectly interacting with Vitastor
-through a computer network and expressly designed to be used in conjunction
-with it ("Proxy Programs"). Proxy Programs may be made public not only under
-the terms of the same license, but also under the terms of any GPL-Compatible
-Free Software License, as listed by the Free Software Foundation.
-This is a stricter copyleft license than the Affero GPL.
-
-Please note that VNPL doesn't require you to open the code of proprietary
-software running inside a VM if it's not specially designed to be used with
-Vitastor.
-
-Basically, you can't use the software in a proprietary environment to provide
-its functionality to users without opensourcing all intermediary components
-standing between the user and Vitastor or purchasing a commercial license
-from the author 😀.
-
-Client libraries (cluster_client and so on) are dual-licensed under the same
-VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
-software like QEMU and fio.
-
-You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
-GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).
--- a/Make-gen.pl
+++ b/Make-gen.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+
+# Prints one Makefile compile rule per *.cpp file, built from `#include "..."` lines.
+use strict;
+use warnings;
+
+# $deps->{file}->{header} = 1 for every quoted include that grep reports as "file:#include ..."
+my $deps = {};
+my @matches = split /\n/, `grep '^#include "' *.cpp *.h`;
+# grep exits with 1 when nothing matched; any other failure means the scan is incomplete
+warn "grep failed (status $?), dependency list may be incomplete\n" if $? != 0 && $? >> 8 != 1;
+for my $line (@matches)
+{
+    $deps->{$1}->{$2} = 1 if $line =~ /^([^:]+):\#include "([^"]+)"/s;
+}
+
+# Pull in indirect includes, repeating whole passes until one adds nothing new
+my $added;
+do
+{
+    $added = 0;
+    for my $file (keys %$deps)
+    {
+        for my $dep (keys %{$deps->{$file}})
+        {
+            # includes that grep did not scan have no entry of their own
+            next if !$deps->{$dep};
+            for my $subdep (keys %{$deps->{$dep}})
+            {
+                next if $deps->{$file}->{$subdep};
+                $added = 1;
+                $deps->{$file}->{$subdep} = 1;
+            }
+        }
+    }
+} while ($added);
+
+# Sorted output keeps the generated rules stable between runs
+for my $file (sort keys %$deps)
+{
+    next if $file !~ /\.cpp$/;
+    my $obj = $file;
+    $obj =~ s/\.cpp$/.o/s;
+    print "$obj: $file ".join(" ", sort keys %{$deps->{$file}})."\n";
+    print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
+}
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,187 @@
+# Builds the blockstore library, the OSD, the fio/QEMU/NBD clients and the test tools.
+#
+# Every link command names its -l libraries after the objects: a linker running
+# with --as-needed drops a library that appears before any object that needs it.
+
+BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
+	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
+# Add -fsanitize=address here to build with AddressSanitizer
+CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
+all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
+clean:
+	rm -f *.o
+# all and clean are commands, not files
+.PHONY: all clean
+
+dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
+	g++ $(CXXFLAGS) -o $@ $< crc32c.o
+
+libblockstore.so: $(BLOCKSTORE_OBJS)
+	g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
+libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
+	g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
+
+OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
+	osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o osd_ops.o pg_states.o \
+	osd_rmw.o json11.o base64.o timerfd_manager.o epoll_manager.o
+osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
+	g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
+
+stub_osd: stub_osd.o rw_blocking.o
+	g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal
+
+osd_rmw_test: osd_rmw_test.o
+	g++ $(CXXFLAGS) -o $@ osd_rmw_test.o
+
+STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
+stub_uring_osd: $(STUB_URING_OSD_OBJS)
+	g++ $(CXXFLAGS) -o $@ $(STUB_URING_OSD_OBJS) -ltcmalloc_minimal -luring
+stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o $@ stub_bench.cpp rw_blocking.o -ltcmalloc_minimal
+osd_test: osd_test.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o $@ osd_test.cpp rw_blocking.o -ltcmalloc_minimal
+osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
+
+libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o
+	g++ $(CXXFLAGS) -shared -o $@ fio_sec_osd.o rw_blocking.o -ltcmalloc_minimal
+
+# Objects shared by all users of cluster_client (fio engine, nbd_proxy, rm_inode, QEMU driver)
+FIO_CLUSTER_OBJS := cluster_client.o epoll_manager.o etcd_state_client.o \
+	messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o osd_ops.o pg_states.o timerfd_manager.o base64.o
+libfio_cluster.so: fio_cluster.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -shared -o $@ $< $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+
+nbd_proxy: nbd_proxy.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -o $@ $< $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+
+rm_inode: rm_inode.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -o $@ $< $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+
+# C source compiled with gcc; include paths point into ./qemu, glib flags come from pkg-config
+qemu_driver.o: qemu_driver.c qemu_proxy.h
+	gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
+		-I qemu/include $(CXXFLAGS) -c -o $@ $<
+
+# Each object is named once: listing qemu_driver.o via $< as well would link it twice
+qemu_driver.so: qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -shared -o $@ qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+
+test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
+	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
+test: test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
+test_allocator: test_allocator.cpp allocator.o
+	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o
+
+crc32c.o: crc32c.c crc32c.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+json11.o: json11/json11.cpp
+	g++ $(CXXFLAGS) -c -o json11.o json11/json11.cpp
+
+# Autogenerated compile rules, one per .cpp file (format matches the output of Make-gen.pl)
+
+allocator.o: allocator.cpp allocator.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+base64.o: base64.cpp base64.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_engine.o: fio_engine.cpp blockstore.h fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h json11/json11.hpp object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_sec_osd.o: fio_sec_osd.cpp fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+messenger.o: messenger.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+msgr_receive.o: msgr_receive.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+msgr_send.o: msgr_send.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+nbd_proxy.o: nbd_proxy.cpp cluster_client.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_ops.o: osd_ops.cpp object_id.h osd_id.h osd_ops.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw.o: osd_rmw.cpp malloc_or_die.h object_id.h osd_id.h osd_rmw.h xor.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw_test.o: osd_rmw_test.cpp malloc_or_die.h object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+pg_states.o: pg_states.cpp pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+qemu_proxy.o: qemu_proxy.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h qemu_proxy.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+ringloop.o: ringloop.cpp ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+rm_inode.o: rm_inode.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+rw_blocking.o: rw_blocking.cpp rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test_allocator.o: test_allocator.cpp allocator.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
--- a/README-ru.md
+++ b/README-ru.md
@@ -1,491 +0,0 @@
-## Vitastor
-
-[Read English version](README.md)
-
-## Идея
-
-Я всего лишь хочу сделать качественную блочную SDS!
-
-Vitastor - распределённая блочная SDS, прямой аналог Ceph RBD и внутренних СХД популярных
-облачных провайдеров. Однако, в отличие от них, Vitastor быстрый и при этом простой.
-Только пока маленький :-).
-
-Архитектурная схожесть с Ceph означает заложенную на уровне алгоритмов записи строгую консистентность,
-репликацию через первичный OSD, симметричную кластеризацию без единой точки отказа
-и автоматическое распределение данных по любому числу дисков любого размера с настраиваемыми схемами
-избыточности - репликацией или с произвольными кодами коррекции ошибок.
-
-## Возможности
-
-Vitastor на данный момент находится в статусе предварительного выпуска, расширенные
-возможности пока отсутствуют, а в будущих версиях вероятны "ломающие" изменения.
-
-Однако следующее уже реализовано:
-
-- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
-- Производительность ;-D
-- Несколько схем отказоустойчивости: репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
-  Рида-Соломона на основе библиотеки jerasure с любым числом дисков данных и чётности в группе
-- Конфигурация через простые человекочитаемые JSON-структуры в etcd
-- Автоматическое распределение данных по OSD, с поддержкой:
-  - Математической оптимизации для лучшей равномерности распределения и минимизации перемещений данных
-  - Нескольких пулов с разными схемами избыточности
-  - Дерева распределения, выбора OSD по тегам / классам устройств (только SSD, только HDD) и по поддереву
-  - Настраиваемых доменов отказа (диск/сервер/стойка и т.п.)
-- Восстановление деградированных блоков
-- Ребаланс, то есть перемещение данных между OSD (дисками)
-- Поддержка "ленивого" fsync (fsync не на каждую операцию)
-- Сбор статистики ввода/вывода в etcd
-- Клиентская библиотека режима пользователя для ввода/вывода
-- Драйвер диска для QEMU (собирается вне дерева исходников QEMU)
-- Драйвер диска для утилиты тестирования производительности fio (также собирается вне дерева исходников fio)
-- NBD-прокси для монтирования образов ядром ("блочное устройство в режиме пользователя")
-- Утилита удаления образов/инодов (vitastor-rm)
-- Пакеты для Debian и CentOS
-- Статистика операций ввода/вывода и занятого места в разрезе инодов
-- Именование инодов через хранение их метаданных в etcd
-- Снапшоты и copy-on-write клоны
-
-## Планы разработки
-
-- Более корректные скрипты разметки дисков и автоматического запуска OSD
-- Другие инструменты администрирования
-- Плагины для OpenStack, Kubernetes, OpenNebula, Proxmox и других облачных систем
-- iSCSI-прокси
-- Таймауты операций и более быстрое выявление отказов
-- Фоновая проверка целостности без контрольных сумм (сверка реплик)
-- Контрольные суммы
-- Оптимизации для гибридных SSD+HDD хранилищ
-- Поддержка RDMA и NVDIMM
-- Web-интерфейс
-- Возможно, сжатие
-- Возможно, поддержка кэширования данных через системный page cache
-
-## Архитектура
-
-Так же, как и в Ceph, в Vitastor:
-
-- Есть пулы (pools), PG, OSD, мониторы, домены отказа, дерево распределения (аналог crush-дерева).
-- Образы делятся на блоки фиксированного размера (объекты), и эти объекты распределяются по OSD.
-- У OSD есть журнал и метаданные и они тоже могут размещаться на отдельных быстрых дисках.
-- Все операции записи тоже транзакционны. В Vitastor, правда, есть режим отложенного/ленивого fsync
-  (коммита), в котором fsync не вызывается на каждую операцию записи, что делает его более
-  пригодным для использования на "плохих" (десктопных) SSD. Однако все операции записи
-  в любом случае атомарны.
-- Клиентская библиотека тоже старается ждать восстановления после любого отказа кластера, то есть,
-  вы тоже можете перезагрузить хоть весь кластер разом, и клиенты только на время зависнут,
-  но не отключатся.
-
-Некоторые базовые термины для тех, кто не знаком с Ceph:
-
-- OSD (Object Storage Daemon) - процесс, который хранит данные на одном диске и обрабатывает
-  запросы чтения/записи от клиентов.
-- Пул (Pool) - контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD.
-- PG (Placement Group) - группа объектов, хранимых на одном и том же наборе реплик (OSD).
-  Несколько PG могут храниться на одном и том же наборе реплик, но объекты одной PG
-  в норме не хранятся на разных наборах OSD.
-- Монитор - демон, хранящий состояние кластера.
-- Домен отказа (Failure Domain) - группа OSD, которым вы разрешаете "упасть" всем вместе.
-  Иными словами, это группа OSD, в которые СХД не помещает разные копии одного и того же
-  блока данных. Например, если домен отказа - сервер, то на двух дисках одного сервера
-  никогда не окажется 2 и более копий одного и того же блока данных, а значит, даже
-  если в этом сервере откажут все диски, это будет равносильно потере только 1 копии
-  любого блока данных.
-- Дерево распределения (Placement Tree / CRUSH Tree) - иерархическая группировка OSD
-  в узлы, которые далее можно использовать как домены отказа. То есть, диск (OSD) входит в
-  сервер, сервер входит в стойку, стойка входит в ряд, ряд в датацентр и т.п.
-
-Чем Vitastor отличается от Ceph:
-
-- Vitastor в первую очередь сфокусирован на SSD. Также Vitastor, вероятно, должен неплохо работать
-  с комбинацией SSD и HDD через bcache, а в будущем, возможно, будут добавлены и нативные способы
-  оптимизации под SSD+HDD. Однако хранилище на основе одних лишь жёстких дисков, вообще без SSD,
-  не в приоритете, поэтому оптимизации под этот кейс могут вообще не состояться.
-- OSD Vitastor однопоточный и всегда таким останется, так как это самый оптимальный способ работы.
-  Если вам не хватает 1 ядра на 1 диск, просто делите диск на разделы и запускайте на нём несколько OSD.
-  Но, скорее всего, вам хватит и 1 ядра - Vitastor не так прожорлив к ресурсам CPU, как Ceph.
-- Журнал и метаданные всегда размещаются в памяти, благодаря чему никогда не тратится лишнее время
-  на чтение метаданных с диска. Размер метаданных линейно зависит от размера диска и блока данных,
-  который задаётся в конфигурации кластера и по умолчанию составляет 128 КБ. С блоком 128 КБ метаданные
-  занимают примерно 512 МБ памяти на 1 ТБ дискового пространства (и это всё равно меньше, чем нужно Ceph-у).
-  Журнал вообще не должен быть большим, например, тесты производительности в данном документе проводились
-  с журналом размером всего 16 МБ. Большой журнал, вероятно, даже вреден, т.к. "грязные" записи (записи,
-  не сброшенные из журнала) тоже занимают память и могут немного замедлять работу.
-- В Vitastor нет внутреннего copy-on-write. Я считаю, что реализация CoW-хранилища гораздо сложнее,
-  поэтому сложнее добиться устойчиво хороших результатов. Возможно, в один прекрасный день
-  я придумаю красивый алгоритм для CoW-хранилища, но пока нет - внутреннего CoW в Vitastor не будет.
-  Всё это не относится к "внешнему" CoW (снапшотам и клонам).
-- Базовый слой Vitastor - простое блочное хранилище с блоками фиксированного размера, а не сложное
-  объектное хранилище с расширенными возможностями, как в Ceph (RADOS).
-- В Vitastor есть режим "ленивых fsync", в котором OSD группирует запросы записи перед сбросом их
-  на диск, что позволяет получить лучшую производительность с дешёвыми настольными SSD без конденсаторов
-  ("Advanced Power Loss Protection" / "Capacitor-Based Power Loss Protection").
-  Тем не менее, такой режим всё равно медленнее использования нормальных серверных SSD и мгновенного
-  fsync, так как приводит к дополнительным операциям передачи данных по сети, поэтому рекомендуется
-  всё-таки использовать хорошие серверные диски, тем более, стоят они почти так же, как десктопные.
-- PG эфемерны. Это означает, что они не хранятся на дисках и существуют только в памяти работающих OSD.
-- Процессы восстановления оперируют отдельными объектами, а не целыми PG.
-- PGLOG-ов нет.
-- "Мониторы" не хранят данные. Конфигурация и состояние кластера хранятся в etcd в простых человекочитаемых
-  JSON-структурах. Мониторы Vitastor только следят за состоянием кластера и управляют перемещением данных.
-  В этом смысле монитор Vitastor не является критичным компонентом системы и больше похож на Ceph-овский
-  менеджер (MGR). Монитор Vitastor написан на node.js.
-- Распределение PG не основано на консистентных хешах. Вместо этого все маппинги PG хранятся прямо в etcd
-  (ибо нет никакой проблемы сохранить несколько сотен-тысяч записей в памяти, а не считать каждый раз хеши).
-  Перераспределение PG по OSD выполняется через математическую оптимизацию,
-  а конкретно, сведение задачи к ЛП (задаче линейного программирования) и решение оной с помощью утилиты
-  lp_solve. Такой подход позволяет обычно выравнивать распределение места почти идеально - равномерность
-  обычно составляет 96-99%, в отличие от Ceph, где на голом CRUSH-е без балансировщика обычно выходит 80-90%.
-  Также это позволяет минимизировать объём перемещения данных и случайность связей между OSD, а также менять
-  распределение вручную, не боясь сломать логику перебалансировки. В таком подходе есть и потенциальный
-  недостаток - есть предположение, что в очень большом кластере он может сломаться - однако вплоть до
-  нескольких сотен OSD подход точно работает нормально. Ну и, собственно, при необходимости легко
-  реализовать и консистентные хеши.
-- Отдельный слой, подобный слою "CRUSH-правил", отсутствует. Вы настраиваете схемы отказоустойчивости,
-  домены отказа и правила выбора OSD напрямую в конфигурации пулов.
-
-## Понимание сути производительности систем хранения
-
-Вкратце: для быстрой хранилки задержки важнее, чем пиковые iops-ы.
-
-Лучшая возможная задержка достигается при тестировании в 1 поток с глубиной очереди 1,
-что приблизительно означает минимально нагруженное состояние кластера. В данном случае
-IOPS = 1/задержка. Ни числом серверов, ни дисков, ни серверных процессов/потоков
-задержка не масштабируется... Она зависит только от того, насколько быстро один
-серверный процесс (и клиент) обрабатывают одну операцию.
-
-Почему задержки важны? Потому, что некоторые приложения *не могут* использовать глубину
-очереди больше 1, ибо их задача не параллелизуется. Важный пример - это все СУБД
-с поддержкой консистентности (ACID), потому что все они обеспечивают её через
-журналирование, а журналы пишутся последовательно и с fsync() после каждой операции.
-
-fsync, кстати - это ещё одна очень важная вещь, про которую почти всегда забывают в тестах.
-Смысл в том, что все современные диски имеют кэши/буферы записи и не гарантируют, что
-данные реально физически записываются на носитель до того, как вы делаете fsync(),
-который транслируется в команду сброса кэша операционной системой.
-
-Дешёвые SSD для настольных ПК и ноутбуков очень быстрые без fsync - NVMe диски, например,
-могут обработать порядка 80000 операций записи в секунду с глубиной очереди 1 без fsync.
-Однако с fsync, когда они реально вынуждены писать каждый блок данных во флеш-память,
-они выжимают лишь 1000-2000 операций записи в секунду (число практически постоянное
-для всех моделей SSD).
-
-Серверные SSD часто имеют суперконденсаторы, работающие как встроенный источник
-бесперебойного питания и дающие дискам успеть сбросить их DRAM-кэш в постоянную
-флеш-память при отключении питания. Благодаря этому диски с чистой совестью
-*игнорируют fsync*, так как точно знают, что данные из кэша доедут до постоянной
-памяти.
-
-Все наиболее известные программные СХД, например, Ceph и внутренние СХД, используемые
-такими облачными провайдерами, как Amazon, Google, Яндекс, медленные в смысле задержки.
-В лучшем случае они дают задержки от 0.3мс на чтение и 0.6мс на запись 4 КБ блоками
-даже при условии использования наилучшего возможного железа.
-
-И это в эпоху SSD, когда вы можете пойти на рынок и купить там SSD, задержка которого
-на чтение будет 0.1мс, а на запись - 0.04мс, за 100$ или даже дешевле.
-
-Когда мне нужно быстро протестировать производительность дисковой подсистемы, я
-использую следующие 6 команд, с небольшими вариациями:
-
-- Линейная запись:
-  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=write -runtime=60 -filename=/dev/sdX`
-- Линейное чтение:
-  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=read -runtime=60 -filename=/dev/sdX`
-- Запись в 1 поток (T1Q1):
-  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/sdX`
-- Чтение в 1 поток (T1Q1):
-  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -rw=randread -runtime=60 -filename=/dev/sdX`
-- Параллельная запись (numjobs используется, когда 1 ядро CPU не может насытить диск):
-  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randwrite -runtime=60 -filename=/dev/sdX`
-- Параллельное чтение (numjobs - аналогично):
-  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randread -runtime=60 -filename=/dev/sdX`
-
-## Теоретическая максимальная производительность Vitastor
-
-При использовании репликации:
-- Задержка чтения в 1 поток (T1Q1): 1 сетевой RTT + 1 чтение с диска.
-- Запись+fsync в 1 поток:
-  - С мгновенным сбросом: 2 RTT + 1 запись.
-  - С отложенным ("ленивым") сбросом: 4 RTT + 1 запись + 1 fsync.
-- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
-- Параллельная запись: сумма IOPS всех дисков / число реплик / WA либо производительность сети, если в сеть упрётся раньше.
-
-При использовании кодов коррекции ошибок (EC):
-- Задержка чтения в 1 поток (T1Q1): 1.5 RTT + 1 чтение.
-- Запись+fsync в 1 поток:
-  - С мгновенным сбросом: 3.5 RTT + 1 чтение + 2 записи.
-  - С отложенным ("ленивым") сбросом: 5.5 RTT + 1 чтение + 2 записи + 2 fsync.
-- Под 0.5 на самом деле подразумевается (k-1)/k, где k - число дисков данных,
-  что означает, что дополнительное обращение по сети не нужно, когда операция
-  чтения обслуживается локально.
-- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
-- Параллельная запись: сумма IOPS всех дисков / общее число дисков данных и чётности / WA либо производительность сети, если в сеть упрётся раньше.
-  Примечание: IOPS дисков в данном случае надо брать в смешанном режиме чтения/записи в пропорции, аналогичной формулам выше.
-
-WA (мультипликатор записи) для 4 КБ блоков в Vitastor обычно составляет 3-5:
-1. Запись метаданных в журнал
-2. Запись блока данных в журнал
-3. Запись метаданных в БД
-4. Ещё одна запись метаданных в журнал при использовании EC
-5. Запись блока данных на диск данных
-
-Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
-то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
-
-Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
-нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
-образом запрашивается fsync.
-
-## Пример сравнения с Ceph
-
-Железо - 4 сервера, в каждом:
- 6x SATA SSD Intel D3-4510 3.84 TB
- 2x Xeon Gold 6242 (16 cores @ 2.8 GHz)
- 384 GB RAM
- 1x 25 GbE сетевая карта (Mellanox ConnectX-4 LX), подключённая к свитчу Juniper QFX5200
-
-Экономия энергии CPU отключена. В тестах и Vitastor, и Ceph развёрнуто по 2 OSD на 1 SSD.
-
-Все результаты ниже относятся к случайной нагрузке 4 КБ блоками (если явно не указано обратное).
-
-Производительность голых дисков:
- T1Q1 запись ~27000 iops (задержка ~0.037ms)
- T1Q1 чтение ~9800 iops (задержка ~0.101ms)
- T1Q32 запись ~60000 iops
- T1Q32 чтение ~81700 iops
-
-Ceph 15.2.4 (Bluestore):
- T1Q1 запись ~1000 iops (задержка ~1ms)
- T1Q1 чтение ~1750 iops (задержка ~0.57ms)
- T8Q64 запись ~100000 iops, потребление CPU процессами OSD около 40 ядер на каждом сервере
- T8Q64 чтение ~480000 iops, потребление CPU процессами OSD около 40 ядер на каждом сервере
-
-Тесты в 8 потоков проводились на 8 400GB RBD образах со всех хостов (с каждого хоста запускалось 2 процесса fio).
-Это нужно потому, что в Ceph несколько RBD-клиентов, пишущих в 1 образ, очень сильно замедляются.
-
-Настройки RocksDB и Bluestore в Ceph не менялись, единственным изменением было отключение cephx_sign_messages.
-
-На самом деле, результаты теста не такие уж и плохие для Ceph (могло быть хуже).
-Собственно говоря, эти серверы как раз хорошо сбалансированы для Ceph - 6 SATA SSD как раз
-утилизируют 25-гигабитную сеть, а без 2 мощных процессоров Ceph-у бы не хватило ядер,
-чтобы выдать пристойный результат. Собственно, что и показывает жор 40 ядер в процессе
-параллельного теста.
-
-Vitastor:
- T1Q1 запись: 7087 iops (задержка 0.14ms)
- T1Q1 чтение: 6838 iops (задержка 0.145ms)
- T2Q64 запись: 162000 iops, потребление CPU - 3 ядра на каждом сервере
- T8Q64 чтение: 895000 iops, потребление CPU - 4 ядра на каждом сервере
- Линейная запись (4M T1Q32): 2800 МБ/с
- Линейное чтение (4M T1Q32): 1500 МБ/с
-
-Тест на чтение в 8 потоков проводился на 1 большом образе (3.2 ТБ) со всех хостов (опять же, по 2 fio с каждого).
-В Vitastor никакой разницы между 1 образом и 8-ю нет. Естественно, примерно 1/4 запросов чтения
-в такой конфигурации, как и в тестах Ceph выше, обслуживалась с локальной машины. Если проводить
-тест так, чтобы все операции всегда обращались к первичным OSD по сети - тест сильнее упирался
-в сеть и результат составлял примерно 689000 iops.
-
-Настройки Vitastor: `--disable_data_fsync true --immediate_commit all --flusher_count 8
-  --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096
-  --journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024
-  --journal_size 16777216`.
-
-### EC/XOR 2+1
-
-Vitastor:
- T1Q1 запись: 2808 iops (задержка ~0.355ms)
- T1Q1 чтение: 6190 iops (задержка ~0.16ms)
- T2Q64 запись: 85500 iops, потребление CPU - 3.4 ядра на каждом сервере
- T8Q64 чтение: 812000 iops, потребление CPU - 4.7 ядра на каждом сервере
- Линейная запись (4M T1Q32): 3200 МБ/с
- Линейное чтение (4M T1Q32): 1800 МБ/с
-
-Ceph:
- T1Q1 запись: 730 iops (задержка ~1.37ms)
- T1Q1 чтение: 1500 iops с холодным кэшем метаданных (задержка ~0.66ms), 2300 iops через 2 минуты прогрева (задержка ~0.435ms)
- T4Q128 запись (4 RBD images): 45300 iops, потребление CPU - 30 ядер на каждом сервере
- T8Q64 чтение (4 RBD images): 278600 iops, потребление CPU - 40 ядер на каждом сервере
- Линейная запись (4M T1Q32): 1950 МБ/с в пустой образ, 2500 МБ/с в заполненный образ
- Линейное чтение (4M T1Q32): 2400 МБ/с
-
-### NBD
-
-NBD - на данный момент единственный способ монтировать Vitastor ядром Linux, но он
-приводит к дополнительным копированиям данных, поэтому немного ухудшает производительность,
-правда, в основном - линейную, а случайная затрагивается слабо.
-
-NBD расшифровывается как "сетевое блочное устройство", но на самом деле оно также
-работает просто как аналог FUSE для блочных устройств, то есть, представляет собой
-"блочное устройство в пространстве пользователя".
-
-Vitastor с однопоточной NBD прокси на том же стенде:
- T1Q1 запись: 6000 iops (задержка 0.166ms)
- T1Q1 чтение: 5518 iops (задержка 0.18ms)
- T1Q128 запись: 94400 iops
- T1Q128 чтение: 103000 iops
- Линейная запись (4M T1Q128): 1266 МБ/с (в сравнении с 2800 МБ/с через fio)
- Линейное чтение (4M T1Q128): 975 МБ/с (в сравнении с 1500 МБ/с через fio)
-
-## Установка
-
-### Debian
-
- Добавьте ключ репозитория Vitastor:
-  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
-  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
-  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Для Debian 10 (Buster) также включите репозиторий backports:
-  `deb http://deb.debian.org/debian buster-backports main`
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
-
-### CentOS
-
- Добавьте в систему репозиторий Vitastor:
-  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
-  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- Включите EPEL: `yum/dnf install epel-release`
- Включите дополнительные репозитории CentOS:
-  - CentOS 7: `yum install centos-release-scl`
-  - CentOS 8: `dnf install centos-release-advanced-virtualization`
- Включите elrepo-kernel:
-  - CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
-  - CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Установите пакеты: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
-
-### Установка из исходников
-
- Установите ядро 5.4 или более новое, для поддержки io_uring. Желательно 5.8 или даже новее,
-  так как в 5.4 есть как минимум 1 известный баг, ведущий к зависанию с io_uring и контроллером HP SmartArray.
- Установите liburing 0.4 или более новый и его заголовки.
- Установите lp_solve.
- Установите etcd. Внимание: вам нужна версия с исправлением отсюда: https://github.com/vitalif/etcd/,
-  из ветки release-3.4, так как в etcd есть баг, который [будет](https://github.com/etcd-io/etcd/pull/12402)
-  исправлен только в 3.4.15. Баг приводит к неспособности Vitastor запустить PG, когда их хотя бы 500 штук.
- Установите node.js 10 или новее.
- Установите gcc и g++ 8.x или новее.
- Склонируйте данный репозиторий с подмодулями: `git clone https://yourcmc.ru/git/vitalif/vitastor/`.
- Желательно пересобрать QEMU с патчем, который делает необязательным запуск через LD_PRELOAD.
-  См. `qemu-*.*-vitastor.patch` - выберите версию, наиболее близкую вашей версии QEMU.
- Установите QEMU 3.0 или новее, возьмите исходные коды установленного пакета, начните его пересборку,
-  через некоторое время остановите её и скопируйте следующие заголовки:
-   - `<qemu>/include` &rarr; `<vitastor>/qemu/include`
-   - Debian:
-      * Берите qemu из основного репозитория
-      * `<qemu>/b/qemu/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
-      * `<qemu>/b/qemu/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
-   - CentOS 8:
-      * Берите qemu из репозитория Advanced-Virtualization. Чтобы включить его, запустите
-        `yum install centos-release-advanced-virtualization.noarch` и далее `yum install qemu`
-      * `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
-      * Для QEMU 3.0+: `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
-      * Для QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
-   - `config-host.h` и `qapi` нужны, т.к. в них содержатся автогенерируемые заголовки
- Установите fio 3.7 или новее, возьмите исходники пакета и сделайте на них симлинк с `<vitastor>/fio`.
- Соберите и установите Vitastor командой `mkdir build && cd build && cmake .. && make -j8 && make install`.
-  Обратите внимание на переменную cmake `QEMU_PLUGINDIR` - под RHEL её нужно установить равной `qemu-kvm`.
-
-## Запуск
-
-Внимание: процедура пока что достаточно нетривиальная, задавать конфигурацию и смещения
-на диске нужно почти вручную. Это будет исправлено в ближайшем будущем.
-
- Желательны SATA SSD или NVMe диски с конденсаторами (серверные SSD). Можно использовать и
-  десктопные SSD, включив режим отложенного fsync, но производительность однопоточной записи
-  в этом случае пострадает.
- Быстрая сеть, минимум 10 гбит/с
- Для наилучшей производительности нужно отключить энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- Пропишите нужные вам значения вверху файлов `/usr/lib/vitastor/mon/make-units.sh` и `/usr/lib/vitastor/mon/make-osd.sh`.
- Создайте юниты systemd для etcd и мониторов: `/usr/lib/vitastor/mon/make-units.sh`
- Создайте юниты для OSD: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
- Вы можете поменять параметры OSD в юнитах systemd. Смысл некоторых параметров:
-  - `disable_data_fsync 1` - отключает fsync, используется с SSD с конденсаторами.
-  - `immediate_commit all` - используется с SSD с конденсаторами.
-  - `disable_device_lock 1` - отключает блокировку файла устройства, нужно, только если вы запускаете
-    несколько OSD на одном блочном устройстве.
-  - `flusher_count 256` - "flusher" - микропоток, удаляющий старые данные из журнала.
-    Не волнуйтесь об этой настройке, 256 теперь достаточно практически всегда.
-  - `disk_alignment`, `journal_block_size`, `meta_block_size` следует установить равными размеру
-    внутреннего блока SSD. Это почти всегда 4096.
-  - `journal_no_same_sector_overwrites true` запрещает перезапись одного и того же сектора журнала подряд
-    много раз в процессе записи. Большинство (99%) SSD не нуждаются в данной опции. Однако выяснилось, что
-    диски, используемые на одном из тестовых стендов - Intel D3-S4510 - очень сильно не любят такую
-    перезапись, и для них была добавлена эта опция. Когда данный режим включён, также нужно поднимать
-    значение `journal_sector_buffer_count`, так как иначе Vitastor не хватит буферов для записи в журнал.
- Запустите все etcd: `systemctl start etcd`
- Создайте глобальную конфигурацию в etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
-  (если все ваши диски - серверные с конденсаторами).
- Создайте пулы: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
-  Для jerasure EC-пулов конфигурация должна выглядеть так: `2:{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
- Запустите все OSD: `systemctl start vitastor.target`
- Ваш кластер должен быть готов - один из мониторов должен уже сконфигурировать PG, а OSD должны запустить их.
- Вы можете проверить состояние PG прямо в etcd: `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. Все PG должны быть 'active'.
- Пример команды для запуска тестов: `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
- Пример команды для заливки образа ВМ в vitastor через qemu-img:
-  ```
-  qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
-  ```
-  Если вы используете немодифицированный QEMU, данной команде потребуется переменная окружения `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so`.
- Пример команды запуска QEMU:
-  ```
-  qemu-system-x86_64 -enable-kvm -m 1024
-    -drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
-    -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
-    -vnc 0.0.0.0:0
-  ```
- Пример команды удаления образа (инода) из Vitastor:
-  ```
-  vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
-  ```
-
-## Известные проблемы
-
- Запросы удаления объектов могут в данный момент приводить к "неполным" объектам в EC-пулах,
-  если в процессе удаления произойдут отказы OSD или серверов, потому что правильная обработка
-  запросов удаления в кластере должна быть "трёхфазной", а это пока не реализовано. Если вы
-  столкнётесь с такой ситуацией, просто повторите запрос удаления.
-
-## Принципы реализации
-
- Я люблю архитектурно простые решения. Vitastor проектируется именно так и я намерен
-  и далее следовать данному принципу.
- Если вы пришли сюда за идеальным кодом на C++, вы, вероятно, не по адресу. "Общепринятые"
-  практики написания C++ кода меня не очень волнуют, так как зачастую, опять-таки, ведут к
-  излишним усложнениям и код получается красивый... но медленный.
- По той же причине в коде иногда можно встретить велосипеды типа собственного упрощённого
-  HTTP-клиента для работы с etcd. Зато эти велосипеды маленькие и компактные и не требуют
-  использования десятка внешних библиотек.
- node.js для монитора - не случайный выбор. Он очень быстрый, имеет встроенную событийную
-  машину, приятный нейтральный C-подобный язык программирования и развитую инфраструктуру.
-
-## Автор и лицензия
-
-Автор: Виталий Филиппов (vitalif [at] yourcmc.ru), 2019+
-
-Заходите в Telegram-чат Vitastor: https://t.me/vitastor
-
-Лицензия: VNPL 1.1 на серверный код и двойная VNPL 1.1 + GPL 2.0+ на клиентский.
-
-VNPL - "сетевой копилефт", собственная свободная копилефт-лицензия
-Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с дополнительным
-условием "Сетевого взаимодействия", требующим распространять все программы,
-специально разработанные для использования вместе с Vitastor и взаимодействующие
-с ним по сети, под лицензией VNPL или под любой другой свободной лицензией.
-
-Идея VNPL - расширение действия копилефта не только на модули, явным образом
-связываемые с кодом Vitastor, но также на модули, оформленные в виде микросервисов
-и взаимодействующие с ним по сети.
-
-Таким образом, если вы хотите построить на основе Vitastor сервис, содержащий
-компоненты с закрытым кодом, взаимодействующие с Vitastor, вам нужна коммерческая
-лицензия от автора 😀.
-
-На Windows и любое другое ПО, не разработанное *специально* для использования
-вместе с Vitastor, никакие ограничения не накладываются.
-
-Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.1
-и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
-совместимости с таким ПО, как QEMU и fio.
-
-Вы можете найти полный текст VNPL 1.1 в файле [VNPL-1.1.txt](VNPL-1.1.txt),
-а GPL 2.0 в файле [GPL-2.0.txt](GPL-2.0.txt).
--- a/README.md
+++ b/README.md
@ -1,7 +1,5 @@
 ## Vitastor

-[Читать на русском](README-ru.md)
-
 ## The Idea

 Make Software-Defined Block Storage Great Again.
@ -18,8 +16,7 @@ breaking changes in the future. However, the following is implemented:

 - Basic part: highly-available block storage with symmetric clustering and no SPOF
 - Performance ;-D
- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
-  based on jerasure library with any number of data and parity drives in a group
+- Two redundancy schemes: Replication and XOR n+1 (simplest case of EC)
 - Configuration via simple JSON data structures in etcd
 - Automatic data distribution over OSDs, with support for:
  - Mathematical optimization for better uniformity and less data movement
@ -34,24 +31,25 @@ breaking changes in the future. However, the following is implemented:
 - QEMU driver (built out-of-tree)
 - Loadable fio engine for benchmarks (also built out-of-tree)
 - NBD proxy for kernel mounts
- Inode removal tool (vitastor-rm)
- Packaging for Debian and CentOS
- Per-inode I/O and space usage statistics
- Inode metadata storage in etcd
- Snapshots and copy-on-write image clones
+- Inode removal tool (./rm_inode)

 ## Roadmap

- Better OSD creation and auto-start tools
+- Packaging for Debian and, probably, CentOS too
+- OSD creation tool (OSDs currently have to be created by hand)
 - Other administrative tools
- Plugins for OpenStack, Kubernetes, OpenNebula, Proxmox and other cloud systems
+- Per-inode I/O and space usage statistics
+- jerasure EC support with any number of data and parity drives in a group
+- Parallel usage of multiple network interfaces
+- Proxmox and OpenNebula plugins
 - iSCSI proxy
+- Inode metadata storage in etcd
+- Snapshots and copy-on-write image clones
 - Operation timeouts and better failure detection
 - Scrubbing without checksums (verification of replicas)
 - Checksums
 - SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
 - RDMA and NVDIMM support
- Web GUI
 - Compression (possibly)
 - Read caching using system page cache (possibly)

@ -282,34 +280,7 @@ Vitastor with single-thread NBD on the same hardware:
 - Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio)
 - Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio)

-## Installation
-
-### Debian
-
- Trust Vitastor package signing key:
-  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
- Add Vitastor package repository to your /etc/apt/sources.list:
-  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
-  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- For Debian 10 (Buster) also enable backports repository:
-  `deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`
-
-### CentOS
-
- Add Vitastor package repository:
-  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
-  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
- Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories:
-  - CentOS 7: `yum install centos-release-scl`
-  - CentOS 8: `dnf install centos-release-advanced-virtualization`
- Enable elrepo-kernel:
-  - CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
-  - CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
-
-### Building from Source
+## Building

 - Install Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly recommended because
  there is at least one known io_uring hang with 5.4 and an HP SmartArray controller.
@ -319,10 +290,10 @@ Vitastor with single-thread NBD on the same hardware:
  branch release-3.4, because there is a bug in upstream etcd which makes Vitastor OSDs fail to
  move PGs out of "starting" state if you have at least around ~500 PGs or so. The custom build
  will be unnecessary when etcd merges the fix: https://github.com/etcd-io/etcd/pull/12402.
- Install node.js 10 or newer.
- Install gcc and g++ 8.x or newer.
+- Install node.js 12 or newer.
+- Install gcc and g++ 9.x or later.
 - Clone https://yourcmc.ru/git/vitalif/vitastor/ with submodules.
- Install QEMU 3.0+, get its source, begin to build it, stop the build and copy headers:
+- Install QEMU 4.x or 5.x, get its source, begin to build it, stop the build and copy headers:
   - `<qemu>/include` &rarr; `<vitastor>/qemu/include`
   - Debian:
      * Use qemu packages from the main repository
@ -332,14 +303,11 @@ Vitastor with single-thread NBD on the same hardware:
      * Use qemu packages from the Advanced-Virtualization repository. To enable it, run
        `yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
      * `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
-      * For QEMU 3.0+: `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
-      * For QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
+      * `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
   - `config-host.h` and `qapi` are required because they contain generated headers
- You can also rebuild QEMU with a patch that makes LD_PRELOAD unnecessary to load vitastor driver.
-  See `qemu-*.*-vitastor.patch`.
- Install fio 3.7 or later, get its source and symlink it into `<vitastor>/fio`.
- Build & install Vitastor with `mkdir build && cd build && cmake .. && make -j8 && make install`.
-  Pay attention to the `QEMU_PLUGINDIR` cmake option - it must be set to `qemu-kvm` on RHEL.
+- Install fio 3.16 or later, get its source and symlink it into `<vitastor>/fio`.
+- Build Vitastor with `make -j8`.
+- Copy binaries somewhere.

 ## Running

@ -350,16 +318,20 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
  with lazy fsync, but prepare for inferior single-thread latency.
 - Get a fast network (at least 10 Gbit/s).
 - Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- Check `/usr/lib/vitastor/mon/make-units.sh` and `/usr/lib/vitastor/mon/make-osd.sh` and
-  put desired values into the variables at the top of these files.
- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
- Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
- You can edit the units and change OSD configuration. Notable configuration variables:
+- Start etcd with `--max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision` options.
+- Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
+  (if all your drives have capacitors).
+- Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
+- Calculate offsets for your drives with `node ./mon/simple-offsets.js --device /dev/sdX`.
+- Make systemd units for your OSDs. Look at `./mon/make-units.sh` for example.
+  Notable configuration variables from the example:
  - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
  - `immediate_commit all` - use this if all your drives are server-grade.
  - `disable_device_lock 1` - only required if you run multiple OSDs on one block device.
-  - `flusher_count 256` - flusher is a micro-thread that removes old data from the journal.
-    You don't have to worry about this parameter anymore, 256 is enough.
+  - `flusher_count 16` - flusher is a micro-thread that removes old data from the journal.
+    More flushers mean more aggressive journal flushing which allows for more throughput
+    but slightly hurts latency under light load. Flushing will probably be improved in the future
+    because currently high queue depths sometimes lead to performance degradation.
  - `disk_alignment`, `journal_block_size`, `meta_block_size` should be set to the internal
    block size of your SSDs which is 4096 on most drives.
  - `journal_no_same_sector_overwrites true` prevents multiple overwrites of the same journal sector.
@ -370,42 +342,39 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
    setting is set, it is also required to raise `journal_sector_buffer_count` setting, which is the
    number of dirty journal sectors that may be written to at the same time.
 - `systemctl start vitastor.target` everywhere.
- Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
-  (if all your drives have capacitors).
- Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
-  For jerasure pools the configuration should look like the following: `2:{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
+- Start any number of monitors: `cd mon; node mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
 - At this point, one of the monitors will configure PGs and OSDs will start them.
 - You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
- Run tests with (for example): `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
+- Run tests with (for example): `fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
 - Upload VM disk image with qemu-img (for example):
  ```
-  qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
+  LD_PRELOAD=./qemu_driver.so qemu-img convert -f qcow2 debian10.qcow2 -p
+    -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
  ```
-  Note that the command requires to be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
-  if you use unmodified QEMU.
 - Run QEMU with (for example):
  ```
-  qemu-system-x86_64 -enable-kvm -m 1024
+  LD_PRELOAD=./qemu_driver.so qemu-system-x86_64 -enable-kvm -m 1024
    -drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
    -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
    -vnc 0.0.0.0:0
  ```
 - Remove inode with (for example):
  ```
-  vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
+  ./rm_inode --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
  ```

 ## Known Problems

- Object deletion requests may currently lead to 'incomplete' objects in EC pools
-  if your OSDs crash during deletion because proper handling of object cleanup
-  in a cluster should be "three-phase" and it's currently not implemented.
-  Just repeat the removal request again in this case.
+- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
+  deletion because proper handling of object cleanup in a cluster should be "three-phase"
+  and it's currently not implemented. The inode removal tool currently can't handle unclean
+  objects, so incomplete objects become undeletable. This will be fixed in the near future
+  by allowing the inode removal tool to delete unclean objects. Once this problem is fixed,
+  you'll be able to simply repeat the removal request.

 ## Implementation Principles

- I like architecturally simple solutions. Vitastor is and will always be designed
-  exactly like that.
+- I like simple and stupid solutions, so expect Vitastor to stay simple.
 - I also like reinventing the wheel to some extent, like writing my own HTTP client
  for etcd interaction instead of using prebuilt libraries, because in this case
  I'm confident about what my code does and what it doesn't do.
@ -420,30 +389,25 @@ and calculate disk offsets almost by hand. This will be fixed in near future.

 Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+

-Join Vitastor Telegram Chat: https://t.me/vitastor
+You can also find me in the Russian Telegram Ceph chat: https://t.me/ceph_ru

 All server-side code (OSD, Monitor and so on) is licensed under the terms of
-Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
+Vitastor Network Public License 1.0 (VNPL 1.0), a copyleft license based on
 GNU GPLv3.0 with the additional "Network Interaction" clause which requires
 opensourcing all programs directly or indirectly interacting with Vitastor
-through a computer network and expressly designed to be used in conjunction
-with it ("Proxy Programs"). Proxy Programs may be made public not only under
-the terms of the same license, but also under the terms of any GPL-Compatible
-Free Software License, as listed by the Free Software Foundation.
+through a computer network ("Proxy Programs"). Proxy Programs may be made public
+not only under the terms of the same license, but also under the terms of any
+GPL-Compatible Free Software License, as listed by the Free Software Foundation.
 This is a stricter copyleft license than the Affero GPL.

-Please note that VNPL doesn't require you to open the code of proprietary
-software running inside a VM if it's not specially designed to be used with
-Vitastor.
-
 Basically, you can't use the software in a proprietary environment to provide
 its functionality to users without opensourcing all intermediary components
 standing between the user and Vitastor or purchasing a commercial license
 from the author 😀.

 Client libraries (cluster_client and so on) are dual-licensed under the same
-VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
+VNPL 1.0 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
 software like QEMU and fio.

-You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
+You can find the full text of VNPL-1.0 in the file [VNPL-1.0.txt](VNPL-1.0.txt).
 GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).
--- a/VNPL-1.0.txt
+++ b/VNPL-1.0.txt
@ -1,7 +1,7 @@
                     VITASTOR NETWORK PUBLIC LICENSE
-                     Version 1.1,  6 February 2021
+                       Version 1, 17 September 2020

- Copyright (C) 2021 Vitaliy Filippov <vitalif@yourcmc.ru>
+ Copyright (C) 2020 Vitaliy Filippov <vitalif@yourcmc.ru>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

@ -540,15 +540,12 @@ License would be to refrain entirely from conveying the Program.

  13. Remote Network Interaction.

-  A "Proxy Program" means a separate program which is specially designed to
-be used in conjunction with the covered work and interacts with it directly
-or indirectly through any kind of API (application programming interfaces),
-a computer network, an imitation of such network, or another Proxy Program
-itself.
-
-  Notwithstanding any other provision of this License, if you provide any user
-with an opportunity to interact with the covered work through a computer
-network, an imitation of such network, or any number of "Proxy Programs",
+  Notwithstanding any other provision of this License, if you provide
+any user an opportunity to interact with the covered work directly
+or indirectly through a computer network, an imitation of such network,
+or an additional program (hereinafter referred to as a "Proxy Program")
+that, in turn, interacts with the covered work through a computer network,
+an imitation of such network, or another Proxy Program itself,
 you must prominently offer that user an opportunity to receive the
 Corresponding Source of the covered work and all Proxy Programs from a
 network server at no charge, through some standard or customary means of
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include <stdexcept>
 #include "allocator.h"
@ -13,19 +13,19 @@ allocator::allocator(uint64_t blocks)
    {
        throw std::invalid_argument("blocks");
    }
-    uint64_t p2 = 1;
-    total = 0;
+    uint64_t p2 = 1, total = 1;
    while (p2 * 64 < blocks)
    {
-        total += p2;
        p2 = p2 * 64;
+        total += p2;
    }
+    total -= p2;
    total += (blocks+63) / 64;
-    mask = new uint64_t[total];
+    mask = new uint64_t[2 + total];
    size = free = blocks;
    last_one_mask = (blocks % 64) == 0
        ? UINT64_MAX
-        : ((1l << (blocks % 64)) - 1);
+        : ~(UINT64_MAX << (64 - blocks % 64));
    for (uint64_t i = 0; i < total; i++)
    {
        mask[i] = 0;
@ -99,10 +99,6 @@ uint64_t allocator::find_free()
    uint64_t p2 = 1, offset = 0, addr = 0, f, i;
    while (p2 < size)
    {
-        if (offset+addr >= total)
-        {
-            return UINT64_MAX;
-        }
        uint64_t m = mask[offset + addr];
        for (i = 0, f = 1; i < 64; i++, f <<= 1)
        {
@ -117,6 +113,11 @@ uint64_t allocator::find_free()
            return UINT64_MAX;
        }
        addr = (addr * 64) | i;
+        if (addr >= size)
+        {
+            // No space
+            return UINT64_MAX;
+        }
        offset += p2;
        p2 = p2 * 64;
    }
@ -127,35 +128,3 @@ uint64_t allocator::get_free_count()
 {
    return free;
 }
-
-void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity)
-{
-    if (start == 0)
-    {
-        if (len == 32*bitmap_granularity)
-        {
-            *((uint32_t*)bitmap) = UINT32_MAX;
-            return;
-        }
-        else if (len == 64*bitmap_granularity)
-        {
-            *((uint64_t*)bitmap) = UINT64_MAX;
-            return;
-        }
-    }
-    unsigned bit_start = start / bitmap_granularity;
-    unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
-    while (bit_start < bit_end)
-    {
-        if (!(bit_start & 7) && bit_end >= bit_start+8)
-        {
-            ((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
-            bit_start += 8;
-        }
-        else
-        {
-            ((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
-            bit_start++;
-        }
-    }
-}
--- a/src/allocator.h
+++ b/src/allocator.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -8,7 +8,6 @@
 // Hierarchical bitmap allocator
 class allocator
 {
-    uint64_t total;
    uint64_t size;
    uint64_t free;
    uint64_t last_one_mask;
@ -20,5 +19,3 @@ public:
    uint64_t find_free();
    uint64_t get_free_count();
 };
-
-void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
--- a/src/base64.cpp
+++ b/src/base64.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "base64.h"

--- a/src/base64.h
+++ b/src/base64.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once
 #include <string>
--- a/src/blockstore.cpp
+++ b/src/blockstore.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -35,7 +35,12 @@ bool blockstore_t::is_safe_to_stop()

 void blockstore_t::enqueue_op(blockstore_op_t *op)
 {
-    impl->enqueue_op(op);
+    impl->enqueue_op(op, false);
+}
+
+void blockstore_t::enqueue_op_first(blockstore_op_t *op)
+{
+    impl->enqueue_op(op, true);
 }

 std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
@ -43,11 +48,6 @@ std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
    return impl->unstable_writes;
 }

-std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
-{
-    return impl->inode_space_stats;
-}
-
 uint32_t blockstore_t::get_block_size()
 {
    return impl->get_block_size();
@ -63,7 +63,7 @@ uint64_t blockstore_t::get_free_block_count()
    return impl->get_free_block_count();
 }

-uint32_t blockstore_t::get_bitmap_granularity()
+uint32_t blockstore_t::get_disk_alignment()
 {
-    return impl->get_bitmap_granularity();
+    return impl->get_disk_alignment();
 }
--- a/src/blockstore.h
+++ b/src/blockstore.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -27,7 +27,6 @@
 #define DEFAULT_ORDER 17
 #define MIN_BLOCK_SIZE 4*1024
 #define MAX_BLOCK_SIZE 128*1024*1024
-#define DEFAULT_BITMAP_GRANULARITY 4096

 #define BS_OP_MIN 1
 #define BS_OP_READ 1
@ -65,8 +64,6 @@ Input:
 - offset, len = offset and length within object. length may be zero, in that case
  read operation only returns the version / write operation only bumps the version
 - buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
- bitmap = pointer to the new 'external' object bitmap data. Its part which is respective to the
-  write request is copied into the metadata area bitwise and stored there.

 Output:
 - retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
@ -144,7 +141,6 @@ struct blockstore_op_t
    uint32_t offset;
    uint32_t len;
    void *buf;
-    void *bitmap;
    int retval;

    uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
@ -179,16 +175,17 @@ public:
    // Submission
    void enqueue_op(blockstore_op_t *op);

+    // Insert operation into the beginning of the queue
+    // Intended for the OSD syncer "thread" to be able to stabilize something when the journal is full
+    void enqueue_op_first(blockstore_op_t *op);
+
    // Unstable writes are added here (map of object_id -> version)
    std::unordered_map<object_id, uint64_t> & get_unstable_writes();

-    // Get per-inode space usage statistics
-    std::map<uint64_t, uint64_t> & get_inode_space_stats();
-
    // FIXME rename to object_size
    uint32_t get_block_size();
    uint64_t get_block_count();
    uint64_t get_free_block_count();

-    uint32_t get_bitmap_granularity();
+    uint32_t get_disk_alignment();
 };
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -7,8 +7,6 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
 {
    this->bs = bs;
    this->flusher_count = flusher_count;
-    this->cur_flusher_count = 1;
-    this->target_flusher_count = 1;
    dequeuing = false;
    trimming = false;
    active_flushers = 0;
@ -70,31 +68,14 @@ bool journal_flusher_t::is_active()

 void journal_flusher_t::loop()
 {
-    target_flusher_count = bs->write_iodepth*2;
-    if (target_flusher_count <= 0)
-        target_flusher_count = 1;
-    else if (target_flusher_count > flusher_count)
-        target_flusher_count = flusher_count;
-    if (target_flusher_count > cur_flusher_count)
-        cur_flusher_count = target_flusher_count;
-    else if (target_flusher_count < cur_flusher_count)
+    for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
    {
-        while (target_flusher_count < cur_flusher_count)
-        {
-            if (co[cur_flusher_count-1].wait_state)
-                break;
-            cur_flusher_count--;
-        }
-    }
-    for (int i = 0; (active_flushers > 0 || dequeuing) && i < cur_flusher_count; i++)
        co[i].loop();
+    }
 }

 void journal_flusher_t::enqueue_flush(obj_ver_id ov)
 {
-#ifdef BLOCKSTORE_DEBUG
-    printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
-#endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
    {
@ -113,11 +94,8 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
    }
 }

-void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
+void journal_flusher_t::unshift_flush(obj_ver_id ov)
 {
-#ifdef BLOCKSTORE_DEBUG
-    printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
-#endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
    {
@ -127,38 +105,15 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
    else
    {
        flush_versions[ov.oid] = ov.version;
-        if (!force)
-            flush_queue.push_front(ov.oid);
-    }
-    if (force)
        flush_queue.push_front(ov.oid);
-    if (force || !dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
+    }
+    if (!dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
    {
        dequeuing = true;
        bs->ringloop->wakeup();
    }
 }

-void journal_flusher_t::remove_flush(object_id oid)
-{
-#ifdef BLOCKSTORE_DEBUG
-    printf("undo_flush %lx:%lx\n", oid.inode, oid.stripe);
-#endif
-    auto v_it = flush_versions.find(oid);
-    if (v_it != flush_versions.end())
-    {
-        flush_versions.erase(v_it);
-        for (auto q_it = flush_queue.begin(); q_it != flush_queue.end(); q_it++)
-        {
-            if (*q_it == oid)
-            {
-                flush_queue.erase(q_it);
-                break;
-            }
-        }
-    }
-}
-
 void journal_flusher_t::request_trim()
 {
    dequeuing = true;
@ -239,7 +194,6 @@ bool journal_flusher_co::loop()
 resume_0:
    if (!flusher->flush_queue.size() || !flusher->dequeuing)
    {
-stop_flusher:
        if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
        {
            // Attempt forced trim
@ -344,7 +298,9 @@ stop_flusher:
 #ifdef BLOCKSTORE_DEBUG
                    printf("No older flushes, stopping\n");
 #endif
-                    goto stop_flusher;
+                    flusher->dequeuing = false;
+                    wait_state = 0;
+                    return true;
                }
            }
        }
@ -363,8 +319,8 @@ resume_1:
            return false;
        }
        // Writes and deletes shouldn't happen at the same time
-        assert(!has_writes || !has_delete);
-        if (!has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
+        assert(!(copy_count > 0 || has_writes) || !has_delete);
+        if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
        {
            // Nothing to flush
            bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
@ -426,18 +382,18 @@ resume_1:
        {
            new_clean_bitmap = (bs->inmemory_meta
                ? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
-                : bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size));
+                : bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size);
            if (clean_init_bitmap)
            {
                memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
-                bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->bitmap_granularity);
+                bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len);
            }
        }
        for (it = v.begin(); it != v.end(); it++)
        {
            if (new_clean_bitmap)
            {
-                bitmap_set(new_clean_bitmap, it->offset, it->len, bs->bitmap_granularity);
+                bitmap_set(new_clean_bitmap, it->offset, it->len);
            }
            await_sqe(4);
            data->iov = (struct iovec){ it->buf, (size_t)it->len };
@ -471,7 +427,6 @@ resume_1:
                wait_state = 5;
                return false;
            }
-            // zero out old metadata entry
            memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
            await_sqe(15);
            data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
@ -483,7 +438,6 @@ resume_1:
        }
        if (has_delete)
        {
-            // zero out new metadata entry
            memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
        }
        else
@ -491,8 +445,8 @@ resume_1:
            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
            if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
            {
-                printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx) with %lx:%lx\n",
-                    clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
+                printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lx (%lx:%lx) with %lx:%lx\n",
+                    clean_loc, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
                exit(1);
            }
            new_entry->oid = cur.oid;
@ -501,12 +455,6 @@ resume_1:
            {
                memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
            }
-            // copy latest external bitmap/attributes
-            if (bs->clean_entry_bitmap_size)
-            {
-                void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
-                memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
-            }
        }
        await_sqe(6);
        data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
@ -565,7 +513,7 @@ resume_1:
        if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
        {
            // Requeue version
-            flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second }, false);
+            flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
        }
        flusher->sync_to_repeat.erase(repeat_it);
    trim_journal:
@ -654,7 +602,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        {
            char err[1024];
            snprintf(
-                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: %d",
+                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu state during flush: %d",
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
            );
            throw std::runtime_error(err);
@ -831,34 +779,31 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
    sync_found:
        cur_sync->ready_count++;
        flusher->syncing_flushers++;
-    resume_1:
-        if (!cur_sync->state)
+        if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
        {
-            if (flusher->syncing_flushers >= flusher->cur_flusher_count || !flusher->flush_queue.size())
+            // Sync batch is ready. Do it.
+            await_sqe(0);
+            data->iov = { 0 };
+            data->callback = simple_callback_w;
+            my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
+            cur_sync->state = 1;
+            wait_count++;
+        resume_1:
+            if (wait_count > 0)
            {
-                // Sync batch is ready. Do it.
-                await_sqe(0);
-                data->iov = { 0 };
-                data->callback = simple_callback_w;
-                my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
-                cur_sync->state = 1;
-                wait_count++;
-            resume_2:
-                if (wait_count > 0)
-                {
-                    wait_state = 2;
-                    return false;
-                }
-                // Sync completed. All previous coroutines waiting for it must be resumed
-                cur_sync->state = 2;
-                bs->ringloop->wakeup();
-            }
-            else
-            {
-                // Wait until someone else sends and completes a sync.
                wait_state = 1;
                return false;
            }
+            // Sync completed. All previous coroutines waiting for it must be resumed
+            cur_sync->state = 2;
+            bs->ringloop->wakeup();
+        }
+        // Wait until someone else sends and completes a sync.
+    resume_2:
+        if (!cur_sync->state)
+        {
+            wait_state = 2;
+            return false;
        }
        flusher->syncing_flushers--;
        cur_sync->ready_count--;
@ -869,3 +814,35 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
    }
    return true;
 }
+
+void journal_flusher_co::bitmap_set(void *bitmap, uint64_t start, uint64_t len)
+{
+    if (start == 0)
+    {
+        if (len == 32*bs->bitmap_granularity)
+        {
+            *((uint32_t*)bitmap) = UINT32_MAX;
+            return;
+        }
+        else if (len == 64*bs->bitmap_granularity)
+        {
+            *((uint64_t*)bitmap) = UINT64_MAX;
+            return;
+        }
+    }
+    unsigned bit_start = start / bs->bitmap_granularity;
+    unsigned bit_end = ((start + len) + bs->bitmap_granularity - 1) / bs->bitmap_granularity;
+    while (bit_start < bit_end)
+    {
+        if (!(bit_start & 7) && bit_end >= bit_start+8)
+        {
+            ((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
+            bit_start += 8;
+        }
+        else
+        {
+            ((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
+            bit_start++;
+        }
+    }
+}
--- a/src/blockstore_flush.h
+++ b/src/blockstore_flush.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 struct copy_buffer_t
 {
@ -69,6 +69,7 @@ class journal_flusher_co
    bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
    void update_clean_db();
    bool fsync_batch(bool fsync_meta, int wait_base);
+    void bitmap_set(void *bitmap, uint64_t start, uint64_t len);
 public:
    journal_flusher_co();
    bool loop();
@ -79,7 +80,7 @@ class journal_flusher_t
 {
    int trim_wanted = 0;
    bool dequeuing;
-    int flusher_count, cur_flusher_count, target_flusher_count;
+    int flusher_count;
    int flusher_start_threshold;
    journal_flusher_co *co;
    blockstore_impl_t *bs;
@ -106,6 +107,5 @@ public:
    void request_trim();
    void release_trim();
    void enqueue_flush(obj_ver_id oid);
-    void unshift_flush(obj_ver_id oid, bool force);
-    void remove_flush(object_id oid);
+    void unshift_flush(obj_ver_id oid);
 };
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -10,9 +10,9 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
    ring_consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(&ring_consumer);
    initialized = 0;
+    zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
    data_fd = meta_fd = journal.fd = -1;
    parse_config(config);
-    zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
    try
    {
        open_data();
@ -101,14 +101,26 @@ void blockstore_impl_t::loop()
    {
        // try to submit ops
        unsigned initial_ring_space = ringloop->space_left();
-        // has_writes == 0 - no writes before the current queue item
-        // has_writes == 1 - some writes in progress
-        // has_writes == 2 - tried to submit some writes, but failed
-        int has_writes = 0, op_idx = 0, new_idx = 0;
-        for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
+        // FIXME: rework this "sync polling"
+        auto cur_sync = in_progress_syncs.begin();
+        while (cur_sync != in_progress_syncs.end())
        {
-            auto op = submit_queue[op_idx];
-            submit_queue[new_idx] = op;
+            if (continue_sync(*cur_sync) != 2)
+            {
+                // List is unmodified
+                cur_sync++;
+            }
+            else
+            {
+                cur_sync = in_progress_syncs.begin();
+            }
+        }
+        auto cur = submit_queue.begin();
+        int has_writes = 0;
+        while (cur != submit_queue.end())
+        {
+            auto op_ptr = cur;
+            auto op = *(cur++);
            // FIXME: This needs some simplification
            // Writes should not block reads if the ring is not full and reads don't depend on them
            // In all other cases we should stop submission
@ -130,13 +142,10 @@ void blockstore_impl_t::loop()
            }
            unsigned ring_space = ringloop->space_left();
            unsigned prev_sqe_pos = ringloop->save();
-            // 0 = can't submit
-            // 1 = in progress
-            // 2 = can be removed from queue
-            int wr_st = 0;
+            bool dequeue_op = false;
            if (op->opcode == BS_OP_READ)
            {
-                wr_st = dequeue_read(op);
+                dequeue_op = dequeue_read(op);
            }
            else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
            {
@ -145,8 +154,8 @@ void blockstore_impl_t::loop()
                    // Some writes already could not be submitted
                    continue;
                }
-                wr_st = dequeue_write(op);
-                has_writes = wr_st > 0 ? 1 : 2;
+                dequeue_op = dequeue_write(op);
+                has_writes = dequeue_op ? 1 : 2;
            }
            else if (op->opcode == BS_OP_DELETE)
            {
@ -155,8 +164,8 @@ void blockstore_impl_t::loop()
                    // Some writes already could not be submitted
                    continue;
                }
-                wr_st = dequeue_del(op);
-                has_writes = wr_st > 0 ? 1 : 2;
+                dequeue_op = dequeue_del(op);
+                has_writes = dequeue_op ? 1 : 2;
            }
            else if (op->opcode == BS_OP_SYNC)
            {
@ -169,31 +178,29 @@ void blockstore_impl_t::loop()
                    // Can't submit SYNC before previous writes
                    continue;
                }
-                wr_st = continue_sync(op, false);
-                if (wr_st != 2)
-                {
-                    has_writes = wr_st > 0 ? 1 : 2;
-                }
+                dequeue_op = dequeue_sync(op);
            }
            else if (op->opcode == BS_OP_STABLE)
            {
-                wr_st = dequeue_stable(op);
+                dequeue_op = dequeue_stable(op);
            }
            else if (op->opcode == BS_OP_ROLLBACK)
            {
-                wr_st = dequeue_rollback(op);
+                dequeue_op = dequeue_rollback(op);
            }
            else if (op->opcode == BS_OP_LIST)
            {
-                // LIST doesn't need to be blocked by previous modifications
+                // LIST doesn't need to be blocked by previous modifications,
+                // it only needs to include all in-progress writes as they're guaranteed
+                // to be readable and stabilizable/rollbackable by subsequent operations
                process_list(op);
-                wr_st = 2;
+                dequeue_op = true;
            }
-            if (wr_st == 2)
+            if (dequeue_op)
            {
-                new_idx--;
+                submit_queue.erase(op_ptr);
            }
-            if (wr_st == 0)
+            else
            {
                ringloop->restore(prev_sqe_pos);
                if (PRIV(op)->wait_for == WAIT_SQE)
@ -204,14 +211,6 @@ void blockstore_impl_t::loop()
                }
            }
        }
-        if (op_idx != new_idx)
-        {
-            while (op_idx < submit_queue.size())
-            {
-                submit_queue[new_idx++] = submit_queue[op_idx++];
-            }
-            submit_queue.resize(new_idx);
-        }
        if (!readonly)
        {
            flusher->loop();
@ -234,7 +233,7 @@ bool blockstore_impl_t::is_safe_to_stop()
 {
    // It's safe to stop blockstore when there are no in-flight operations,
    // no in-progress syncs and flusher isn't doing anything
-    if (submit_queue.size() > 0 || !readonly && flusher->is_active())
+    if (submit_queue.size() > 0 || in_progress_syncs.size() > 0 || !readonly && flusher->is_active())
    {
        return false;
    }
@ -288,7 +287,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
    {
        int next = ((journal.cur_sector + 1) % journal.sector_count);
-        if (journal.sector_info[next].flush_count > 0 ||
+        if (journal.sector_info[next].usage_count > 0 ||
            journal.sector_info[next].dirty)
        {
            // do not submit
@ -301,7 +300,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    }
    else if (PRIV(op)->wait_for == WAIT_FREE)
    {
-        if (!data_alloc->get_free_count() && flusher->is_active())
+        if (!data_alloc->get_free_count() && !flusher->is_active())
        {
 #ifdef BLOCKSTORE_DEBUG
            printf("Still waiting for free space on the data device\n");
@ -316,7 +315,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    }
 }

-void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
+void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
 {
    if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
        ((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && (
@ -324,7 +323,8 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
            op->len > block_size-op->offset ||
            (op->len % disk_alignment)
        )) ||
-        readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST)
+        readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
+        first && (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE))
    {
        // Basic verification not passed
        op->retval = -EINVAL;
@ -374,12 +374,25 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
        std::function<void (blockstore_op_t*)>(op->callback)(op);
        return;
    }
+    if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
+    {
+        op->retval = 0;
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        return;
+    }
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
    PRIV(op)->wait_for = 0;
    PRIV(op)->op_state = 0;
    PRIV(op)->pending_ops = 0;
-    submit_queue.push_back(op);
+    if (!first)
+    {
+        submit_queue.push_back(op);
+    }
+    else
+    {
+        submit_queue.push_front(op);
+    }
    ringloop->wakeup();
 }

--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -30,13 +30,12 @@
 #define BS_ST_BIG_WRITE 0x02
 #define BS_ST_DELETE 0x03

-#define BS_ST_WAIT_DEL 0x10
-#define BS_ST_WAIT_BIG 0x20
-#define BS_ST_IN_FLIGHT 0x30
-#define BS_ST_SUBMITTED 0x40
-#define BS_ST_WRITTEN 0x50
-#define BS_ST_SYNCED 0x60
-#define BS_ST_STABLE 0x70
+#define BS_ST_WAIT_BIG 0x10
+#define BS_ST_IN_FLIGHT 0x20
+#define BS_ST_SUBMITTED 0x30
+#define BS_ST_WRITTEN 0x40
+#define BS_ST_SYNCED 0x50
+#define BS_ST_STABLE 0x60

 #define BS_ST_INSTANT 0x100

@ -77,8 +76,7 @@

 #include "blockstore_journal.h"

-// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
-// per "clean" entry on disk with fixed metadata tables
+// 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables
 // FIXME: maybe add crc32's to metadata
 struct __attribute__((__packed__)) clean_disk_entry
 {
@ -94,7 +92,7 @@ struct __attribute__((__packed__)) clean_entry
    uint64_t location;
 };

-// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
+// 56 = 24 + 32 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
 struct __attribute__((__packed__)) dirty_entry
 {
    uint32_t state;
@ -103,7 +101,6 @@ struct __attribute__((__packed__)) dirty_entry
    uint32_t offset;   // data offset within object (stripe)
    uint32_t len;      // data length
    uint64_t journal_sector; // journal sector used for this entry
-    void* bitmap;   // either external bitmap itself when it fits, or a pointer to it when it doesn't
 };

 // - Sync must be submitted after previous writes/deletes (not before!)
@ -156,12 +153,12 @@ struct blockstore_op_private_t

    // Write
    struct iovec iov_zerofill[3];
-    // Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
-    uint64_t real_version;

    // Sync
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
    int sync_small_checked, sync_big_checked;
+    std::list<blockstore_op_t*>::iterator in_progress_ptr;
+    int prev_sync_count;
 };

 // https://github.com/algorithm-ninja/cpp-btree
@ -199,10 +196,7 @@ class blockstore_impl_t
    // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
    int immediate_commit = IMMEDIATE_NONE;
    bool inmemory_meta = false;
-    // Maximum flusher count
-    unsigned flusher_count;
-    // Maximum queue depth
-    unsigned max_write_iodepth = 128;
+    int flusher_count;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;
@ -210,8 +204,9 @@ class blockstore_impl_t
    blockstore_clean_db_t clean_db;
    uint8_t *clean_bitmap = NULL;
    blockstore_dirty_db_t dirty_db;
-    std::vector<blockstore_op_t*> submit_queue;
+    std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
+    std::list<blockstore_op_t*> in_progress_syncs; // ...and probably here, too
    allocator *data_alloc = NULL;
    uint8_t *zero_object;

@ -228,7 +223,6 @@ class blockstore_impl_t

    struct journal_t journal;
    journal_flusher_t *flusher;
-    int write_iodepth = 0;

    bool live = false, queue_stall = false;
    ring_loop_t *ringloop;
@ -251,7 +245,6 @@ class blockstore_impl_t
    void open_data();
    void open_meta();
    void open_journal();
-    uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);

    // Asynchronous init
    int initialized;
@ -271,7 +264,6 @@ class blockstore_impl_t

    // Write
    bool enqueue_write(blockstore_op_t *op);
-    void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
    int dequeue_write(blockstore_op_t *op);
    int dequeue_del(blockstore_op_t *op);
    int continue_write(blockstore_op_t *op);
@ -279,9 +271,11 @@ class blockstore_impl_t
    void handle_write_event(ring_data_t *data, blockstore_op_t *op);

    // Sync
-    int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
+    int dequeue_sync(blockstore_op_t *op);
    void handle_sync_event(ring_data_t *data, blockstore_op_t *op);
-    void ack_sync(blockstore_op_t *op);
+    int continue_sync(blockstore_op_t *op);
+    void ack_one_sync(blockstore_op_t *op);
+    int ack_sync(blockstore_op_t *op);

    // Stabilize
    int dequeue_stable(blockstore_op_t *op);
@ -321,16 +315,13 @@ public:
    bool is_stalled();

    // Submission
-    void enqueue_op(blockstore_op_t *op);
+    void enqueue_op(blockstore_op_t *op, bool first = false);

    // Unstable writes are added here (map of object_id -> version)
    std::unordered_map<object_id, uint64_t> unstable_writes;

-    // Space usage statistics
-    std::map<uint64_t, uint64_t> inode_space_stats;
-
    inline uint32_t get_block_size() { return block_size; }
    inline uint64_t get_block_count() { return block_count; }
    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
-    inline uint32_t get_bitmap_granularity() { return disk_alignment; }
+    inline uint32_t get_disk_alignment() { return disk_alignment; }
 };
--- a/src/blockstore_init.cpp
+++ b/src/blockstore_init.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -100,7 +100,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
        clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
        if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
        {
-            memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size);
+            memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size);
        }
        if (entry->oid.inode > 0)
        {
@ -111,14 +111,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
                {
                    // free the previous block
 #ifdef BLOCKSTORE_DEBUG
-                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i);
+                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i >> block_order);
 #endif
                    bs->data_alloc->set(clean_it->second.location >> block_order, false);
                }
-                else
-                {
-                    bs->inode_space_stats[entry->oid.inode] += bs->block_size;
-                }
                entries_loaded++;
 #ifdef BLOCKSTORE_DEBUG
                printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
@ -534,21 +530,6 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .oid = je->small_write.oid,
                        .version = je->small_write.version,
                    };
-                    void *bmp = (void*)je + sizeof(journal_entry_small_write);
-                    if (bs->clean_entry_bitmap_size <= sizeof(void*))
-                    {
-                        memcpy(&bmp, bmp, bs->clean_entry_bitmap_size);
-                    }
-                    else if (!bs->journal.inmemory)
-                    {
-                        // FIXME Using large blockstore objects and not keeping journal in memory
-                        // will result in a lot of small allocations for entry bitmaps. This can
-                        // only be fixed by using a patched map with dynamic entry size, but not
-                        // the btree_map, because it doesn't keep iterators valid all the time.
-                        void *bmp_cp = malloc_or_die(bs->clean_entry_bitmap_size);
-                        memcpy(bmp_cp, bmp, bs->clean_entry_bitmap_size);
-                        bmp = bmp_cp;
-                    }
                    bs->dirty_db.emplace(ov, (dirty_entry){
                        .state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED),
                        .flags = 0,
@ -556,7 +537,6 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .offset = je->small_write.offset,
                        .len = je->small_write.len,
                        .journal_sector = proc_pos,
-                        .bitmap = bmp,
                    });
                    bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
@ -577,56 +557,11 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            {
 #ifdef BLOCKSTORE_DEBUG
                printf(
-                    "je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
+                    "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n",
                    je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
-                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order
+                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
                );
 #endif
-                auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
-                    .oid = je->big_write.oid,
-                    .version = UINT64_MAX,
-                });
-                if (dirty_it != bs->dirty_db.begin() && bs->dirty_db.size() > 0)
-                {
-                    dirty_it--;
-                    if (dirty_it->first.oid == je->big_write.oid &&
-                        dirty_it->first.version >= je->big_write.version &&
-                        (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE)
-                    {
-                        // It is allowed to overwrite a deleted object with a
-                        // version number smaller than deletion version number,
-                        // because the presence of a BIG_WRITE entry means that
-                        // its data and metadata are already flushed.
-                        // We don't know if newer versions are flushed, but
-                        // the previous delete definitely is.
-                        // So we flush previous dirty entries, but retain the clean one.
-                        // This feature is required for writes happening shortly
-                        // after deletes.
-                        auto dirty_end = dirty_it;
-                        dirty_end++;
-                        while (1)
-                        {
-                            if (dirty_it == bs->dirty_db.begin())
-                            {
-                                break;
-                            }
-                            dirty_it--;
-                            if (dirty_it->first.oid != je->big_write.oid)
-                            {
-                                dirty_it++;
-                                break;
-                            }
-                        }
-                        auto clean_it = bs->clean_db.find(je->big_write.oid);
-                        bs->erase_dirty(
-                            dirty_it, dirty_end,
-                            clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
-                        );
-                        // Remove it from the flusher's queue, too
-                        // Otherwise it may end up referring to a small unstable write after reading the rest of the journal
-                        bs->flusher->remove_flush(je->big_write.oid);
-                    }
-                }
                auto clean_it = bs->clean_db.find(je->big_write.oid);
                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->big_write.version)
@ -636,21 +571,6 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .oid = je->big_write.oid,
                        .version = je->big_write.version,
                    };
-                    void *bmp = (void*)je + sizeof(journal_entry_big_write);
-                    if (bs->clean_entry_bitmap_size <= sizeof(void*))
-                    {
-                        memcpy(&bmp, bmp, bs->clean_entry_bitmap_size);
-                    }
-                    else if (!bs->journal.inmemory)
-                    {
-                        // FIXME Using large blockstore objects and not keeping journal in memory
-                        // will result in a lot of small allocations for entry bitmaps. This can
-                        // only be fixed by using a patched map with dynamic entry size, but not
-                        // the btree_map, because it doesn't keep iterators valid all the time.
-                        void *bmp_cp = malloc_or_die(bs->clean_entry_bitmap_size);
-                        memcpy(bmp_cp, bmp, bs->clean_entry_bitmap_size);
-                        bmp = bmp_cp;
-                    }
                    bs->dirty_db.emplace(ov, (dirty_entry){
                        .state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
                        .flags = 0,
@ -658,7 +578,6 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        .offset = je->big_write.offset,
                        .len = je->big_write.len,
                        .journal_sector = proc_pos,
-                        .bitmap = bmp,
                    });
 #ifdef BLOCKSTORE_DEBUG
                    printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
@ -709,7 +628,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
 #endif
                auto clean_it = bs->clean_db.find(je->del.oid);
-                if (clean_it != bs->clean_db.end() &&
+                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->del.version)
                {
                    // oid, version
--- a/src/blockstore_init.h
+++ b/src/blockstore_init.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

--- a/src/blockstore_journal.cpp
+++ b/src/blockstore_journal.cpp
@ -1,12 +1,12 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

 blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
 {
    this->bs = bs;
-    sectors_to_write = 0;
+    sectors_required = 0;
    next_pos = bs->journal.next_free;
    next_sector = bs->journal.cur_sector;
    first_sector = -1;
@ -20,26 +20,23 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    int required = entries_required;
    while (1)
    {
-        int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
+        int fits = bs->journal.no_same_sector_overwrites && bs->journal.sector_info[next_sector].written
            ? 0
            : (bs->journal.block_size - next_in_pos) / size;
        if (fits > 0)
        {
-            if (fits > required)
-            {
-                fits = required;
-            }
            if (first_sector == -1)
            {
                first_sector = next_sector;
            }
            required -= fits;
            next_in_pos += fits * size;
-            sectors_to_write++;
+            sectors_required++;
        }
        else if (bs->journal.sector_info[next_sector].dirty)
        {
-            sectors_to_write++;
+            // Despite its name, sectors_required really counts "sectors to write": a dirty sector that fits no more entries is counted too
+            sectors_required++;
        }
        if (required <= 0)
        {
@ -62,7 +59,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
                " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
            );
        }
-        if (bs->journal.sector_info[next_sector].flush_count > 0 ||
+        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
            bs->journal.sector_info[next_sector].dirty)
        {
            // No memory buffer available. Wait for it.
@ -74,18 +71,17 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
                    dirty++;
                    used++;
                }
-                if (bs->journal.sector_info[i].flush_count > 0)
+                if (bs->journal.sector_info[i].usage_count > 0)
                {
                    used++;
                }
            }
            // In fact, it's even more rare than "ran out of journal space", so print a warning
            printf(
-                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)"
-                " is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
+                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
                used, bs->journal.sector_count, dirty, next_sector,
                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
-                bs->journal.sector_info[next_sector].flush_count
+                bs->journal.sector_info[next_sector].usage_count
            );
            PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
            return 0;
@ -104,8 +100,11 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    {
        // No space in the journal. Wait until used_start changes.
        printf(
-            "Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n",
-            bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
+            "Ran out of journal space (free space: %lu bytes, sectors to write: %d)\n",
+            (bs->journal.next_free >= bs->journal.used_start
+                ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
+                : bs->journal.used_start - bs->journal.next_free),
+            sectors_required
        );
        PRIV(op)->wait_for = WAIT_JOURNAL;
        bs->flusher->request_trim();
@ -117,21 +116,22 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries

 journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
 {
-    if (!journal.entry_fits(size))
+    if (journal.block_size - journal.in_sector_pos < size ||
+        journal.no_same_sector_overwrites && journal.sector_info[journal.cur_sector].written)
    {
        assert(!journal.sector_info[journal.cur_sector].dirty);
        // Move to the next journal sector
-        if (journal.sector_info[journal.cur_sector].flush_count > 0)
+        journal.sector_info[journal.cur_sector].written = false;
+        if (journal.sector_info[journal.cur_sector].usage_count > 0)
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
-            assert(!journal.sector_info[journal.cur_sector].flush_count);
+            assert(!journal.sector_info[journal.cur_sector].usage_count);
        }
        else
        {
            journal.dirty_start = journal.next_free;
        }
-        journal.sector_info[journal.cur_sector].written = false;
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
        journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
@ -157,7 +157,7 @@ void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_
 {
    journal.sector_info[cur_sector].dirty = false;
    journal.sector_info[cur_sector].written = true;
-    journal.sector_info[cur_sector].flush_count++;
+    journal.sector_info[cur_sector].usage_count++;
    ring_data_t *data = ((ring_data_t*)sqe->user_data);
    data->iov = (struct iovec){
        (journal.inmemory
--- a/src/blockstore_journal.h
+++ b/src/blockstore_journal.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -54,9 +54,6 @@ struct __attribute__((__packed__)) journal_entry_small_write
    // data_offset is its offset within journal
    uint64_t data_offset;
    uint32_t crc32_data;
-    // small_write and big_write entries are followed by the "external" bitmap
-    // its size is dynamic and included in journal entry's <size> field
-    uint8_t bitmap[];
 };

 struct __attribute__((__packed__)) journal_entry_big_write
@ -71,9 +68,6 @@ struct __attribute__((__packed__)) journal_entry_big_write
    uint32_t offset;
    uint32_t len;
    uint64_t location;
-    // small_write and big_write entries are followed by the "external" bitmap
-    // its size is dynamic and included in journal entry's <size> field
-    uint8_t bitmap[];
 };

 struct __attribute__((__packed__)) journal_entry_stable
@ -139,7 +133,7 @@ inline uint32_t je_crc32(journal_entry *je)
 struct journal_sector_info_t
 {
    uint64_t offset;
-    uint64_t flush_count;
+    uint64_t usage_count;
    bool written;
    bool dirty;
 };
@ -176,18 +170,13 @@ struct journal_t
    ~journal_t();
    bool trim();
    uint64_t get_trim_pos();
-    inline bool entry_fits(int size)
-    {
-        return !(block_size - in_sector_pos < size ||
-            no_same_sector_overwrites && sector_info[cur_sector].written);
-    }
 };

 struct blockstore_journal_check_t
 {
    blockstore_impl_t *bs;
    uint64_t next_pos, next_sector, next_in_pos;
-    int sectors_to_write, first_sector;
+    int sectors_required, first_sector;
    bool right_dir; // writing to the end or the beginning of the ring buffer

    blockstore_journal_check_t(blockstore_impl_t *bs);
--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include <sys/file.h>
 #include "blockstore_impl.h"
@ -70,7 +70,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
    bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
    flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
-    max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
    // Validate
    if (!block_size)
    {
@ -84,17 +83,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    {
        flusher_count = 32;
    }
-    if (!max_write_iodepth)
-    {
-        max_write_iodepth = 128;
-    }
    if (!disk_alignment)
    {
        disk_alignment = 4096;
    }
    else if (disk_alignment % MEM_ALIGNMENT)
    {
-        throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
+        throw std::runtime_error("disk_alingment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
    }
    if (!journal_block_size)
    {
@ -118,7 +113,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    if (!bitmap_granularity)
    {
-        bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
+        bitmap_granularity = 4096;
    }
    else if (bitmap_granularity % disk_alignment)
    {
@ -170,7 +165,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
    }
    // init some fields
    clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
-    clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
+    clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
    journal.block_size = journal_block_size;
    journal.next_free = journal_block_size;
    journal.used_start = journal_block_size;
@ -237,7 +232,7 @@ void blockstore_impl_t::calc_lengths()
    }
    else if (clean_entry_bitmap_size)
    {
-        clean_bitmap = (uint8_t*)malloc(block_count * 2*clean_entry_bitmap_size);
+        clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size);
        if (!clean_bitmap)
            throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
    }
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -94,21 +94,6 @@ endwhile:
    return 1;
 }

-uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
-{
-    uint8_t *clean_entry_bitmap;
-    uint64_t meta_loc = block_loc >> block_order;
-    if (inmemory_meta)
-    {
-        uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
-        uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
-        clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
-    }
-    else
-        clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset);
-    return clean_entry_bitmap;
-}
-
 int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
 {
    auto clean_it = clean_db.find(read_op->oid);
@ -127,7 +112,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        read_op->version = 0;
        read_op->retval = read_op->len;
        FINISH_OP(read_op);
-        return 2;
+        return 1;
    }
    uint64_t fulfilled = 0;
    PRIV(read_op)->pending_ops = 0;
@ -149,11 +134,6 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
                if (!result_version)
                {
                    result_version = dirty_it->first.version;
-                    if (read_op->bitmap)
-                    {
-                        void *bmp_ptr = (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
-                        memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
-                    }
                }
                if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
                    dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
@ -175,11 +155,6 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        if (!result_version)
        {
            result_version = clean_it->second.version;
-            if (read_op->bitmap)
-            {
-                void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
-                memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
-            }
        }
        if (fulfilled < read_op->len)
        {
@ -194,7 +169,18 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
            }
            else
            {
-                uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
+                uint64_t meta_loc = clean_it->second.location >> block_order;
+                uint8_t *clean_entry_bitmap;
+                if (inmemory_meta)
+                {
+                    uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
+                    uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
+                    clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
+                }
+                else
+                {
+                    clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
+                }
                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
                while (bmp_start < bmp_size)
                {
@ -205,8 +191,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
                    if (bmp_end > bmp_start)
                    {
                        // fill with zeroes
-                        assert(fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
-                            bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
+                        fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                            bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
                    }
                    bmp_start = bmp_end;
                    while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
@ -232,7 +218,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
    else if (fulfilled < read_op->len)
    {
        // fill remaining parts with zeroes
-        assert(fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
+        fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
    }
    assert(fulfilled == read_op->len);
    read_op->version = result_version;
@ -246,10 +232,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        }
        read_op->retval = read_op->len;
        FINISH_OP(read_op);
-        return 2;
+        return 1;
    }
    read_op->retval = 0;
-    return 2;
+    return 1;
 }

 void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -50,7 +50,7 @@ skip_ov:
                {
                    op->retval = -EBUSY;
                    FINISH_OP(op);
-                    return 2;
+                    return 1;
                }
                if (dirty_it == dirty_db.begin())
                {
@ -66,7 +66,7 @@ skip_ov:
        // Already rolled back
        op->retval = 0;
        FINISH_OP(op);
-        return 2;
+        return 1;
    }
    // Check journal space
    blockstore_journal_check_t space_check(this);
@ -75,35 +75,44 @@ skip_ov:
        return 0;
    }
    // There is sufficient space. Get SQEs
-    struct io_uring_sqe *sqe[space_check.sectors_to_write];
-    for (i = 0; i < space_check.sectors_to_write; i++)
+    struct io_uring_sqe *sqe[space_check.sectors_required];
+    for (i = 0; i < space_check.sectors_required; i++)
    {
        BS_SUBMIT_GET_SQE_DECL(sqe[i]);
    }
    // Prepare and submit journal entries
    auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
    int s = 0, cur_sector = -1;
+    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_rollback) &&
+        journal.sector_info[journal.cur_sector].dirty)
+    {
+        if (cur_sector == -1)
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+        cur_sector = journal.cur_sector;
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
-        if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
-            journal.sector_info[journal.cur_sector].dirty)
-        {
-            if (cur_sector == -1)
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-            prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
-            cur_sector = journal.cur_sector;
-        }
        journal_entry_rollback *je = (journal_entry_rollback*)
            prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
+        journal.sector_info[journal.cur_sector].dirty = false;
        je->oid = v->oid;
        je->version = v->version;
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
+        if (cur_sector != journal.cur_sector)
+        {
+            // Write the previous sector. A sector should be written only after it is filled,
+            // because otherwise a lot more sectors get written in the "no_same_sector_overwrites" mode
+            if (cur_sector != -1)
+                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+            else
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            cur_sector = journal.cur_sector;
+        }
    }
-    prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
-    assert(s == space_check.sectors_to_write);
-    if (cur_sector == -1)
-        PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+    if (cur_sector != -1)
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
    PRIV(op)->op_state = 1;
@ -126,8 +135,11 @@ resume_2:
 resume_3:
    if (!disable_journal_fsync)
    {
-        io_uring_sqe *sqe;
-        BS_SUBMIT_GET_SQE_DECL(sqe);
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            return 0;
+        }
        ring_data_t *data = ((ring_data_t*)sqe->user_data);
        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
        data->iov = { 0 };
@ -148,7 +160,7 @@ resume_5:
    // Acknowledge op
    op->retval = 0;
    FINISH_OP(op);
-    return 2;
+    return 1;
 }

 void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
@ -163,7 +175,10 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
        auto rm_start = it;
        auto rm_end = it;
        it--;
-        while (1)
+        while (it->first.oid == ov.oid &&
+            it->first.version > ov.version &&
+            !IS_IN_FLIGHT(it->second.state) &&
+            !IS_STABLE(it->second.state))
        {
            if (it->first.oid != ov.oid)
                break;
@ -173,7 +188,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
                    max_unstable = it->first.version;
                break;
            }
-            else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
+            else if (IS_STABLE(it->second.state))
                break;
            // Remove entry
            rm_start = it;
@ -184,14 +199,14 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
        if (rm_start != rm_end)
        {
            erase_dirty(rm_start, rm_end, UINT64_MAX);
-            auto unstab_it = unstable_writes.find(ov.oid);
-            if (unstab_it != unstable_writes.end())
-            {
-                if (max_unstable == 0)
-                    unstable_writes.erase(unstab_it);
-                else
-                    unstab_it->second = max_unstable;
-            }
+        }
+        auto unstab_it = unstable_writes.find(ov.oid);
+        if (unstab_it != unstable_writes.end())
+        {
+            if (max_unstable == 0)
+                unstable_writes.erase(unstab_it);
+            else
+                unstab_it->second = max_unstable;
        }
    }
 }
@ -210,44 +225,19 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
    if (PRIV(op)->pending_ops == 0)
    {
        PRIV(op)->op_state++;
-        ringloop->wakeup();
+        if (!continue_rollback(op))
+        {
+            submit_queue.push_front(op);
+        }
    }
 }

 void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
 {
-    if (dirty_end == dirty_start)
-    {
-        return;
-    }
    auto dirty_it = dirty_end;
-    dirty_it--;
-    if (IS_DELETE(dirty_it->second.state))
+    while (dirty_it != dirty_start)
    {
-        object_id oid = dirty_it->first.oid;
-#ifdef BLOCKSTORE_DEBUG
-        printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
-#endif
-        dirty_it = dirty_end;
-        // Unblock operations blocked by delete flushing
-        uint32_t next_state = BS_ST_IN_FLIGHT;
-        while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
-        {
-            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
-            {
-                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
-                if (IS_BIG_WRITE(dirty_it->second.state))
-                {
-                    next_state = BS_ST_WAIT_BIG;
-                }
-            }
-            dirty_it++;
-        }
-        dirty_it = dirty_end;
        dirty_it--;
-    }
-    while (1)
-    {
        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
        {
 #ifdef BLOCKSTORE_DEBUG
@ -266,16 +256,6 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
        }
-        if (clean_entry_bitmap_size > sizeof(void*))
-        {
-            free(dirty_it->second.bitmap);
-            dirty_it->second.bitmap = NULL;
-        }
-        if (dirty_it == dirty_start)
-        {
-            break;
-        }
-        dirty_it--;
    }
    dirty_db.erase(dirty_start, dirty_end);
 }
--- a/src/blockstore_stable.cpp
+++ b/src/blockstore_stable.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -60,7 +60,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
                // No such object version
                op->retval = -ENOENT;
                FINISH_OP(op);
-                return 2;
+                return 1;
            }
            else
            {
@ -77,7 +77,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
            // Object not synced yet. Caller must sync it first
            op->retval = -EBUSY;
            FINISH_OP(op);
-            return 2;
+            return 1;
        }
        else if (!IS_STABLE(dirty_it->second.state))
        {
@ -89,7 +89,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        // Already stable
        op->retval = 0;
        FINISH_OP(op);
-        return 2;
+        return 1;
    }
    // Check journal space
    blockstore_journal_check_t space_check(this);
@ -98,36 +98,45 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        return 0;
    }
    // There is sufficient space. Get SQEs
-    struct io_uring_sqe *sqe[space_check.sectors_to_write];
-    for (i = 0; i < space_check.sectors_to_write; i++)
+    struct io_uring_sqe *sqe[space_check.sectors_required];
+    for (i = 0; i < space_check.sectors_required; i++)
    {
        BS_SUBMIT_GET_SQE_DECL(sqe[i]);
    }
    // Prepare and submit journal entries
    auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
    int s = 0, cur_sector = -1;
+    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_stable) &&
+        journal.sector_info[journal.cur_sector].dirty)
+    {
+        if (cur_sector == -1)
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+        cur_sector = journal.cur_sector;
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
        // FIXME: Only stabilize versions that aren't stable yet
-        if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
-            journal.sector_info[journal.cur_sector].dirty)
-        {
-            if (cur_sector == -1)
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-            prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
-            cur_sector = journal.cur_sector;
-        }
        journal_entry_stable *je = (journal_entry_stable*)
            prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
+        journal.sector_info[journal.cur_sector].dirty = false;
        je->oid = v->oid;
        je->version = v->version;
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
+        if (cur_sector != journal.cur_sector)
+        {
+            // Write previous sector. We should write the sector only after filling it,
+            // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
+            if (cur_sector != -1)
+                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+            else
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            cur_sector = journal.cur_sector;
+        }
    }
-    prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
-    assert(s == space_check.sectors_to_write);
-    if (cur_sector == -1)
-        PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+    if (cur_sector != -1)
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
    PRIV(op)->op_state = 1;
@ -150,8 +159,11 @@ resume_2:
 resume_3:
    if (!disable_journal_fsync)
    {
-        io_uring_sqe *sqe;
-        BS_SUBMIT_GET_SQE_DECL(sqe);
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            return 0;
+        }
        ring_data_t *data = ((ring_data_t*)sqe->user_data);
        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
        data->iov = { 0 };
@ -173,7 +185,7 @@ resume_5:
    // Acknowledge op
    op->retval = 0;
    FINISH_OP(op);
-    return 2;
+    return 1;
 }

 void blockstore_impl_t::mark_stable(const obj_ver_id & v)
@ -186,15 +198,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v)
            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
            {
                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
-                // Allocations and deletions are counted when they're stabilized
-                if (IS_BIG_WRITE(dirty_it->second.state))
-                {
-                    inode_space_stats[dirty_it->first.oid.inode] += block_size;
-                }
-                else if (IS_DELETE(dirty_it->second.state))
-                {
-                    inode_space_stats[dirty_it->first.oid.inode] -= block_size;
-                }
            }
            else if (IS_STABLE(dirty_it->second.state))
            {
@ -210,6 +213,9 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v)
                break;
            }
        }
+#ifdef BLOCKSTORE_DEBUG
+        printf("enqueue_flush %lx:%lx v%lu\n", v.oid.inode, v.oid.stripe, v.version);
+#endif
        flusher->enqueue_flush(v);
    }
    auto unstab_it = unstable_writes.find(v.oid);
@ -234,6 +240,9 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
    if (PRIV(op)->pending_ops == 0)
    {
        PRIV(op)->op_state++;
-        ringloop->wakeup();
+        if (!continue_stable(op))
+        {
+            submit_queue.push_front(op);
+        }
    }
 }
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -12,15 +12,8 @@
 #define SYNC_JOURNAL_SYNC_SENT 7
 #define SYNC_DONE 8

-int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
+int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
 {
-    if (immediate_commit == IMMEDIATE_ALL)
-    {
-        // We can return immediately because sync is only dequeued after all previous writes
-        op->retval = 0;
-        FINISH_OP(op);
-        return 2;
-    }
    if (PRIV(op)->op_state == 0)
    {
        stop_sync_submitted = false;
@ -36,15 +29,34 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
            PRIV(op)->op_state = SYNC_HAS_SMALL;
        else
            PRIV(op)->op_state = SYNC_DONE;
+        // Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
+        PRIV(op)->prev_sync_count = in_progress_syncs.size();
+        PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
    }
+    continue_sync(op);
+    // Always dequeue because we always add syncs to in_progress_syncs
+    return 1;
+}
+
+int blockstore_impl_t::continue_sync(blockstore_op_t *op)
+{
+    auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
    if (PRIV(op)->op_state == SYNC_HAS_SMALL)
    {
        // No big writes, just fsync the journal
+        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
+        {
+            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
+            {
+                // Wait for small inflight writes to complete
+                return 0;
+            }
+        }
        if (journal.sector_info[journal.cur_sector].dirty)
        {
            // Write out the last journal sector if it happens to be dirty
            BS_SUBMIT_GET_ONLY_SQE(sqe);
-            prepare_journal_sector_write(journal, journal.cur_sector, sqe, [this, op](ring_data_t *data) { handle_sync_event(data, op); });
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
            PRIV(op)->pending_ops = 1;
            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
@ -57,13 +69,21 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
    }
    if (PRIV(op)->op_state == SYNC_HAS_BIG)
    {
+        for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
+        {
+            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_big_writes[PRIV(op)->sync_big_checked]].state))
+            {
+                // Wait for big inflight writes to complete
+                return 0;
+            }
+        }
        // 1st step: fsync data
        if (!disable_data_fsync)
        {
            BS_SUBMIT_GET_SQE(sqe, data);
            my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
            data->iov = { 0 };
-            data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
+            data->callback = cb;
            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
            PRIV(op)->pending_ops = 1;
            PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
@ -76,6 +96,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
    }
    if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
    {
+        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
+        {
+            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
+            {
+                // Wait for small inflight writes to complete
+                return 0;
+            }
+        }
        // 2nd step: Data device is synced, prepare & write journal entries
        // Check space in the journal and journal memory buffers
        blockstore_journal_check_t space_check(this);
@ -84,29 +112,30 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
            return 0;
        }
        // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
-        struct io_uring_sqe *sqe[space_check.sectors_to_write];
-        for (int i = 0; i < space_check.sectors_to_write; i++)
+        struct io_uring_sqe *sqe[space_check.sectors_required];
+        for (int i = 0; i < space_check.sectors_required; i++)
        {
            BS_SUBMIT_GET_SQE_DECL(sqe[i]);
        }
        // Prepare and submit journal entries
        auto it = PRIV(op)->sync_big_writes.begin();
        int s = 0, cur_sector = -1;
+        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
+            journal.sector_info[journal.cur_sector].dirty)
+        {
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            cur_sector = journal.cur_sector;
+            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+        }
        while (it != PRIV(op)->sync_big_writes.end())
        {
-            if (!journal.entry_fits(sizeof(journal_entry_big_write)) &&
-                journal.sector_info[journal.cur_sector].dirty)
-            {
-                if (cur_sector == -1)
-                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-                prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
-                cur_sector = journal.cur_sector;
-            }
            journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
                journal, (dirty_db[*it].state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
                sizeof(journal_entry_big_write)
            );
            dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
+            journal.sector_info[journal.cur_sector].dirty = false;
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
            printf(
@ -123,11 +152,19 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
            je->crc32 = je_crc32((journal_entry*)je);
            journal.crc32_last = je->crc32;
            it++;
+            if (cur_sector != journal.cur_sector)
+            {
+                // Write previous sector. We should write the sector only after filling it,
+                // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
+                if (cur_sector != -1)
+                    prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+                else
+                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+                cur_sector = journal.cur_sector;
+            }
        }
-        prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
-        assert(s == space_check.sectors_to_write);
-        if (cur_sector == -1)
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+        if (cur_sector != -1)
+            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops = s;
        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
@ -140,7 +177,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
            BS_SUBMIT_GET_SQE(sqe, data);
            my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
            data->iov = { 0 };
-            data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
+            data->callback = cb;
            PRIV(op)->pending_ops = 1;
            PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
            return 1;
@ -150,10 +187,9 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
            PRIV(op)->op_state = SYNC_DONE;
        }
    }
-    if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
+    if (PRIV(op)->op_state == SYNC_DONE)
    {
-        ack_sync(op);
-        return 2;
+        return ack_sync(op);
    }
    return 1;
 }
@ -185,16 +221,42 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
        else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
        {
            PRIV(op)->op_state = SYNC_DONE;
+            ack_sync(op);
        }
        else
        {
            throw std::runtime_error("BUG: unexpected sync op state");
        }
-        ringloop->wakeup();
    }
 }

-void blockstore_impl_t::ack_sync(blockstore_op_t *op)
+int blockstore_impl_t::ack_sync(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
+    {
+        // Remove dependency of subsequent syncs
+        auto it = PRIV(op)->in_progress_ptr;
+        int done_syncs = 1;
+        ++it;
+        // Acknowledge sync
+        ack_one_sync(op);
+        while (it != in_progress_syncs.end())
+        {
+            auto & next_sync = *it++;
+            PRIV(next_sync)->prev_sync_count -= done_syncs;
+            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
+            {
+                done_syncs++;
+                // Acknowledge next_sync
+                ack_one_sync(next_sync);
+            }
+        }
+        return 2;
+    }
+    return 0;
+}
+
+void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
 {
    // Handle states
    for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
@ -242,6 +304,7 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
            }
        }
    }
+    in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
    op->retval = 0;
    FINISH_OP(op);
 }
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "blockstore_impl.h"

@ -7,13 +7,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
    // Check or assign version number
    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
-    bool wait_big = false, wait_del = false;
-    void *bmp = NULL;
+    bool is_inflight_big = false;
    uint64_t version = 1;
-    if (!is_del && clean_entry_bitmap_size > sizeof(void*))
-    {
-        bmp = calloc_or_die(1, clean_entry_bitmap_size);
-    }
    if (dirty_db.size() > 0)
    {
        auto dirty_it = dirty_db.upper_bound((obj_ver_id){
@ -26,14 +21,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
-            wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
-            wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
+            is_inflight_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
                ? !IS_SYNCED(dirty_it->second.state)
                : ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
-            if (clean_entry_bitmap_size > sizeof(void*))
-                memcpy(bmp, dirty_it->second.bitmap, clean_entry_bitmap_size);
-            else
-                bmp = dirty_it->second.bitmap;
        }
    }
    if (!found)
@ -42,55 +32,29 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        if (clean_it != clean_db.end())
        {
            version = clean_it->second.version + 1;
-            void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
-            memcpy((clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, clean_entry_bitmap_size);
        }
        else
        {
            deleted = true;
        }
    }
+    if (op->version == 0)
+    {
+        op->version = version;
+    }
+    else if (op->version < version)
+    {
+        // Invalid version requested
+        op->retval = -EEXIST;
+        return false;
+    }
    if (deleted && is_del)
    {
        // Already deleted
        op->retval = 0;
        return false;
    }
-    PRIV(op)->real_version = 0;
-    if (op->version == 0)
-    {
-        op->version = version;
-    }
-    else if (op->version < version)
-    {
-        // Implicit operations must be added like that: DEL [FLUSH] BIG [SYNC] SMALL SMALL
-        if (deleted || wait_del)
-        {
-            // It's allowed to write versions with low numbers over deletes
-            // However, we have to flush those deletes first as we use version number for ordering
-#ifdef BLOCKSTORE_DEBUG
-            printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
-#endif
-            wait_del = true;
-            PRIV(op)->real_version = op->version;
-            op->version = version;
-            flusher->unshift_flush((obj_ver_id){
-                .oid = op->oid,
-                .version = version-1,
-            }, true);
-        }
-        else
-        {
-            // Invalid version requested
-            op->retval = -EEXIST;
-            if (!is_del && clean_entry_bitmap_size > sizeof(void*))
-            {
-                free(bmp);
-            }
-            return false;
-        }
-    }
-    if (wait_big && !is_del && !deleted && op->len < block_size &&
+    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
        immediate_commit != IMMEDIATE_ALL)
    {
        // Issue an additional sync so that the previous big write can reach the journal
@ -105,88 +69,31 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 #ifdef BLOCKSTORE_DEBUG
    if (is_del)
        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
-    else if (!wait_del)
+    else
        printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
-    // FIXME No strict need to add it into dirty_db here, it's just left
+    // No strict need to add it into dirty_db here, it's just left
    // from the previous implementation where reads waited for writes
-    uint32_t state;
-    if (is_del)
-        state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
-    else
-    {
-        state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
-        if (wait_del)
-            state |= BS_ST_WAIT_DEL;
-        else if (state == BS_ST_SMALL_WRITE && wait_big)
-            state |= BS_ST_WAIT_BIG;
-        else
-            state |= BS_ST_IN_FLIGHT;
-        if (op->opcode == BS_OP_WRITE_STABLE)
-            state |= BS_ST_INSTANT;
-        if (op->bitmap)
-        {
-            // Only allow to overwrite part of the object bitmap respective to the write's offset/len
-            uint8_t *bmp_ptr = (uint8_t*)(clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
-            uint32_t bit = op->offset/bitmap_granularity;
-            uint32_t bits_left = op->len/bitmap_granularity;
-            while (!(bit % 8) && bits_left > 8)
-            {
-                // Copy bytes
-                bmp_ptr[bit/8] = ((uint8_t*)op->bitmap)[bit/8];
-                bit += 8;
-                bits_left -= 8;
-            }
-            while (bits_left > 0)
-            {
-                // Copy bits
-                bmp_ptr[bit/8] = (bmp_ptr[bit/8] & ~(1 << (bit%8)))
-                    | (((uint8_t*)op->bitmap)[bit/8] & (1 << bit%8));
-                bit++;
-                bits_left--;
-            }
-        }
-    }
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    }, (dirty_entry){
-        .state = state,
+        .state = (uint32_t)(
+            is_del
+                ? (BS_ST_DELETE | BS_ST_IN_FLIGHT)
+                : (op->opcode == BS_OP_WRITE_STABLE ? BS_ST_INSTANT : 0) | (op->len == block_size || deleted
+                    ? (BS_ST_BIG_WRITE | BS_ST_IN_FLIGHT)
+                    : (is_inflight_big ? (BS_ST_SMALL_WRITE | BS_ST_WAIT_BIG) : (BS_ST_SMALL_WRITE | BS_ST_IN_FLIGHT)))
+        ),
        .flags = 0,
        .location = 0,
        .offset = is_del ? 0 : op->offset,
        .len = is_del ? 0 : op->len,
        .journal_sector = 0,
-        .bitmap = bmp,
    });
    return true;
 }

-void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval)
-{
-    while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
-    {
-        if (clean_entry_bitmap_size > sizeof(void*))
-            free(dirty_it->second.bitmap);
-        dirty_db.erase(dirty_it++);
-    }
-    bool found = false;
-    for (auto other_op: submit_queue)
-    {
-        if (!found && other_op == op)
-            found = true;
-        else if (found && other_op->oid == op->oid &&
-            (other_op->opcode == BS_OP_WRITE || other_op->opcode == BS_OP_WRITE_STABLE))
-        {
-            // Mark operations to cancel them
-            PRIV(other_op)->real_version = UINT64_MAX;
-            other_op->retval = retval;
-        }
-    }
-    op->retval = retval;
-    FINISH_OP(op);
-}
-
 // First step of the write algorithm: dequeue operation and submit initial write(s)
 int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 {
@ -199,46 +106,12 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        .version = op->version,
    });
    assert(dirty_it != dirty_db.end());
-    if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) < BS_ST_IN_FLIGHT)
+    if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
    {
        // Don't dequeue
        return 0;
    }
-    if (PRIV(op)->real_version != 0)
-    {
-        if (PRIV(op)->real_version == UINT64_MAX)
-        {
-            // This is the flag value used to cancel operations
-            FINISH_OP(op);
-            return 2;
-        }
-        // Restore original low version number for unblocked operations
-#ifdef BLOCKSTORE_DEBUG
-        printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
-#endif
-        auto prev_it = dirty_it;
-        prev_it--;
-        if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
-        {
-            // Original version is still invalid
-            // All subsequent writes to the same object must be canceled too
-            cancel_all_writes(op, dirty_it, -EEXIST);
-            return 2;
-        }
-        op->version = PRIV(op)->real_version;
-        PRIV(op)->real_version = 0;
-        dirty_entry e = dirty_it->second;
-        dirty_db.erase(dirty_it);
-        dirty_it = dirty_db.emplace((obj_ver_id){
-            .oid = op->oid,
-            .version = op->version,
-        }, e).first;
-    }
-    if (write_iodepth >= max_write_iodepth)
-    {
-        return 0;
-    }
-    if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+    else if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@ -256,10 +129,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                PRIV(op)->wait_for = WAIT_FREE;
                return 0;
            }
-            cancel_all_writes(op, dirty_it, -ENOSPC);
-            return 2;
+            op->retval = -ENOSPC;
+            FINISH_OP(op);
+            return 1;
        }
-        write_iodepth++;
        BS_SUBMIT_GET_SQE(sqe, data);
        dirty_it->second.location = loc << block_order;
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
@ -312,7 +185,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        {
            return 0;
        }
-        write_iodepth++;
        // There is sufficient space. Get SQE(s)
        struct io_uring_sqe *sqe1 = NULL;
        if (immediate_commit != IMMEDIATE_NONE ||
@ -345,7 +217,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
-            sizeof(journal_entry_small_write) + clean_entry_bitmap_size
+            sizeof(journal_entry_small_write)
        );
        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@ -364,7 +236,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        je->len = op->len;
        je->data_offset = journal.next_free;
        je->crc32_data = crc32c(0, op->buf, op->len);
-        memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
        if (immediate_commit != IMMEDIATE_NONE)
@ -411,7 +282,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        if (!PRIV(op)->pending_ops)
        {
            PRIV(op)->op_state = 4;
-            return continue_write(op);
+            continue_write(op);
        }
        else
        {
@ -425,29 +296,30 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
 {
    io_uring_sqe *sqe = NULL;
    journal_entry_big_write *je;
-    int op_state = PRIV(op)->op_state;
-    if (op_state != 2 && op_state != 4)
-    {
-        // In progress
-        return 1;
-    }
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
    assert(dirty_it != dirty_db.end());
-    if (op_state == 2)
+    if (PRIV(op)->op_state == 2)
        goto resume_2;
-    else if (op_state == 4)
+    else if (PRIV(op)->op_state == 4)
        goto resume_4;
+    else
+        return 1;
 resume_2:
    // Only for the immediate_commit mode: prepare and submit big_write journal entry
-    BS_SUBMIT_GET_SQE_DECL(sqe);
+    sqe = get_sqe();
+    if (!sqe)
+    {
+        return 0;
+    }
    je = (journal_entry_big_write*)prefill_single_journal_entry(
        journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
-        sizeof(journal_entry_big_write) + clean_entry_bitmap_size
+        sizeof(journal_entry_big_write)
    );
    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+    journal.sector_info[journal.cur_sector].dirty = false;
    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
    printf(
@ -461,7 +333,6 @@ resume_2:
    je->offset = op->offset;
    je->len = op->len;
    je->location = dirty_it->second.location;
-    memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
    je->crc32 = je_crc32((journal_entry*)je);
    journal.crc32_last = je->crc32;
    prepare_journal_sector_write(journal, journal.cur_sector, sqe,
@ -473,7 +344,7 @@ resume_2:
 resume_4:
    // Switch object state
 #ifdef BLOCKSTORE_DEBUG
-    printf("Ack write %lx:%lx v%lu = state %x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+    printf("Ack write %lx:%lx v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
 #endif
    bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
        ? (immediate_commit == IMMEDIATE_ALL)
@ -504,9 +375,8 @@ resume_4:
    }
    // Acknowledge write
    op->retval = op->len;
-    write_iodepth--;
    FINISH_OP(op);
-    return 2;
+    return 1;
 }

 void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
@ -525,7 +395,10 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    {
        release_journal_sectors(op);
        PRIV(op)->op_state++;
-        ringloop->wakeup();
+        if (!continue_write(op))
+        {
+            submit_queue.push_front(op);
+        }
    }
 }

@ -538,8 +411,8 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
        uint64_t s = PRIV(op)->min_flushed_journal_sector;
        while (1)
        {
-            journal.sector_info[s-1].flush_count--;
-            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0)
+            journal.sector_info[s-1].usage_count--;
+            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
            {
                // We know for sure that we won't write into this sector anymore
                uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
@ -563,10 +436,6 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)

 int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
 {
-    if (PRIV(op)->op_state)
-    {
-        return continue_write(op);
-    }
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
@ -577,7 +446,6 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    {
        return 0;
    }
-    write_iodepth++;
    io_uring_sqe *sqe = NULL;
    if (immediate_commit != IMMEDIATE_NONE ||
        (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
@ -624,10 +492,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops++;
-    }
-    else
-    {
-        // Remember delete as unsynced
+        // Remember delete as unsynced
        unsynced_small_writes.push_back((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
@ -636,7 +501,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    if (!PRIV(op)->pending_ops)
    {
        PRIV(op)->op_state = 4;
-        return continue_write(op);
+        continue_write(op);
    }
    else
    {
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@ -1,20 +1,20 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #include <stdexcept>
 #include "cluster_client.h"

-#define SCRAP_BUFFER_SIZE 4*1024*1024
-
 cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
 {
    this->ringloop = ringloop;
    this->tfd = tfd;
-    this->config = config;
+
+    log_level = config["log_level"].int64_value();

    msgr.osd_num = 0;
    msgr.tfd = tfd;
    msgr.ringloop = ringloop;
+    msgr.log_level = log_level;
    msgr.repeer_pgs = [this](osd_num_t peer_osd)
    {
        if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
@ -67,7 +67,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
        msgr.stop_client(op->peer_fd);
        delete op;
    };
-    msgr.init();
+    msgr.use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
+        config["use_sync_send_recv"].uint64_value();

    st_cli.tfd = tfd;
    st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@ -78,9 +79,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
    st_cli.parse_config(config);
    st_cli.load_global_config();

-    scrap_buffer_size = SCRAP_BUFFER_SIZE;
-    scrap_buffer = malloc_or_die(scrap_buffer_size);
-
    if (ringloop)
    {
        consumer.loop = [this]()
@ -99,21 +97,13 @@ cluster_client_t::~cluster_client_t()
    {
        ringloop->unregister_consumer(&consumer);
    }
-    free(scrap_buffer);
 }

-cluster_op_t::~cluster_op_t()
+void cluster_client_t::stop()
 {
-    if (buf)
+    while (msgr.clients.size() > 0)
    {
-        free(buf);
-        buf = NULL;
-    }
-    if (bitmap_buf)
-    {
-        free(bitmap_buf);
-        part_bitmaps = NULL;
-        bitmap_buf = NULL;
+        msgr.stop_client(msgr.clients.begin()->first);
    }
 }

@ -154,16 +144,20 @@ static uint32_t is_power_of_two(uint64_t value)
 void cluster_client_t::on_load_config_hook(json11::Json::object & config)
 {
    bs_block_size = config["block_size"].uint64_value();
+    bs_disk_alignment = config["disk_alignment"].uint64_value();
    bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
    if (!bs_block_size)
    {
        bs_block_size = DEFAULT_BLOCK_SIZE;
    }
+    if (!bs_disk_alignment)
+    {
+        bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
+    }
    if (!bs_bitmap_granularity)
    {
        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
    }
-    bs_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
    uint32_t block_order;
    if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
    {
@ -191,8 +185,16 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
    {
        up_wait_retry_interval = 50;
    }
-    msgr.parse_config(config);
-    msgr.parse_config(this->config);
+    msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
+    if (!msgr.peer_connect_interval)
+    {
+        msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
+    }
+    msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
+    if (!msgr.peer_connect_timeout)
+    {
+        msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
+    }
    st_cli.load_pgs();
 }

@ -226,21 +228,21 @@ void cluster_client_t::on_change_hook(json11::Json::object & changes)
            // And now they have to be resliced!
            for (auto op: cur_ops)
            {
-                if (INODE_POOL(op->cur_inode) == pool_item.first)
+                if (INODE_POOL(op->inode) == pool_item.first)
                {
                    op->needs_reslice = true;
                }
            }
            for (auto op: unsynced_writes)
            {
-                if (INODE_POOL(op->cur_inode) == pool_item.first)
+                if (INODE_POOL(op->inode) == pool_item.first)
                {
                    op->needs_reslice = true;
                }
            }
            for (auto op: syncing_writes)
            {
-                if (INODE_POOL(op->cur_inode) == pool_item.first)
+                if (INODE_POOL(op->inode) == pool_item.first)
                {
                    op->needs_reslice = true;
                }
@ -259,11 +261,6 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
    }
 }

-bool cluster_client_t::is_ready()
-{
-    return pgs_loaded;
-}
-
 void cluster_client_t::on_ready(std::function<void(void)> fn)
 {
    if (pgs_loaded)
@ -315,7 +312,7 @@ void cluster_client_t::execute(cluster_op_t *op)
    op->retval = 0;
    if (op->opcode != OSD_OP_SYNC && op->opcode != OSD_OP_READ && op->opcode != OSD_OP_WRITE ||
        (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE) && (!op->inode || !op->len ||
-        op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity))
+        op->offset % bs_disk_alignment || op->len % bs_disk_alignment))
    {
        op->retval = -EINVAL;
        std::function<void(cluster_op_t*)>(op->callback)(op);
@ -326,81 +323,34 @@ void cluster_client_t::execute(cluster_op_t *op)
        execute_sync(op);
        return;
    }
-    op->cur_inode = op->inode;
-    if (op->opcode == OSD_OP_WRITE)
+    if (op->opcode == OSD_OP_WRITE && !immediate_commit)
    {
-        auto ino_it = st_cli.inode_config.find(op->inode);
-        if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
+        if (next_writes.size() > 0)
        {
-            op->retval = -EINVAL;
-            std::function<void(cluster_op_t*)>(op->callback)(op);
+            assert(cur_sync);
+            next_writes.push_back(op);
            return;
        }
-        if (!immediate_commit)
+        if (queued_bytes >= client_dirty_limit)
        {
-            if (next_writes.size() > 0)
-            {
-                assert(cur_sync);
-                next_writes.push_back(op);
-                return;
-            }
-            if (queued_bytes >= client_dirty_limit)
-            {
-                // Push an extra SYNC operation to flush previous writes
-                next_writes.push_back(op);
-                cluster_op_t *sync_op = new cluster_op_t;
-                sync_op->is_internal = true;
-                sync_op->opcode = OSD_OP_SYNC;
-                sync_op->callback = [](cluster_op_t* sync_op) {};
-                execute_sync(sync_op);
-                return;
-            }
-            queued_bytes += op->len;
-            op = copy_write(op);
-            unsynced_writes.push_back(op);
+            // Push an extra SYNC operation to flush previous writes
+            next_writes.push_back(op);
+            cluster_op_t *sync_op = new cluster_op_t;
+            sync_op->is_internal = true;
+            sync_op->opcode = OSD_OP_SYNC;
+            sync_op->callback = [](cluster_op_t* sync_op) {};
+            execute_sync(sync_op);
+            return;
        }
+        queued_bytes += op->len;
    }
    cur_ops.insert(op);
    continue_rw(op);
 }

-cluster_op_t *cluster_client_t::copy_write(cluster_op_t *op)
-{
-    // Save operation for replay when one of PGs goes out of sync
-    // (primary OSD drops our connection in this case)
-    cluster_op_t *op_copy = new cluster_op_t();
-    op_copy->is_internal = true;
-    op_copy->orig_op = op;
-    op_copy->opcode = op->opcode;
-    op_copy->inode = op->inode;
-    op_copy->cur_inode = op->inode;
-    op_copy->offset = op->offset;
-    op_copy->len = op->len;
-    op_copy->buf = malloc_or_die(op->len);
-    op_copy->iov.push_back(op_copy->buf, op->len);
-    op_copy->callback = [](cluster_op_t* op_copy)
-    {
-        if (op_copy->orig_op)
-        {
-            // Acknowledge write and forget the original pointer
-            op_copy->orig_op->retval = op_copy->retval;
-            std::function<void(cluster_op_t*)>(op_copy->orig_op->callback)(op_copy->orig_op);
-            op_copy->orig_op = NULL;
-        }
-    };
-    void *cur_buf = op_copy->buf;
-    for (int i = 0; i < op->iov.count; i++)
-    {
-        memcpy(cur_buf, op->iov.buf[i].iov_base, op->iov.buf[i].iov_len);
-        cur_buf += op->iov.buf[i].iov_len;
-    }
-    return op_copy;
-}
-
-// FIXME Reimplement it using "coroutine emulation"
 void cluster_client_t::continue_rw(cluster_op_t *op)
 {
-    pool_id_t pool_id = INODE_POOL(op->cur_inode);
+    pool_id_t pool_id = INODE_POOL(op->inode);
    if (!pool_id)
    {
        op->retval = -EINVAL;
@ -413,6 +363,40 @@ void cluster_client_t::continue_rw(cluster_op_t *op)
        // Postpone operations to unknown pools
        return;
    }
+    if (op->opcode == OSD_OP_WRITE && !immediate_commit && !op->is_internal)
+    {
+        // Save operation for replay when PG goes out of sync
+        // (primary OSD drops our connection in this case)
+        cluster_op_t *op_copy = new cluster_op_t();
+        op_copy->is_internal = true;
+        op_copy->orig_op = op;
+        op_copy->opcode = op->opcode;
+        op_copy->inode = op->inode;
+        op_copy->offset = op->offset;
+        op_copy->len = op->len;
+        op_copy->buf = malloc_or_die(op->len);
+        op_copy->iov.push_back(op_copy->buf, op->len);
+        op_copy->callback = [](cluster_op_t* op_copy)
+        {
+            if (op_copy->orig_op)
+            {
+                // Acknowledge write and forget the original pointer
+                op_copy->orig_op->retval = op_copy->retval;
+                std::function<void(cluster_op_t*)>(op_copy->orig_op->callback)(op_copy->orig_op);
+                op_copy->orig_op = NULL;
+            }
+        };
+        void *cur_buf = op_copy->buf;
+        for (int i = 0; i < op->iov.count; i++)
+        {
+            memcpy(cur_buf, op->iov.buf[i].iov_base, op->iov.buf[i].iov_len);
+            cur_buf += op->iov.buf[i].iov_len;
+        }
+        unsynced_writes.push_back(op_copy);
+        cur_ops.erase(op);
+        cur_ops.insert(op_copy);
+        op = op_copy;
+    }
    if (!op->parts.size())
    {
        // Slice the operation into parts
@ -421,11 +405,11 @@ void cluster_client_t::continue_rw(cluster_op_t *op)
    if (!op->needs_reslice)
    {
        // Send unsent parts, if they're not subject to change
-        for (int i = 0; i < op->parts.size(); i++)
+        for (auto & op_part: op->parts)
        {
-            if (!op->parts[i].sent && !op->parts[i].done)
+            if (!op_part.sent && !op_part.done)
            {
-                try_send(op, i);
+                try_send(op, &op_part);
            }
        }
    }
@ -436,37 +420,15 @@ void cluster_client_t::continue_rw(cluster_op_t *op)
            // Finished successfully
            // Even if the PG count has changed in meanwhile we treat it as success
            // because if some operations were invalid for the new PG count we'd get errors
-            bool is_read = op->opcode == OSD_OP_READ;
-            if (is_read)
-            {
-                // Check parent inode
-                auto ino_it = st_cli.inode_config.find(op->cur_inode);
-                if (ino_it != st_cli.inode_config.end() &&
-                    ino_it->second.parent_id)
-                {
-                    // Continue reading from the parent inode
-                    // FIXME: This obviously requires optimizations for long snapshot chains
-                    op->cur_inode = ino_it->second.parent_id;
-                    op->parts.clear();
-                    op->done_count = 0;
-                    op->needs_reslice = true;
-                    continue_rw(op);
-                    return;
-                }
-            }
            cur_ops.erase(op);
            op->retval = op->len;
            std::function<void(cluster_op_t*)>(op->callback)(op);
-            if (!is_read)
-            {
-                continue_sync();
-            }
+            continue_sync();
            return;
        }
        else if (op->retval != 0 && op->retval != -EPIPE)
        {
            // Fatal error (not -EPIPE)
-            bool is_read = op->opcode == OSD_OP_READ;
            cur_ops.erase(op);
            if (!immediate_commit && op->opcode == OSD_OP_WRITE)
            {
@ -483,12 +445,11 @@ void cluster_client_t::continue_rw(cluster_op_t *op)
            std::function<void(cluster_op_t*)>(op->callback)(op);
            if (del)
            {
+                if (op->buf)
+                    free(op->buf);
                delete op;
            }
-            if (!is_read)
-            {
-                continue_sync();
-            }
+            continue_sync();
            return;
        }
        else
@ -506,145 +467,60 @@ void cluster_client_t::continue_rw(cluster_op_t *op)
    }
 }

-static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t &iov_pos, osd_op_buf_list_t &iov, void *scrap, int scrap_len)
-{
-    int left = size;
-    while (left > 0 && iov_idx < op->iov.count)
-    {
-        int cur_left = op->iov.buf[iov_idx].iov_len - iov_pos;
-        if (cur_left < left)
-        {
-            if (!skip)
-            {
-                iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, cur_left);
-            }
-            left -= cur_left;
-            iov_pos = 0;
-            iov_idx++;
-        }
-        else
-        {
-            if (!skip)
-            {
-                iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
-            }
-            iov_pos += left;
-            left = 0;
-        }
-    }
-    assert(left == 0);
-    if (skip && scrap_len > 0)
-    {
-        // All skipped ranges are read into the same useless buffer
-        left = size;
-        while (left > 0)
-        {
-            int cur_left = scrap_len < left ? scrap_len : left;
-            iov.push_back(scrap, cur_left);
-            left -= cur_left;
-        }
-    }
-}
-
 void cluster_client_t::slice_rw(cluster_op_t *op)
 {
    // Slice the request into individual object stripe requests
    // Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
-    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->cur_inode)];
-    uint32_t pg_data_size = (
-        pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
+    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
+    uint64_t pg_block_size = bs_block_size * (
+        pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize
    );
-    uint64_t pg_block_size = bs_block_size * pg_data_size;
    uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
    uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
    op->retval = 0;
    op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
-    if (op->opcode == OSD_OP_READ)
-    {
-        // Allocate memory for the bitmap
-        unsigned object_bitmap_size = ((op->len / bs_bitmap_granularity + 7) / 8);
-        object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
-        unsigned bitmap_mem = object_bitmap_size + (bs_bitmap_size * pg_data_size) * op->parts.size();
-        if (op->bitmap_buf_size < bitmap_mem)
-        {
-            op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
-            if (!op->bitmap_buf_size)
-            {
-                // First allocation
-                memset(op->bitmap_buf, 0, object_bitmap_size);
-            }
-            op->part_bitmaps = op->bitmap_buf + object_bitmap_size;
-            op->bitmap_buf_size = bitmap_mem;
-        }
-    }
    int iov_idx = 0;
    size_t iov_pos = 0;
    int i = 0;
    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
    {
-        pg_num_t pg_num = (op->cur_inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
+        pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
        uint64_t begin = (op->offset < stripe ? stripe : op->offset);
        uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
            ? (stripe + pg_block_size) : (op->offset + op->len);
-        op->parts[i].iov.reset();
-        if (op->cur_inode != op->inode)
+        op->parts[i] = {
+            .parent = op,
+            .offset = begin,
+            .len = (uint32_t)(end - begin),
+            .pg_num = pg_num,
+            .sent = false,
+            .done = false,
+        };
+        int left = end-begin;
+        while (left > 0 && iov_idx < op->iov.count)
        {
-            // Read remaining parts from upper layers
-            uint64_t prev = begin, cur = begin;
-            bool skip_prev = true;
-            while (cur < end)
+            if (op->iov.buf[iov_idx].iov_len - iov_pos < left)
            {
-                unsigned bmp_loc = (cur - op->offset)/bs_bitmap_granularity;
-                bool skip = (((*(uint8_t*)(op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
-                if (skip_prev != skip)
-                {
-                    if (cur > prev)
-                    {
-                        if (prev == begin && skip_prev)
-                        {
-                            begin = cur;
-                            // Just advance iov_idx & iov_pos
-                            add_iov(cur-prev, true, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
-                        }
-                        else
-                            add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
-                    }
-                    skip_prev = skip;
-                    prev = cur;
-                }
-                cur += bs_bitmap_granularity;
-            }
-            assert(cur > prev);
-            if (skip_prev)
-            {
-                // Just advance iov_idx & iov_pos
-                add_iov(end-prev, true, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
-                end = prev;
+                op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, op->iov.buf[iov_idx].iov_len - iov_pos);
+                left -= (op->iov.buf[iov_idx].iov_len - iov_pos);
+                iov_pos = 0;
+                iov_idx++;
            }
            else
-                add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
-            if (end == begin)
-                op->done_count++;
+            {
+                op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
+                iov_pos += left;
+                left = 0;
+            }
        }
-        else
-        {
-            add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
-        }
-        op->parts[i].parent = op;
-        op->parts[i].offset = begin;
-        op->parts[i].len = (uint32_t)(end - begin);
-        op->parts[i].pg_num = pg_num;
-        op->parts[i].osd_num = 0;
-        op->parts[i].sent = end <= begin;
-        op->parts[i].done = end <= begin;
+        assert(left == 0);
        i++;
    }
 }

-bool cluster_client_t::try_send(cluster_op_t *op, int i)
+bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
 {
-    cluster_op_part_t *part = &op->parts[i];
-    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->cur_inode)];
+    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
    auto pg_it = pool_cfg.pg_config.find(part->pg_num);
    if (pg_it != pool_cfg.pg_config.end() &&
        !pg_it->second.pause && pg_it->second.cur_primary)
@ -657,10 +533,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
            part->osd_num = primary_osd;
            part->sent = true;
            op->sent_count++;
-            uint64_t pg_bitmap_size = bs_bitmap_size * (
-                pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
-            );
-            part->op = (osd_op_t){
+            part->op = {
                .op_type = OSD_OP_OUT,
                .peer_fd = peer_fd,
                .req = { .rw = {
@ -669,12 +542,10 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
                        .id = op_id++,
                        .opcode = op->opcode,
                    },
-                    .inode = op->cur_inode,
+                    .inode = op->inode,
                    .offset = part->offset,
                    .len = part->len,
                } },
-                .bitmap = op->opcode == OSD_OP_WRITE ? NULL : op->part_bitmaps + pg_bitmap_size*i,
-                .bitmap_len = (unsigned)(op->opcode == OSD_OP_WRITE ? 0 : pg_bitmap_size),
                .callback = [this, part](osd_op_t *op_part)
                {
                    handle_op_part(part);
@ -800,6 +671,8 @@ void cluster_client_t::finish_sync()
            assert(op->sent_count == 0);
            if (op->is_internal)
            {
+                if (op->buf)
+                    free(op->buf);
                delete op;
            }
        }
@ -821,7 +694,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
    assert(peer_it != msgr.osd_peer_fds.end());
    part->sent = true;
    op->sent_count++;
-    part->op = (osd_op_t){
+    part->op = {
        .op_type = OSD_OP_OUT,
        .peer_fd = peer_it->second,
        .req = {
@ -839,16 +712,6 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
    msgr.outbox_push(&part->op);
 }

-static inline void mem_or(void *res, const void *r2, unsigned int len)
-{
-    unsigned int i;
-    for (i = 0; i < len; ++i)
-    {
-        // Hope the compiler vectorizes this
-        ((uint8_t*)res)[i] = ((uint8_t*)res)[i] | ((uint8_t*)r2)[i];
-    }
-}
-
 void cluster_client_t::handle_op_part(cluster_op_part_t *part)
 {
    cluster_op_t *op = part->parent;
@ -886,35 +749,6 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
        // OK
        part->done = true;
        op->done_count++;
-        if (op->opcode == OSD_OP_READ)
-        {
-            // Copy (OR) bitmap
-            auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->cur_inode)];
-            uint32_t pg_block_size = bs_block_size * (
-                pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
-            );
-            uint32_t object_offset = (part->op.req.rw.offset - op->offset) / bs_bitmap_granularity;
-            uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / bs_bitmap_granularity;
-            uint32_t part_len = part->op.req.rw.len / bs_bitmap_granularity;
-            if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
-            {
-                // Copy bytes
-                mem_or(op->bitmap_buf + object_offset/8, part->op.bitmap + part_offset/8, part_len/8);
-                object_offset += (part_len & ~0x7);
-                part_offset += (part_len & ~0x7);
-                part_len = (part_len & 0x7);
-            }
-            while (part_len > 0)
-            {
-                // Copy bits
-                (*(uint8_t*)(op->bitmap_buf + (object_offset >> 3))) |= (
-                    (((*(uint8_t*)(part->op.bitmap + (part_offset >> 3))) >> (part_offset & 0x7)) & 0x1) << (object_offset & 0x7)
-                );
-                part_offset++;
-                object_offset++;
-                part_len--;
-            }
-        }
    }
    if (op->sent_count == 0)
    {
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

@ -8,6 +8,8 @@

 #define MIN_BLOCK_SIZE 4*1024
 #define MAX_BLOCK_SIZE 128*1024*1024
+#define DEFAULT_DISK_ALIGNMENT 4096
+#define DEFAULT_BITMAP_GRANULARITY 4096
 #define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024

 struct cluster_op_t;
@ -34,9 +36,7 @@ struct cluster_op_t
    int retval;
    osd_op_buf_list_t iov;
    std::function<void(cluster_op_t*)> callback;
-    ~cluster_op_t();
 protected:
-    uint64_t cur_inode; // for snapshot reads
    void *buf = NULL;
    cluster_op_t *orig_op = NULL;
    bool is_internal = false;
@ -44,8 +44,6 @@ protected:
    bool up_wait = false;
    int sent_count = 0, done_count = 0;
    std::vector<cluster_op_part_t> parts;
-    void *bitmap_buf = NULL, *part_bitmaps = NULL;
-    unsigned bitmap_buf_size = 0;
    friend class cluster_client_t;
 };

@ -55,7 +53,8 @@ class cluster_client_t
    ring_loop_t *ringloop;

    uint64_t bs_block_size = 0;
-    uint32_t bs_bitmap_granularity = 0, bs_bitmap_size = 0;
+    uint64_t bs_disk_alignment = 0;
+    uint64_t bs_bitmap_granularity = 0;
    std::map<pool_id_t, uint64_t> pg_counts;
    bool immediate_commit = false;
    // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
@ -76,8 +75,6 @@ class cluster_client_t
    std::vector<cluster_op_t*> next_writes;
    std::vector<cluster_op_t*> offline_ops;
    uint64_t queued_bytes = 0;
-    void *scrap_buffer = NULL;
-    unsigned scrap_buffer_size = 0;

    bool pgs_loaded = false;
    std::vector<std::function<void(void)>> on_ready_hooks;
@ -85,13 +82,12 @@ class cluster_client_t
 public:
    etcd_state_client_t st_cli;
    osd_messenger_t msgr;
-    json11::Json config;

    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
    ~cluster_client_t();
    void execute(cluster_op_t *op);
-    bool is_ready();
    void on_ready(std::function<void(void)> fn);
+    void stop();

 protected:
    void continue_ops(bool up_retry = false);
@ -99,10 +95,9 @@ protected:
    void on_load_pgs_hook(bool success);
    void on_change_hook(json11::Json::object & changes);
    void on_change_osd_state_hook(uint64_t peer_osd);
-    cluster_op_t *copy_write(cluster_op_t *op);
    void continue_rw(cluster_op_t *op);
    void slice_rw(cluster_op_t *op);
-    bool try_send(cluster_op_t *op, int i);
+    bool try_send(cluster_op_t *op, cluster_op_part_t *part);
    void execute_sync(cluster_op_t *op);
    void continue_sync();
    void finish_sync();
--- a/copy-fio-includes.sh
+++ b/copy-fio-includes.sh
@ -1,13 +0,0 @@
-#!/bin/bash
-
-gcc -I. -E -o fio_headers.i src/fio_headers.h
-
-rm -rf fio-copy
-for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do
-    j=${i##fio/}
-    p=$(dirname $j)
-    mkdir -p fio-copy/$p
-    cp $i fio-copy/$j
-done
-
-rm fio_headers.i
--- a/copy-qemu-includes.sh
+++ b/copy-qemu-includes.sh
@ -1,18 +0,0 @@
-#!/bin/bash
-
-#cd qemu
-#debian/rules b/configure-stamp
-#cd b/qemu; make qapi
-
-gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-    -I qemu/include -E -o qemu_driver.i src/qemu_driver.c
-
-rm -rf qemu-copy
-for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
-    j=${i##qemu/}
-    p=$(dirname $j)
-    mkdir -p qemu-copy/$p
-    cp $i qemu-copy/$j
-done
-
-rm qemu_driver.i
--- a/src/crc32c.c
+++ b/src/crc32c.c
--- a/src/crc32c.h
+++ b/src/crc32c.h
@ -8,10 +8,4 @@
 // unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)
 // unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)

-#ifdef __cplusplus
-extern "C" {
-#endif
 uint32_t crc32c(uint32_t crc, const void *buf, size_t len);
-#ifdef __cplusplus
-};
-#endif
--- a/debian/build-vitastor-bullseye.sh
+++ b/debian/build-vitastor-bullseye.sh
@ -1,7 +0,0 @@
-#!/bin/bash
-
-sed 's/$REL/bullseye/g' < vitastor.Dockerfile > ../Dockerfile
-cd ..
-mkdir -p packages
-sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
-rm Dockerfile
--- a/debian/build-vitastor-buster.sh
+++ b/debian/build-vitastor-buster.sh
@ -1,7 +0,0 @@
-#!/bin/bash
-
-sed 's/$REL/buster/g' < vitastor.Dockerfile > ../Dockerfile
-cd ..
-mkdir -p packages
-sudo podman build -v `pwd`/packages:/root/packages -f Dockerfile .
-rm Dockerfile
--- a/debian/changelog
+++ b/debian/changelog
@ -1,17 +0,0 @@
-vitastor (0.5.10-1) unstable; urgency=medium
-
-  * Bugfixes
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Tue, 02 Feb 2021 23:01:24 +0300
-
-vitastor (0.5.1-1) unstable; urgency=medium
-
-  * Add jerasure support
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sat, 05 Dec 2020 17:02:26 +0300
-
-vitastor (0.5-1) unstable; urgency=medium
-
-  * First packaging for Debian
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Thu, 05 Nov 2020 02:20:59 +0300
--- a/debian/compat
+++ b/debian/compat
@ -1 +0,0 @@
-13
--- a/debian/control
+++ b/debian/control
@ -1,17 +0,0 @@
-Source: vitastor
-Section: admin
-Priority: optional
-Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev
-Standards-Version: 4.5.0
-Homepage: https://vitastor.io/
-Rules-Requires-Root: no
-
-Package: vitastor
-Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 10), node-sprintf-js, node-ws (>= 7), libjerasure2, lp-solve
-Description: Vitastor, a fast software-defined clustered block storage
- Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
- architecturally similar to Ceph which means strong consistency, primary-replication,
- symmetric clustering and automatic data distribution over any number of drives of any
- size with configurable redundancy (replication or erasure codes/XOR).
--- a/debian/copyright
+++ b/debian/copyright
@ -1,21 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: vitastor
-Upstream-Contact: Vitaliy Filippov <vitalif@yourcmc.ru>
-Source: https://vitastor.io
-
-Files: *
-Copyright: 2019+ Vitaliy Filippov <vitalif@yourcmc.ru>
-License: Multiple licenses VNPL-1.1 and/or GPL-2.0+
- All server-side code (OSD, Monitor and so on) is licensed under the terms of
- Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
- GNU GPLv3.0 with the additional "Network Interaction" clause which requires
- opensourcing all programs directly or indirectly interacting with Vitastor
- through a computer network and expressly designed to be used in conjunction
- with it ("Proxy Programs"). Proxy Programs may be made public not only under
- the terms of the same license, but also under the terms of any GPL-Compatible
- Free Software License, as listed by the Free Software Foundation.
- This is a stricter copyleft license than the Affero GPL.
- .
- Client libraries (cluster_client and so on) are dual-licensed under the same
- VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
- software like QEMU and fio.
--- a/debian/install
+++ b/debian/install
@ -1,3 +0,0 @@
-VNPL-1.1.txt usr/share/doc/vitastor
-GPL-2.0.txt usr/share/doc/vitastor
-mon usr/lib/vitastor
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@ -1,44 +0,0 @@
-# Build patched QEMU for Debian Buster or Bullseye/Sid inside a container
-# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/patched-qemu.Dockerfile .
-
-FROM debian:$REL
-
-WORKDIR /root
-
-RUN if [ "$REL" = "buster" ]; then \
-        echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
-        echo >> /etc/apt/preferences; \
-        echo 'Package: *' >> /etc/apt/preferences; \
-        echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
-        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
-    fi; \
-    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
-    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
-    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
-
-RUN apt-get update
-RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
-RUN apt-get -y build-dep qemu
-RUN apt-get -y build-dep fio
-RUN apt-get --download-only source qemu
-RUN apt-get --download-only source fio
-
-ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
-RUN set -e; \
-    mkdir -p /root/packages/qemu-$REL; \
-    rm -rf /root/packages/qemu-$REL/*; \
-    cd /root/packages/qemu-$REL; \
-    dpkg-source -x /root/qemu*.dsc; \
-    if [ -d /root/packages/qemu-$REL/qemu-5.0 ]; then \
-        cp /root/vitastor/qemu-5.0-vitastor.patch /root/packages/qemu-$REL/qemu-5.0/debian/patches; \
-        echo qemu-5.0-vitastor.patch >> /root/packages/qemu-$REL/qemu-5.0/debian/patches/series; \
-    else \
-        cp /root/vitastor/qemu-5.1-vitastor.patch /root/packages/qemu-$REL/qemu-*/debian/patches; \
-        P=`ls -d /root/packages/qemu-$REL/qemu-*/debian/patches`; \
-        echo qemu-5.1-vitastor.patch >> $P/series; \
-    fi; \
-    cd /root/packages/qemu-$REL/qemu-*/; \
-    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
-    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
-    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
-    rm -rf /root/packages/qemu-$REL/qemu-*/
--- a/debian/rules
+++ b/debian/rules
@ -1,9 +0,0 @@
-#!/usr/bin/make -f
-export DH_VERBOSE = 1
-
-%:
-	dh $@
-
-override_dh_installdeb:
-	cat debian/substvars >> debian/vitastor.substvars
-	dh_installdeb
--- a/debian/source/format
+++ b/debian/source/format
@ -1 +0,0 @@
-3.0 (quilt)
--- a/debian/substvars
+++ b/debian/substvars
@ -1,2 +0,0 @@
-dep:fio=3.16-1
-dep:qemu=1:5.1+dfsg-4+vitastor1
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@ -1,67 +0,0 @@
-# Build Vitastor packages for Debian Buster or Bullseye/Sid inside a container
-# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
-
-FROM debian:$REL
-
-WORKDIR /root
-
-RUN if [ "$REL" = "buster" ]; then \
-        echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
-        echo >> /etc/apt/preferences; \
-        echo 'Package: *' >> /etc/apt/preferences; \
-        echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
-        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
-    fi; \
-    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
-    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
-    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
-
-RUN apt-get update
-RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
-RUN apt-get -y build-dep qemu
-RUN apt-get -y build-dep fio
-RUN apt-get --download-only source qemu
-RUN apt-get --download-only source fio
-RUN apt-get -y install libjerasure-dev cmake
-
-ADD . /root/vitastor
-RUN set -e -x; \
-    mkdir -p /root/fio-build/; \
-    cd /root/fio-build/; \
-    rm -rf /root/fio-build/*; \
-    dpkg-source -x /root/fio*.dsc; \
-    cd /root/packages/qemu-$REL/; \
-    rm -rf qemu*/; \
-    dpkg-source -x qemu*.dsc; \
-    cd /root/packages/qemu-$REL/qemu*/; \
-    debian/rules b/configure-stamp; \
-    cd b/qemu; \
-    make -j8 qapi/qapi-builtin-types.h; \
-    mkdir -p /root/packages/vitastor-$REL; \
-    rm -rf /root/packages/vitastor-$REL/*; \
-    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.5.10; \
-    ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.5.10/qemu; \
-    ln -s /root/fio-build/fio-*/ vitastor-0.5.10/fio; \
-    cd vitastor-0.5.10; \
-    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
-    QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
-    sh copy-qemu-includes.sh; \
-    sh copy-fio-includes.sh; \
-    rm qemu fio; \
-    mkdir -p a b debian/patches; \
-    mv qemu-copy b/qemu; \
-    mv fio-copy b/fio; \
-    diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
-    echo qemu-fio-headers.patch >> debian/patches/series; \
-    rm -rf a b; \
-    rm -rf /root/packages/qemu-$REL/qemu*/; \
-    echo "dep:fio=$FIO" > debian/substvars; \
-    echo "dep:qemu=$QEMU" >> debian/substvars; \
-    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.10.orig.tar.xz vitastor-0.5.10; \
-    cd vitastor-0.5.10; \
-    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
-    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
-    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
-    rm -rf /root/packages/vitastor-$REL/vitastor-*/
--- a/src/dump_journal.cpp
+++ b/src/dump_journal.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #define _LARGEFILE64_SOURCE
 #include <sys/types.h>
@ -26,32 +26,23 @@ struct journal_dump_t
    uint64_t journal_offset;
    uint64_t journal_len;
    uint64_t journal_pos;
-    bool all;
-    bool started;
    int fd;
-    uint32_t crc32_last;

-    int dump_block(void *buf);
+    void dump_block(void *buf);
 };

 int main(int argc, char *argv[])
 {
-    journal_dump_t self = { 0 };
-    int b = 1;
-    if (argc >= 2 && !strcmp(argv[1], "--all"))
+    if (argc < 5)
    {
-        self.all = true;
-        b = 2;
-    }
-    if (argc < b+4)
-    {
-        printf("USAGE: %s [--all] <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
+        printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
        return 1;
    }
-    self.journal_device = argv[b];
-    self.journal_block = strtoul(argv[b+1], NULL, 10);
-    self.journal_offset = strtoull(argv[b+2], NULL, 10);
-    self.journal_len = strtoull(argv[b+3], NULL, 10);
+    journal_dump_t self;
+    self.journal_device = argv[1];
+    self.journal_block = strtoul(argv[2], NULL, 10);
+    self.journal_offset = strtoull(argv[3], NULL, 10);
+    self.journal_len = strtoull(argv[4], NULL, 10);
    if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
        self.journal_block > 128*1024)
    {
@ -66,64 +57,30 @@ int main(int argc, char *argv[])
    }
    void *data = memalign(MEM_ALIGNMENT, self.journal_block);
    self.journal_pos = 0;
-    if (self.all)
-    {
-        while (self.journal_pos < self.journal_len)
-        {
-            int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
-            assert(r == self.journal_block);
-            uint64_t s;
-            for (s = 0; s < self.journal_block; s += 8)
-            {
-                if (*((uint64_t*)(data+s)) != 0)
-                    break;
-            }
-            if (s == self.journal_block)
-            {
-                printf("offset %08lx: zeroes\n", self.journal_pos);
-                self.journal_pos += self.journal_block;
-            }
-            else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
-            {
-                printf("offset %08lx:\n", self.journal_pos);
-                self.dump_block(data);
-            }
-            else
-            {
-                printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
-                self.journal_pos += self.journal_block;
-            }
-        }
-    }
-    else
+    while (self.journal_pos < self.journal_len)
    {
        int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
        assert(r == self.journal_block);
-        journal_entry *je = (journal_entry*)(data);
-        if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
+        uint64_t s;
+        for (s = 0; s < self.journal_block; s += 8)
        {
-            printf("offset %08lx: journal superblock is invalid\n", self.journal_pos);
+            if (*((uint64_t*)(data+s)) != 0)
+                break;
        }
-        else
+        if (s == self.journal_block)
+        {
+            printf("offset %08lx: zeroes\n", self.journal_pos);
+            self.journal_pos += self.journal_block;
+        }
+        else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
        {
            printf("offset %08lx:\n", self.journal_pos);
            self.dump_block(data);
-            self.started = false;
-            self.journal_pos = je->start.journal_start;
-            while (1)
-            {
-                if (self.journal_pos >= self.journal_len)
-                    self.journal_pos = self.journal_block;
-                r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
-                assert(r == self.journal_block);
-                printf("offset %08lx:\n", self.journal_pos);
-                r = self.dump_block(data);
-                if (r <= 0)
-                {
-                    printf("end of the journal\n");
-                    break;
-                }
-            }
+        }
+        else
+        {
+            printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
+            self.journal_pos += self.journal_block;
        }
    }
    free(data);
@ -131,7 +88,7 @@ int main(int argc, char *argv[])
    return 0;
 }

-int journal_dump_t::dump_block(void *buf)
+void journal_dump_t::dump_block(void *buf)
 {
    uint32_t pos = 0;
    journal_pos += journal_block;
@ -140,19 +97,12 @@ int journal_dump_t::dump_block(void *buf)
    while (pos < journal_block)
    {
        journal_entry *je = (journal_entry*)(buf + pos);
-        if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
-            !all && started && je->crc32_prev != crc32_last)
+        if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX)
        {
            break;
        }
-        bool crc32_valid = je_crc32(je) == je->crc32;
-        if (!all && !crc32_valid)
-        {
-            break;
-        }
-        started = true;
-        crc32_last = je->crc32;
-        printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, (crc32_valid ? "(valid)" : "(invalid)"), je->crc32_prev);
+        const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
+        printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
        if (je->type == JE_START)
        {
            printf("je_start start=%08lx\n", je->start.journal_start);
@ -220,5 +170,4 @@ int journal_dump_t::dump_block(void *buf)
    {
        journal_pos = journal_len;
    }
-    return entry;
 }
--- a/src/epoll_manager.cpp
+++ b/src/epoll_manager.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #include <sys/epoll.h>
 #include <sys/poll.h>
@ -84,12 +84,8 @@ void epoll_manager_t::handle_epoll_events()
        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
        for (int i = 0; i < nfds; i++)
        {
-            auto cb_it = epoll_handlers.find(events[i].data.fd);
-            if (cb_it != epoll_handlers.end())
-            {
-                auto & cb = cb_it->second;
-                cb(events[i].data.fd, events[i].events);
-            }
+            auto cb_it = epoll_handlers.find(events[i].data.fd); // lookup only: operator[] would insert an empty handler for an fd that has no callback registered
+            if (cb_it != epoll_handlers.end()) cb_it->second(events[i].data.fd, events[i].events); // skip events for fds without a handler
        }
    } while (nfds == MAX_EPOLL_EVENTS);
 }
--- a/src/epoll_manager.h
+++ b/src/epoll_manager.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

--- a/src/etcd_state_client.cpp
+++ b/src/etcd_state_client.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #include "osd_ops.h"
 #include "pg_states.h"
@ -7,21 +7,6 @@
 #include "http_client.h"
 #include "base64.h"

-etcd_state_client_t::~etcd_state_client_t()
-{
-    for (auto watch: watches)
-    {
-        delete watch;
-    }
-    watches.clear();
-    etcd_watches_initialised = -1;
-    if (etcd_watch_ws)
-    {
-        etcd_watch_ws->close();
-        etcd_watch_ws = NULL;
-    }
-}
-
 json_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
 {
    json_kv_t kv;
@ -61,23 +46,6 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
    http_request_json(tfd, etcd_address, req, timeout, callback);
 }

-void etcd_state_client_t::add_etcd_url(std::string addr)
-{
-    if (addr.length() > 0)
-    {
-        if (strtolower(addr.substr(0, 7)) == "http://")
-            addr = addr.substr(7);
-        else if (strtolower(addr.substr(0, 8)) == "https://")
-        {
-            printf("HTTPS is unsupported for etcd. Either use plain HTTP or setup a local proxy for etcd interaction\n");
-            exit(1);
-        }
-        if (addr.find('/') < 0)
-            addr += "/v3";
-        this->etcd_addresses.push_back(addr);
-    }
-}
-
 void etcd_state_client_t::parse_config(json11::Json & config)
 {
    this->etcd_addresses.clear();
@ -87,7 +55,13 @@ void etcd_state_client_t::parse_config(json11::Json & config)
        while (1)
        {
            int pos = ea.find(',');
-            add_etcd_url(pos >= 0 ? ea.substr(0, pos) : ea);
+            std::string addr = pos >= 0 ? ea.substr(0, pos) : ea;
+            if (addr.length() > 0)
+            {
+                if (addr.find('/') == std::string::npos) // find() returns unsigned npos when '/' is absent, so "< 0" was never true
+                    addr += "/v3"; // address has no path part: append the default "/v3" suffix
+                this->etcd_addresses.push_back(addr);
+            }
            if (pos >= 0)
                ea = ea.substr(pos+1);
            else
@ -98,7 +72,13 @@ void etcd_state_client_t::parse_config(json11::Json & config)
    {
        for (auto & ea: config["etcd_address"].array_items())
        {
-            add_etcd_url(ea.string_value());
+            std::string addr = ea.string_value();
+            if (addr != "")
+            {
+                if (addr.find('/') == std::string::npos) // find() returns unsigned npos when '/' is absent, so "< 0" was never true
+                    addr += "/v3"; // address has no path part: append the default "/v3" suffix
+                this->etcd_addresses.push_back(addr);
+            }
        }
    }
    this->etcd_prefix = config["etcd_prefix"].string_value();
@ -180,7 +160,7 @@ void etcd_state_client_t::start_etcd_watcher()
                    start_etcd_watcher();
                });
            }
-            else if (etcd_watches_initialised > 0)
+            else
            {
                // Connection was live, retry immediately
                start_etcd_watcher();
@ -193,7 +173,6 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/config0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_CONFIG_WATCH_ID },
-            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@ -202,7 +181,6 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_OSD_STATE_WATCH_ID },
-            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@ -211,7 +189,6 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_PG_STATE_WATCH_ID },
-            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@ -220,7 +197,6 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_PG_HISTORY_WATCH_ID },
-            { "progress_notify", true },
        } }
    }).dump());
 }
@ -271,12 +247,6 @@ void etcd_state_client_t::load_pgs()
                { "key", base64_encode(etcd_prefix+"/config/pgs") },
            } }
        },
-        json11::Json::object {
-            { "request_range", json11::Json::object {
-                { "key", base64_encode(etcd_prefix+"/config/inode/") },
-                { "range_end", base64_encode(etcd_prefix+"/config/inode0") },
-            } }
-        },
        json11::Json::object {
            { "request_range", json11::Json::object {
                { "key", base64_encode(etcd_prefix+"/pg/history/") },
@ -345,99 +315,67 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
        }
        for (auto & pool_item: value.object_items())
        {
-            pool_config_t pc;
-            // ID
            pool_id_t pool_id = stoull_full(pool_item.first);
            if (!pool_id || pool_id >= POOL_ID_MAX)
            {
                printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
                continue;
            }
-            pc.id = pool_id;
-            // Pool Name
-            pc.name = pool_item.second["name"].string_value();
-            if (pc.name == "")
-            {
-                printf("Pool %u has empty name, skipping pool\n", pool_id);
-                continue;
-            }
-            // Failure Domain
-            pc.failure_domain = pool_item.second["failure_domain"].string_value();
-            // Coding Scheme
-            if (pool_item.second["scheme"] == "replicated")
-                pc.scheme = POOL_SCHEME_REPLICATED;
-            else if (pool_item.second["scheme"] == "xor")
-                pc.scheme = POOL_SCHEME_XOR;
-            else if (pool_item.second["scheme"] == "jerasure")
-                pc.scheme = POOL_SCHEME_JERASURE;
-            else
-            {
-                printf("Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id);
-                continue;
-            }
-            // PG Size
-            pc.pg_size = pool_item.second["pg_size"].uint64_value();
-            if (pc.pg_size < 1 ||
-                pool_item.second["pg_size"].uint64_value() < 3 &&
-                (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) ||
-                pool_item.second["pg_size"].uint64_value() > 256)
+            if (pool_item.second["pg_size"].uint64_value() < 1 ||
+                pool_item.second["scheme"] == "xor" && pool_item.second["pg_size"].uint64_value() < 3)
            {
                printf("Pool %u has invalid pg_size, skipping pool\n", pool_id);
                continue;
            }
-            // Parity Chunks
-            pc.parity_chunks = pool_item.second["parity_chunks"].uint64_value();
-            if (pc.scheme == POOL_SCHEME_XOR)
-            {
-                if (pc.parity_chunks > 1)
-                {
-                    printf("Pool %u has invalid parity_chunks (must be 1), skipping pool\n", pool_id);
-                    continue;
-                }
-                pc.parity_chunks = 1;
-            }
-            if (pc.scheme == POOL_SCHEME_JERASURE &&
-                (pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
-            {
-                printf("Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
-                continue;
-            }
-            // PG MinSize
-            pc.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
-            if (pc.pg_minsize < 1 || pc.pg_minsize > pc.pg_size ||
-                (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) &&
-                pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
+            if (pool_item.second["pg_minsize"].uint64_value() < 1 ||
+                pool_item.second["pg_minsize"].uint64_value() > pool_item.second["pg_size"].uint64_value() ||
+                pool_item.second["scheme"] == "xor" && pool_item.second["pg_minsize"].uint64_value() < (pool_item.second["pg_size"].uint64_value() - 1)) // pg_size-1 lower bound applies to XOR pools only; replicated pools may use any minsize in 1..pg_size
            {
                printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
                continue;
            }
-            // PG Count
-            pc.pg_count = pool_item.second["pg_count"].uint64_value();
-            if (pc.pg_count < 1)
+            if (pool_item.second["pg_count"].uint64_value() < 1)
            {
                printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
                continue;
            }
-            // Max OSD Combinations
-            pc.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
-            if (!pc.max_osd_combinations)
-                pc.max_osd_combinations = 10000;
-            if (pc.max_osd_combinations > 0 && pc.max_osd_combinations < 100)
+            if (pool_item.second["name"].string_value() == "")
+            {
+                printf("Pool %u has empty name, skipping pool\n", pool_id);
+                continue;
+            }
+            if (pool_item.second["scheme"] != "replicated" && pool_item.second["scheme"] != "xor")
+            {
+                printf("Pool %u has invalid coding scheme (only \"xor\" and \"replicated\" are allowed), skipping pool\n", pool_id);
+                continue;
+            }
+            if (pool_item.second["max_osd_combinations"].uint64_value() > 0 &&
+                pool_item.second["max_osd_combinations"].uint64_value() < 100)
            {
                printf("Pool %u has invalid max_osd_combinations (must be at least 100), skipping pool\n", pool_id);
                continue;
            }
-            // PG Stripe Size
-            pc.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
-            uint64_t min_stripe_size = bs_block_size * (pc.scheme == POOL_SCHEME_REPLICATED ? 1 : (pc.pg_size-pc.parity_chunks));
-            if (pc.pg_stripe_size < min_stripe_size)
-                pc.pg_stripe_size = min_stripe_size;
-            // Save
-            pc.real_pg_count = this->pool_config[pool_id].real_pg_count;
-            std::swap(pc.pg_config, this->pool_config[pool_id].pg_config);
-            std::swap(this->pool_config[pool_id], pc);
            auto & parsed_cfg = this->pool_config[pool_id];
            parsed_cfg.exists = true;
+            parsed_cfg.id = pool_id;
+            parsed_cfg.name = pool_item.second["name"].string_value();
+            parsed_cfg.scheme = pool_item.second["scheme"] == "replicated" ? POOL_SCHEME_REPLICATED : POOL_SCHEME_XOR;
+            parsed_cfg.pg_size = pool_item.second["pg_size"].uint64_value();
+            parsed_cfg.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
+            parsed_cfg.pg_count = pool_item.second["pg_count"].uint64_value();
+            parsed_cfg.failure_domain = pool_item.second["failure_domain"].string_value();
+            parsed_cfg.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
+            uint64_t min_stripe_size = bs_block_size *
+                (parsed_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : parsed_cfg.pg_minsize);
+            if (parsed_cfg.pg_stripe_size < min_stripe_size)
+            {
+                parsed_cfg.pg_stripe_size = min_stripe_size;
+            }
+            parsed_cfg.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
+            if (!parsed_cfg.max_osd_combinations)
+            {
+                parsed_cfg.max_osd_combinations = 10000;
+            }
            for (auto & pg_item: parsed_cfg.pg_config)
            {
                if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
@ -623,105 +561,4 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
            }
        }
    }
-    else if (key.substr(0, etcd_prefix.length()+14) == etcd_prefix+"/config/inode/")
-    {
-        // <etcd_prefix>/config/inode/%d/%d
-        uint64_t pool_id = 0;
-        uint64_t inode_num = 0;
-        char null_byte = 0;
-        sscanf(key.c_str() + etcd_prefix.length()+14, "%lu/%lu%c", &pool_id, &inode_num, &null_byte);
-        if (!pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)) || null_byte != 0)
-        {
-            printf("Bad etcd key %s, ignoring\n", key.c_str());
-        }
-        else
-        {
-            inode_num |= (pool_id << (64-POOL_ID_BITS));
-            auto it = this->inode_config.find(inode_num);
-            if (it != this->inode_config.end() && it->second.name != "")
-            {
-                auto n_it = this->inode_by_name.find(it->second.name);
-                if (n_it->second == inode_num)
-                {
-                    this->inode_by_name.erase(n_it);
-                    for (auto w: watches)
-                    {
-                        if (w->name == it->second.name)
-                        {
-                            w->cfg = { 0 };
-                        }
-                    }
-                }
-            }
-            if (!value.is_object())
-            {
-                this->inode_config.erase(inode_num);
-            }
-            else
-            {
-                inode_t parent_inode_num = value["parent_id"].uint64_value();
-                if (parent_inode_num && !(parent_inode_num >> (64-POOL_ID_BITS)))
-                {
-                    uint64_t parent_pool_id = value["parent_pool"].uint64_value();
-                    if (!parent_pool_id)
-                        parent_inode_num |= pool_id << (64-POOL_ID_BITS);
-                    else if (parent_pool_id >= POOL_ID_MAX)
-                    {
-                        printf(
-                            "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
-                            inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
-                        );
-                        parent_inode_num = 0;
-                    }
-                    else
-                        parent_inode_num |= parent_pool_id << (64-POOL_ID_BITS);
-                }
-                inode_config_t cfg = (inode_config_t){
-                    .num = inode_num,
-                    .name = value["name"].string_value(),
-                    .size = value["size"].uint64_value(),
-                    .parent_id = parent_inode_num,
-                    .readonly = value["readonly"].bool_value(),
-                };
-                this->inode_config[inode_num] = cfg;
-                if (cfg.name != "")
-                {
-                    this->inode_by_name[cfg.name] = inode_num;
-                    for (auto w: watches)
-                    {
-                        if (w->name == value["name"].string_value())
-                        {
-                            w->cfg = cfg;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-inode_watch_t* etcd_state_client_t::watch_inode(std::string name)
-{
-    inode_watch_t *watch = new inode_watch_t;
-    watch->name = name;
-    watches.push_back(watch);
-    auto it = inode_by_name.find(name);
-    if (it != inode_by_name.end())
-    {
-        watch->cfg = inode_config[it->second];
-    }
-    return watch;
-}
-
-void etcd_state_client_t::close_watch(inode_watch_t* watch)
-{
-    for (int i = 0; i < watches.size(); i++)
-    {
-        if (watches[i] == watch)
-        {
-            watches.erase(watches.begin()+i, watches.begin()+i+1);
-            break;
-        }
-    }
-    delete watch;
 }
--- a/src/etcd_state_client.h
+++ b/src/etcd_state_client.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

@ -43,7 +43,7 @@ struct pool_config_t
    pool_id_t id;
    std::string name;
    uint64_t scheme;
-    uint64_t pg_size, pg_minsize, parity_chunks;
+    uint64_t pg_size, pg_minsize;
    uint64_t pg_count;
    uint64_t real_pg_count;
    std::string failure_domain;
@ -52,29 +52,8 @@ struct pool_config_t
    std::map<pg_num_t, pg_config_t> pg_config;
 };

-struct inode_config_t
-{
-    uint64_t num;
-    std::string name;
-    uint64_t size;
-    inode_t parent_id;
-    bool readonly;
-};
-
-struct inode_watch_t
-{
-    std::string name;
-    inode_config_t cfg;
-};
-
 struct etcd_state_client_t
 {
-protected:
-    std::vector<inode_watch_t*> watches;
-    websocket_t *etcd_watch_ws = NULL;
-    uint64_t bs_block_size = 0;
-    void add_etcd_url(std::string);
-public:
    std::vector<std::string> etcd_addresses;
    std::string etcd_prefix;
    int log_level = 0;
@ -82,10 +61,10 @@ public:

    int etcd_watches_initialised = 0;
    uint64_t etcd_watch_revision = 0;
+    websocket_t *etcd_watch_ws = NULL;
+    uint64_t bs_block_size = 0;
    std::map<pool_id_t, pool_config_t> pool_config;
    std::map<osd_num_t, json11::Json> peer_states;
-    std::map<inode_t, inode_config_t> inode_config;
-    std::map<std::string, inode_t> inode_by_name;

    std::function<void(json11::Json::object &)> on_change_hook;
    std::function<void(json11::Json::object &)> on_load_config_hook;
@ -102,7 +81,4 @@ public:
    void load_pgs();
    void parse_state(const std::string & key, const json11::Json & value);
    void parse_config(json11::Json & config);
-    inode_watch_t* watch_inode(std::string name);
-    void close_watch(inode_watch_t* watch);
-    ~etcd_state_client_t();
 };
--- a/src/fio_cluster.cpp
+++ b/src/fio_cluster.cpp
@ -1,22 +1,22 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 // FIO engine to test cluster I/O
 //
 // Random write:
 //
 // fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
-//     -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] (-image=testimg | -pool=1 -inode=1 -size=1000M)
+//     -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] -pool=1 -inode=1 -size=1000M
 //
 // Linear write:
 //
 // fio -thread -ioengine=./libfio_cluster.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
-//     -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] -image=testimg
+//     -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] -pool=1 -inode=1 -size=1000M
 //
 // Random read (run with -iodepth=32 or -iodepth=1):
 //
 // fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
-//     -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] -image=testimg
+//     -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] -pool=1 -inode=1 -size=1000M

 #include <sys/types.h>
 #include <sys/socket.h>
@ -35,7 +35,6 @@ struct sec_data
    ring_loop_t *ringloop = NULL;
    epoll_manager_t *epmgr = NULL;
    cluster_client_t *cli = NULL;
-    inode_watch_t *watch = NULL;
    bool last_sync = false;
    /* The list of completed io_u structs. */
    std::vector<io_u*> completed;
@ -48,7 +47,6 @@ struct sec_options
    int __pad;
    char *etcd_host = NULL;
    char *etcd_prefix = NULL;
-    char *image = NULL;
    uint64_t pool = 0;
    uint64_t inode = 0;
    int cluster_log = 0;
@ -66,7 +64,7 @@ static struct fio_option options[] = {
        .group  = FIO_OPT_G_FILENAME,
    },
    {
-        .name   = "etcd_prefix",
+        .name   = "etcd_prefix", // option that fills sec_options::etcd_prefix; matches the -etcd_prefix=... usage in the file header
        .lname  = "etcd key prefix",
        .type   = FIO_OPT_STR_STORE,
        .off1   = offsetof(struct sec_options, etcd_prefix),
@ -74,15 +72,6 @@ static struct fio_option options[] = {
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
-    {
-        .name   = "image",
-        .lname  = "Vitastor image name",
-        .type   = FIO_OPT_STR_STORE,
-        .off1   = offsetof(struct sec_options, image),
-        .help   = "Vitastor image name to run tests on",
-        .category = FIO_OPT_C_ENGINE,
-        .group  = FIO_OPT_G_FILENAME,
-    },
    {
        .name   = "pool",
        .lname  = "pool number for the inode",
@ -97,14 +86,14 @@ static struct fio_option options[] = {
        .lname  = "inode to run tests on",
        .type   = FIO_OPT_INT,
        .off1   = offsetof(struct sec_options, inode),
-        .help   = "inode number to run tests on",
+        .help   = "inode to run tests on (1 by default)",
        .category = FIO_OPT_C_ENGINE,
        .group  = FIO_OPT_G_FILENAME,
    },
    {
        .name   = "cluster_log_level",
        .lname  = "cluster log level",
-        .type   = FIO_OPT_INT,
+        .type   = FIO_OPT_INT, // a log level is a number stored in int cluster_log, not an on/off flag
        .off1   = offsetof(struct sec_options, cluster_log),
        .help   = "Set log level for the Vitastor client",
        .def    = "0",
@ -128,15 +117,8 @@ static struct fio_option options[] = {

 static int sec_setup(struct thread_data *td)
 {
-    sec_options *o = (sec_options*)td->eo;
    sec_data *bsd;

-    if (!o->etcd_host)
-    {
-        td_verror(td, EINVAL, "etcd address is missing");
-        return 1;
-    }
-
    bsd = new sec_data;
    if (!bsd)
    {
@ -152,51 +134,6 @@ static int sec_setup(struct thread_data *td)
        td->o.open_files++;
    }

-    json11::Json cfg = json11::Json::object {
-        { "etcd_address", std::string(o->etcd_host) },
-        { "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/vitastor") },
-        { "log_level", o->cluster_log },
-    };
-
-    if (!o->image)
-    {
-        if (!(o->inode & ((1l << (64-POOL_ID_BITS)) - 1)))
-        {
-            td_verror(td, EINVAL, "inode number is missing");
-            return 1;
-        }
-        if (o->pool)
-        {
-            o->inode = (o->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
-        }
-        if (!(o->inode >> (64-POOL_ID_BITS)))
-        {
-            td_verror(td, EINVAL, "pool is missing");
-            return 1;
-        }
-    }
-    else
-    {
-        o->inode = 0;
-    }
-    bsd->ringloop = new ring_loop_t(512);
-    bsd->epmgr = new epoll_manager_t(bsd->ringloop);
-    bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
-    if (o->image)
-    {
-        while (!bsd->cli->is_ready())
-        {
-            bsd->ringloop->loop();
-            if (bsd->cli->is_ready())
-                break;
-            bsd->ringloop->wait();
-        }
-        bsd->watch = bsd->cli->st_cli.watch_inode(std::string(o->image));
-        td->files[0]->real_file_size = bsd->watch->cfg.size;
-    }
-
-    bsd->trace = o->trace ? true : false;
-
    return 0;
 }

@ -205,20 +142,40 @@ static void sec_cleanup(struct thread_data *td)
    sec_data *bsd = (sec_data*)td->io_ops_data;
    if (bsd)
    {
-        if (bsd->watch)
-        {
-            bsd->cli->st_cli.close_watch(bsd->watch);
-        }
        delete bsd->cli;
        delete bsd->epmgr;
        delete bsd->ringloop;
-        delete bsd;
+        bsd->cli = NULL;
+        bsd->epmgr = NULL;
+        bsd->ringloop = NULL;
    }
 }

 /* Connect to the server from each thread. */
 static int sec_init(struct thread_data *td)
 {
+    sec_options *o = (sec_options*)td->eo;
+    sec_data *bsd = (sec_data*)td->io_ops_data;
+
+    if (o->pool) // an explicit pool option replaces the pool bits of the inode number
+        o->inode = (o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
+    // etcd_host must be checked before cfg is built: std::string(NULL) is undefined behaviour
+    if (!o->etcd_host || !(o->inode >> (64-POOL_ID_BITS)))
+    {
+        td_verror(td, EINVAL, !o->etcd_host ? "etcd address is missing" : "pool is missing");
+        return 1;
+    }
+    json11::Json cfg = json11::Json::object {
+        { "etcd_address", std::string(o->etcd_host) },
+        { "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/vitastor") },
+        { "log_level", o->cluster_log },
+    };
+    bsd->ringloop = new ring_loop_t(512);
+    bsd->epmgr = new epoll_manager_t(bsd->ringloop);
+    bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
+
+    bsd->trace = o->trace ? true : false;
+
    return 0;
 }

@ -238,23 +195,19 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
    io->engine_data = bsd;
    cluster_op_t *op = new cluster_op_t;

-    op->inode = opt->image ? bsd->watch->cfg.num : opt->inode;
    switch (io->ddir)
    {
    case DDIR_READ:
        op->opcode = OSD_OP_READ;
+        op->inode = opt->inode;
        op->offset = io->offset;
        op->len = io->xfer_buflen;
        op->iov.push_back(io->xfer_buf, io->xfer_buflen);
        bsd->last_sync = false;
        break;
    case DDIR_WRITE:
-        if (opt->image && bsd->watch->cfg.readonly)
-        {
-            io->error = EROFS;
-            return FIO_Q_COMPLETED;
-        }
        op->opcode = OSD_OP_WRITE;
+        op->inode = opt->inode;
        op->offset = io->offset;
        op->len = io->xfer_buflen;
        op->iov.push_back(io->xfer_buf, io->xfer_buflen);
--- a/src/fio_engine.cpp
+++ b/src/fio_engine.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 // FIO engine to test Blockstore
 //
--- a/src/fio_headers.h
+++ b/src/fio_headers.h
--- a/src/fio_sec_osd.cpp
+++ b/src/fio_sec_osd.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 // FIO engine to test Blockstore through Secondary OSD interface
 //
@ -140,7 +140,6 @@ static void sec_cleanup(struct thread_data *td)
    if (bsd)
    {
        close(bsd->connect_fd);
-        delete bsd;
    }
 }

@ -313,7 +312,6 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
            exit(1);
        }
        io_u* io = it->second;
-        bsd->queue.erase(it);
        if (io->ddir == DDIR_READ)
        {
            if (reply.hdr.retval != io->xfer_buflen)
--- a/src/http_client.cpp
+++ b/src/http_client.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #include <netinet/tcp.h>
 #include <sys/epoll.h>
@ -22,6 +22,7 @@
 #define READ_BUFFER_SIZE 9000

 static int extract_port(std::string & host);
+static std::string strtolower(const std::string & in);
 static std::string trim(const std::string & in);
 static std::string ws_format_frame(int type, uint64_t size);
 static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
@ -672,7 +673,7 @@ static int extract_port(std::string & host)
    return port;
 }

-std::string strtolower(const std::string & in)
+static std::string strtolower(const std::string & in)
 {
    std::string s = in;
    for (int i = 0; i < s.length(); i++)
--- a/src/http_client.h
+++ b/src/http_client.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once
 #include <string>
@ -49,8 +49,6 @@ std::vector<std::string> getifaddr_list(bool include_v6 = false);

 uint64_t stoull_full(const std::string & str, int base = 10);

-std::string strtolower(const std::string & in);
-
 void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
    const http_options_t & options, std::function<void(const http_response_t *response)> callback);

--- a/lambda_size.cpp
+++ b/lambda_size.cpp
@ -0,0 +1,69 @
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+// Prints the size of several lambdas and logs every heap allocation made
+// through the global operator new while they are assigned to std::function.
+
+#include <iostream>
+#include <functional>
+#include <array>
+#include <string>
+#include <new> // for std::bad_alloc
+#include <cstdlib> // for malloc() and free()
+using namespace std;
+
+// replace operator new and delete to log allocations
+void* operator new(std::size_t n)
+{
+    cout << "Allocating " << n << " bytes" << endl;
+    // A replacement operator new must not return NULL, so a failed malloc()
+    // is reported as std::bad_alloc. malloc(0) may return NULL on success,
+    // hence at least one byte is always requested.
+    void *p = malloc(n ? n : 1);
+    if (!p)
+    {
+        throw std::bad_alloc();
+    }
+    return p;
+}
+
+// noexcept replaces the throw() specification, which newer C++ standards removed
+void operator delete(void* p) noexcept
+{
+    free(p);
+}
+
+class test
+{
+public:
+    std::string s;
+    // Stores into <f> a lambda that prints <str> and this->s when invoked.
+    // The lambda captures `this` and the raw <str> pointer, so both must
+    // stay valid for as long as <f> may be called.
+    void a(std::function<void()> & f, const char *str)
+    {
+        auto l = [this, str]() { cout << str << " ? " << s << " from this\n"; };
+        cout << "Assigning lambda3 of size " << sizeof(l) << endl;
+        f = l;
+    }
+};
+
+int main()
+{
+    // The arrays are value-initialized because they are copied into the closures
+    std::array<char, 16> arr1{};
+    auto lambda1 = [arr1](){};
+    cout << "Assigning lambda1 of size " << sizeof(lambda1) << endl;
+    std::function<void()> f1 = lambda1;
+
+    std::array<char, 17> arr2{};
+    auto lambda2 = [arr2](){};
+    cout << "Assigning lambda2 of size " << sizeof(lambda2) << endl;
+    std::function<void()> f2 = lambda2;
+
+    test t;
+    std::function<void()> f3;
+    t.s = "str";
+    t.a(f3, "huyambda");
+    f3();
+}
--- a/src/malloc_or_die.h
+++ b/src/malloc_or_die.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #include <unistd.h>
 #include <fcntl.h>
@ -26,106 +26,14 @@ osd_op_t::~osd_op_t()
    }
 }

-void osd_messenger_t::init()
-{
-    keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
-    {
-        for (auto cl_it = clients.begin(); cl_it != clients.end();)
-        {
-            auto cl = (cl_it++)->second;
-            if (!cl->osd_num)
-            {
-                // Do not run keepalive on regular clients
-                continue;
-            }
-            if (cl->ping_time_remaining > 0)
-            {
-                cl->ping_time_remaining--;
-                if (!cl->ping_time_remaining)
-                {
-                    // Ping timed out, stop the client
-                    stop_client(cl->peer_fd, true);
-                }
-            }
-            else if (cl->idle_time_remaining > 0)
-            {
-                cl->idle_time_remaining--;
-                if (!cl->idle_time_remaining)
-                {
-                    // Connection is idle for <osd_idle_time>, send ping
-                    osd_op_t *op = new osd_op_t();
-                    op->op_type = OSD_OP_OUT;
-                    op->peer_fd = cl->peer_fd;
-                    op->req = (osd_any_op_t){
-                        .hdr = {
-                            .magic = SECONDARY_OSD_OP_MAGIC,
-                            .id = this->next_subop_id++,
-                            .opcode = OSD_OP_PING,
-                        },
-                    };
-                    op->callback = [this, cl](osd_op_t *op)
-                    {
-                        int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
-                        cl->ping_time_remaining = 0;
-                        delete op;
-                        if (fail_fd >= 0)
-                        {
-                            stop_client(fail_fd, true);
-                        }
-                    };
-                    outbox_push(op);
-                    cl->ping_time_remaining = osd_ping_timeout;
-                    cl->idle_time_remaining = osd_idle_timeout;
-                }
-            }
-            else
-            {
-                cl->idle_time_remaining = osd_idle_timeout;
-            }
-        }
-    });
-}
-
 osd_messenger_t::~osd_messenger_t()
 {
-    if (keepalive_timer_id >= 0)
-    {
-        tfd->clear_timer(keepalive_timer_id);
-        keepalive_timer_id = -1;
-    }
    while (clients.size() > 0)
    {
-        stop_client(clients.begin()->first, true);
+        stop_client(clients.begin()->first);
    }
 }

-void osd_messenger_t::parse_config(const json11::Json & config)
-{
-    this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
-        config["use_sync_send_recv"].uint64_value();
-    this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
-    if (!this->peer_connect_interval)
-    {
-        this->peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
-    }
-    this->peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
-    if (!this->peer_connect_timeout)
-    {
-        this->peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
-    }
-    this->osd_idle_timeout = config["osd_idle_timeout"].uint64_value();
-    if (!this->osd_idle_timeout)
-    {
-        this->osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
-    }
-    this->osd_ping_timeout = config["osd_ping_timeout"].uint64_value();
-    if (!this->osd_ping_timeout)
-    {
-        this->osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
-    }
-    this->log_level = config["log_level"].uint64_value();
-}
-
 void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
 {
    if (wanted_peers.find(peer_osd) == wanted_peers.end())
@ -203,12 +111,12 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
        {
            osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
-            stop_client(peer_fd, true);
+            stop_client(peer_fd);
            on_connect_peer(peer_osd, -EIO);
            return;
        });
    }
-    clients[peer_fd] = new osd_client_t((osd_client_t){
+    clients[peer_fd] = new osd_client_t({
        .peer_addr = addr,
        .peer_port = peer_port,
        .peer_fd = peer_fd,
@ -241,7 +149,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
    }
    if (result != 0)
    {
-        stop_client(peer_fd, true);
+        stop_client(peer_fd);
        on_connect_peer(peer_osd, -result);
        return;
    }
@ -263,7 +171,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
    {
        // Stop client
        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
-        stop_client(peer_fd, true);
+        stop_client(peer_fd);
    }
    else if (epoll_events & EPOLLIN)
    {
@ -325,7 +233,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
    op->peer_fd = cl->peer_fd;
-    op->req = (osd_any_op_t){
+    op->req = {
        .show_conf = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
@ -401,7 +309,7 @@ void osd_messenger_t::cancel_op(osd_op_t *op)
    }
 }

-void osd_messenger_t::stop_client(int peer_fd, bool force)
+void osd_messenger_t::stop_client(int peer_fd)
 {
    assert(peer_fd != 0);
    auto it = clients.find(peer_fd);
@ -426,10 +334,6 @@ void osd_messenger_t::stop_client(int peer_fd, bool force)
                printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
        }
    }
-    else if (!force)
-    {
-        return;
-    }
    cl->peer_state = PEER_STOPPED;
    clients.erase(it);
    tfd->set_fd_handler(peer_fd, false, NULL);
@ -444,14 +348,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force)
    }
    if (cl->read_op)
    {
-        if (cl->read_op->callback)
-        {
-            cancel_op(cl->read_op);
-        }
-        else
-        {
-            delete cl->read_op;
-        }
+        delete cl->read_op;
        cl->read_op = NULL;
    }
    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
@ -505,7 +402,7 @@ void osd_messenger_t::accept_connections(int listen_fd)
        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
        int one = 1;
        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        clients[peer_fd] = new osd_client_t((osd_client_t){
+        clients[peer_fd] = new osd_client_t({
            .peer_addr = addr,
            .peer_port = ntohs(addr.sin_port),
            .peer_fd = peer_fd,
--- a/src/messenger.h
+++ b/src/messenger.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

@ -34,8 +34,6 @@

 #define DEFAULT_PEER_CONNECT_INTERVAL 5
 #define DEFAULT_PEER_CONNECT_TIMEOUT 5
-#define DEFAULT_OSD_PING_TIMEOUT 5
-#define DEFAULT_BITMAP_GRANULARITY 4096

 // Kind of a vector with small-list-optimisation
 struct osd_op_buf_list_t
@ -175,17 +173,13 @@ struct osd_primary_op_data_t;

 struct osd_op_t
 {
-    timespec tv_begin = { 0 }, tv_end = { 0 };
+    timespec tv_begin;
    uint64_t op_type = OSD_OP_IN;
    int peer_fd;
    osd_any_op_t req;
    osd_any_reply_t reply;
    blockstore_op_t *bs_op = NULL;
    void *buf = NULL;
-    // bitmap, bitmap_len, bmp_data are only meaningful for reads
-    void *bitmap = NULL;
-    unsigned bitmap_len = 0;
-    unsigned bmp_data = 0;
    void *rmw_buf = NULL;
    osd_primary_op_data_t* op_data = NULL;
    std::function<void(osd_op_t*)> callback;
@ -204,8 +198,6 @@ struct osd_client_t
    int peer_fd;
    int peer_state;
    int connect_timeout_id = -1;
-    int ping_time_remaining = 0;
-    int idle_time_remaining = 0;
    osd_num_t osd_num = 0;

    void *in_buf = NULL;
@ -259,7 +251,6 @@ struct osd_messenger_t
 {
    timerfd_manager_t *tfd;
    ring_loop_t *ringloop;
-    int keepalive_timer_id = -1;

    // osd_num_t is only for logging and asserts
    osd_num_t osd_num;
@ -267,8 +258,6 @@ struct osd_messenger_t
    int receive_buffer_size = 64*1024;
    int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
    int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
-    int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
-    int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
    int log_level = 0;
    bool use_sync_send_recv = false;

@ -285,10 +274,8 @@ struct osd_messenger_t
    osd_op_stats_t stats;

 public:
-    void init();
-    void parse_config(const json11::Json & config);
    void connect_peer(uint64_t osd_num, json11::Json peer_state);
-    void stop_client(int peer_fd, bool force = false);
+    void stop_client(int peer_fd);
    void outbox_push(osd_op_t *cur_op);
    std::function<void(osd_op_t*)> exec_op;
    std::function<void(osd_num_t)> repeer_pgs;
--- a/mon/PGUtil.js
+++ b/mon/PGUtil.js
@ -1,59 +1,22 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 module.exports = {
    scale_pg_count,
 };

-function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_pg)
-{
-    if (!new_pg_history[new_pg])
-    {
-        new_pg_history[new_pg] = {
-            osd_sets: {},
-            all_peers: {},
-            epoch: 0,
-        };
-    }
-    const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
-    nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg];
-    if (oh && oh.osd_sets && oh.osd_sets.length)
-    {
-        for (const pg of oh.osd_sets)
-        {
-            nh.osd_sets[pg.join(' ')] = pg;
-        }
-    }
-    if (oh && oh.all_peers && oh.all_peers.length)
-    {
-        for (const osd_num of oh.all_peers)
-        {
-            nh.all_peers[osd_num] = Number(osd_num);
-        }
-    }
-    if (oh && oh.epoch)
-    {
-        nh.epoch = nh.epoch < oh.epoch ? oh.epoch : nh.epoch;
-    }
-}
-
-function finish_pg_history(merged_history)
-{
-    merged_history.osd_sets = Object.values(merged_history.osd_sets);
-    merged_history.all_peers = Object.values(merged_history.all_peers);
-}
-
 function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
 {
    const old_pg_count = prev_pgs.length;
-    // Add all possibly intersecting PGs to the history of new PGs
+    // Add all possibly intersecting PGs into the history of new PGs
    if (!(new_pg_count % old_pg_count))
    {
-        // New PG count is a multiple of old PG count
+        // New PG count is a multiple of the old PG count
+        const mul = (new_pg_count / old_pg_count);
        for (let i = 0; i < new_pg_count; i++)
        {
-            add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i % old_pg_count);
-            finish_pg_history(new_pg_history[i]);
+            const old_i = Math.floor(i / mul); // old PG that new PG <i> derives from; same mapping as the prev_pgs remapping at the end of this function
+            new_pg_history[i] = JSON.parse(JSON.stringify(prev_pg_history[1+old_i])); // deep copy; NOTE(review): JSON.parse throws if this history entry is undefined - confirm it always exists
        }
    }
    else if (!(old_pg_count % new_pg_count))
@ -62,26 +25,68 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
        const mul = (old_pg_count / new_pg_count);
        for (let i = 0; i < new_pg_count; i++)
        {
+            new_pg_history[i] = {
+                osd_sets: [],
+                all_peers: [],
+                epoch: 0,
+            };
            for (let j = 0; j < mul; j++)
            {
-                add_pg_history(new_pg_history, i, prev_pgs, prev_pg_history, i+j*new_pg_count);
+                new_pg_history[i].osd_sets.push(prev_pgs[i*mul+j]); // current OSD set of the j-th old PG merged into new PG <i>
+                const hist = prev_pg_history[1+i*mul+j];
+                if (hist && hist.osd_sets && hist.osd_sets.length)
+                {
+                    Array.prototype.push.apply(new_pg_history[i].osd_sets, hist.osd_sets);
+                }
+                if (hist && hist.all_peers && hist.all_peers.length)
+                {
+                    Array.prototype.push.apply(new_pg_history[i].all_peers, hist.all_peers);
+                }
+                if (hist && hist.epoch)
+                {
+                    new_pg_history[i].epoch = new_pg_history[i].epoch < hist.epoch ? hist.epoch : new_pg_history[i].epoch; // keep the highest epoch seen
+                }
            }
-            finish_pg_history(new_pg_history[i]);
        }
    }
    else
    {
        // Any PG may intersect with any PG after non-multiple PG count change
        // So, merge ALL PGs history
-        let merged_history = {};
-        for (let i = 0; i < old_pg_count; i++)
+        let all_sets = {};
+        let all_peers = {};
+        let max_epoch = 0;
+        for (const pg of prev_pgs)
        {
-            add_pg_history(merged_history, 1, prev_pgs, prev_pg_history, i);
+            all_sets[pg.join(' ')] = pg;
        }
-        finish_pg_history(merged_history[1]);
+        for (const pg in prev_pg_history)
+        {
+            const hist = prev_pg_history[pg];
+            if (hist && hist.osd_sets)
+            {
+                for (const pg of hist.osd_sets)
+                {
+                    all_sets[pg.join(' ')] = pg;
+                }
+            }
+            if (hist && hist.all_peers)
+            {
+                for (const osd_num of hist.all_peers)
+                {
+                    all_peers[osd_num] = Number(osd_num);
+                }
+            }
+            if (hist && hist.epoch)
+            {
+                max_epoch = max_epoch < hist.epoch ? hist.epoch : max_epoch;
+            }
+        }
+        all_sets = Object.values(all_sets);
+        all_peers = Object.values(all_peers);
        for (let i = 0; i < new_pg_count; i++)
        {
-            new_pg_history[i] = { ...merged_history[1] };
+            new_pg_history[i] = { osd_sets: all_sets, all_peers, epoch: max_epoch };
        }
    }
    // Mark history keys for removed PGs as removed
@ -89,16 +94,19 @@ function scale_pg_count(prev_pgs, prev_pg_history, new_pg_history, new_pg_count)
    {
        new_pg_history[i] = null;
    }
-    // Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
    if (old_pg_count < new_pg_count)
    {
-        for (let i = old_pg_count; i < new_pg_count; i++)
+        for (let i = new_pg_count-1; i >= 0; i--)
        {
-            prev_pgs[i] = prev_pgs[i % old_pg_count];
+            prev_pgs[i] = prev_pgs[Math.floor(i/new_pg_count*old_pg_count)];
        }
    }
    else if (old_pg_count > new_pg_count)
    {
+        for (let i = 0; i < new_pg_count; i++)
+        {
+            prev_pgs[i] = prev_pgs[Math.round(i/new_pg_count*old_pg_count)];
+        }
        prev_pgs.splice(new_pg_count, old_pg_count-new_pg_count);
    }
 }
--- a/mon/afr.js
+++ b/mon/afr.js
@ -1,16 +1,31 @@
 // Functions to calculate Annualized Failure Rate of your cluster
 // if you know AFR of your drives, number of drives, expected rebalance time
 // and replication factor
-// License: VNPL-1.1 (see https://yourcmc.ru/git/vitalif/vitastor/src/branch/master/README.md for details) or AGPL-3.0
-// Author: Vitaliy Filippov, 2020+
+// License: VNPL-1.0 (see README.md for details)
+
+const { sprintf } = require('sprintf-js');

 module.exports = {
    cluster_afr_fullmesh,
    failure_rate_fullmesh,
    cluster_afr,
+    print_cluster_afr,
    c_n_k,
 };

+print_cluster_afr({ n_hosts: 4, n_drives: 6, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, capacity: 4000, speed: 0.1, replicas: 2 });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, capacity: 4000, speed: 0.1, ec: [ 2, 1 ] });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, ec: [ 2, 1 ] });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 2 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 2 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 3 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100, degraded_replacement: 1 });
+
 /******** "FULL MESH": ASSUME EACH OSD COMMUNICATES WITH ALL OTHER OSDS ********/

 // Estimate AFR of the cluster
@ -41,38 +56,93 @@ function failure_rate_fullmesh(n, a, f)
 /******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/

 // <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
-// <k> replicas, <pgs> unique peer PGs per OSD (~50 for 100 PG-per-OSD in a big cluster)
+// <k> replicas, <pgs> unique peer PGs per OSD
 //
 // For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in <l*365> next days).
 // More peers per OSD increase rebalance speed (more drives work together to resilver) if you
-// let them finish rebalance BEFORE replacing the failed drive (degraded_replacement=false).
+// let them finish rebalance BEFORE replacing the failed drive.
 // At the same time, more peers per OSD increase probability of any of them to fail!
-// osd_rm=true means that failed OSDs' data is rebalanced over all other hosts,
-// not over the same host as it's in Ceph by default (dead OSDs are marked 'out').
 //
 // Probability of all except one drives in a replica group to fail is (AFR^(k-1)).
 // So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). Interesting but reasonable consequence
 // is that, with k=2, total failure rate doesn't depend on number of peers per OSD,
 // because it gets increased linearly by increased number of peers to fail
 // and decreased linearly by reduced rebalance time.
-function cluster_afr({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec, ec_data, ec_parity, replicas, pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
+function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
 {
-    const pg_size = (ec ? ec_data+ec_parity : replicas);
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(pg_size-1));
-    const host_pgs = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(pg_size-1));
-    const resilver_disk = n_drives == 1 || osd_rm ? pgs : (n_drives-1);
-    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
-    const host_heal_time = (down_out_interval + n_drives*capacity/pgs/speed)/86400/365;
-    const disk_heal_fail = ((afr_drive+afr_host/n_drives)*disk_heal_time);
-    const host_heal_fail = ((afr_drive+afr_host/n_drives)*host_heal_time);
-    const disk_pg_fail = ec
-        ? failure_rate_fullmesh(ec_data+ec_parity-1, disk_heal_fail, ec_parity)
-        : disk_heal_fail**(replicas-1);
-    const host_pg_fail = ec
-        ? failure_rate_fullmesh(ec_data+ec_parity-1, host_heal_fail, ec_parity)
-        : host_heal_fail**(replicas-1);
-    return 1 - ((1 - afr_drive * (1-(1-disk_pg_fail)**pgs)) ** (n_hosts*n_drives))
-        * ((1 - afr_host * (1-(1-host_pg_fail)**host_pgs)) ** n_hosts);
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1)); // upper bound on peer groups: drives on the other hosts split into groups of (replicas-1)
+    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365; // rebalance time in years (capacity/speed gives seconds); peers only speed it up when degraded_replacement is not set
+    return 1 - (1 - afr_drive * (1-(1-(afr_drive*l)**(replicas-1))**pgs)) ** (n_hosts*n_drives); // 1 - P(no drive fails together with all (replicas-1) peers of any of its <pgs> groups within <l>), over all n_hosts*n_drives drives
+}
+
+function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
+{ // Erasure-coded counterpart of cluster_afr_pgs(): <ec> is [ data chunks, parity chunks ], host failures are not considered
+    const ec_total = ec_data+ec_parity; // total chunks per PG
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1)); // upper bound on peer groups: drives on the other hosts split into groups of (ec_total-1)
+    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365; // rebalance time in years; peers only speed it up when degraded_replacement is not set
+    return 1 - (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, afr_drive*l, ec_parity))**pgs)) ** (n_hosts*n_drives); // presumably failure_rate_fullmesh(n, a, f) = P(f of n peers fail at rate a) - TODO confirm against its definition
+}
+
+// Same as cluster_afr_pgs(), but also takes whole-host (server) failures with annual rate <afr_host> into account
+function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
+{
+    let otherhosts = Math.min(pgs, (n_hosts-1)/(replicas-1)); // computed from the caller-supplied pgs, before pgs is clamped on the next line
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1)); // upper bound on peer groups per drive
+    let pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(replicas-1)); // peer groups per host: pgs for each of its drives, under the same upper bound
+    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365; // time to heal one failed drive, in years
+    const lh = n_drives*capacity/pgs/speed/86400/365; // time to heal a whole failed host (n_drives drives), in years; not affected by degraded_replacement
+    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh); // per-peer failure probability during a host heal; host AFR is scaled by pgs/otherhosts
+    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld); // same, but during a single drive heal
+    return 1 - ((1 - afr_host * (1-(1-p1**(replicas-1))**pgh)) ** n_hosts) * // host-failure term, over n_hosts hosts
+        ((1 - afr_drive * (1-(1-p2**(replicas-1))**pgs)) ** (n_hosts*n_drives)); // drive-failure term, over all drives
+}
+
+function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
+{ // Erasure-coded counterpart of cluster_afr_pgs_hosts(): <ec> is [ data chunks, parity chunks ], host failures included
+    const ec_total = ec_data+ec_parity; // total chunks per PG
+    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1)); // computed from the caller-supplied pgs, before pgs is clamped on the next line
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1)); // upper bound on peer groups per drive
+    const pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(ec_total-1)); // peer groups per host, under the same upper bound
+    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365; // time to heal one failed drive, in years
+    const lh = n_drives*capacity/pgs/speed/86400/365; // time to heal a whole failed host, in years; not affected by degraded_replacement
+    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh); // per-peer failure probability during a host heal
+    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld); // per-peer failure probability during a single drive heal
+    return 1 - ((1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, ec_parity))**pgh)) ** n_hosts) * // host-failure term
+        ((1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, ec_parity))**pgs)) ** (n_hosts*n_drives)); // drive-failure term
+}
+
+// Dispatches to one of the four cluster_afr_pgs* functions above, depending on whether config.ec and config.afr_host are set
+function cluster_afr(config)
+{
+    if (config.ec && config.afr_host) // erasure coding, host failures included
+    {
+        return cluster_afr_pgs_ec_hosts(config);
+    }
+    else if (config.ec) // erasure coding, drive failures only
+    {
+        return cluster_afr_pgs_ec(config);
+    }
+    else if (config.afr_host) // replication, host failures included; a missing or zero afr_host falls through to the drive-only variant
+    {
+        return cluster_afr_pgs_hosts(config);
+    }
+    else // replication, drive failures only
+    {
+        return cluster_afr_pgs(config);
+    }
+}
+
+function print_cluster_afr(config)
+{ // Prints a human-readable description of <config>, then the AFR that cluster_afr(config) returns for it
+    console.log( // first line: the configuration itself
+        `${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
+        `, capable to backfill at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
+        (config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+ // host AFR is shown only when it is set and non-zero
+        (config.ec ? `, EC ${config.ec[0]}+${config.ec[1]}` : `, ${config.replicas} replicas`)+ // config.ec is [ data, parity ]
+        `, ${config.pgs||1} PG per OSD`+
+        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
+    );
+    console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n'); // result as a percentage with 7 decimal places
 }

 /******** UTILITY ********/
--- a/mon/afr_test.js
+++ b/mon/afr_test.js
@ -1,28 +0,0 @@
-const { sprintf } = require('sprintf-js');
-const { cluster_afr } = require('./afr.js');
-
-print_cluster_afr({ n_hosts: 4, n_drives: 6, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 2 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 2 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100, degraded_replacement: 1 });
-
-function print_cluster_afr(config)
-{
-    console.log(
-        `${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
-        `, capable to backfill at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
-        (config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+
-        (config.ec ? `, EC ${config.ec_data}+${config.ec_parity}` : `, ${config.replicas} replicas`)+
-        `, ${config.pgs||1} PG per OSD`+
-        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
-    );
-    console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n');
-}
--- a/mon/lp-optimizer.js
+++ b/mon/lp-optimizer.js
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 // Data distribution optimizer using linear programming (lp_solve)

@ -58,7 +58,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
    }
    const all_weights = Object.assign({}, ...Object.values(osd_tree));
    const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
-    const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
+    const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations));
    const pg_per_osd = {};
    for (const pg of all_pgs)
    {
@ -249,7 +249,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
        }
    }
    // Get all combinations
-    let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
+    let all_pgs = random_combinations(osd_tree, pg_size, max_combinations);
    add_valid_previous(osd_tree, prev_weights, all_pgs);
    all_pgs = Object.values(all_pgs);
    const pg_per_osd = {};
@ -275,11 +275,6 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
    lp += 'max: '+all_pg_names.map(pg_name => (
        prev_weights[pg_name] ? `${pg_size+1}*add_${pg_name} - ${pg_size+1}*del_${pg_name}` : `${pg_size+1-move_weights[pg_name]}*${pg_name}`
    )).join(' + ')+';\n';
-    lp += all_pg_names
-        .map(pg_name => (prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : `${pg_name}`))
-        .join(' + ')+' = '+(pg_count
-            - Object.keys(prev_weights).reduce((a, old_pg_name) => (a + (all_pgs_hash[old_pg_name] ? prev_weights[old_pg_name] : 0)), 0)
-        )+';\n';
    for (const osd in pg_per_osd)
    {
        if (osd !== NO_OSD)
@ -493,8 +488,7 @@ function extract_osds(osd_tree, levels, osd_level, osds = {})
    return osds;
 }

-// ordered = don't treat (x,y) and (y,x) as equal
-function random_combinations(osd_tree, pg_size, count, ordered)
+function random_combinations(osd_tree, pg_size, count)
 {
    let seed = 0x5f020e43;
    let rng = () =>
@ -522,47 +516,25 @@ function random_combinations(osd_tree, pg_size, count, ordered)
                pg.push(osds[cur_hosts[next_host]][next_osd]);
                cur_hosts.splice(next_host, 1);
            }
-            const cyclic_pgs = [ pg ];
-            if (ordered)
+            while (pg.length < pg_size)
            {
-                for (let i = 1; i < pg.size; i++)
-                {
-                    cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
-                }
-            }
-            for (const pg of cyclic_pgs)
-            {
-                while (pg.length < pg_size)
-                {
-                    pg.push(NO_OSD);
-                }
-                r['pg_'+pg.join('_')] = pg;
+                pg.push(NO_OSD);
            }
+            r['pg_'+pg.join('_')] = pg;
        }
    }
    // Generate purely random combinations
-    while (count > 0)
+    restart: while (count > 0)
    {
        let host_idx = [];
-        const cur_hosts = [ ...hosts.map((h, i) => i) ];
-        const max_hosts = pg_size < hosts.length ? pg_size : hosts.length;
-        if (ordered)
+        for (let i = 0; i < pg_size && i < hosts.length; i++)
        {
-            for (let i = 0; i < max_hosts; i++)
+            let start = i > 0 ? host_idx[i-1]+1 : 0;
+            if (start >= hosts.length)
            {
-                const r = rng() % cur_hosts.length;
-                host_idx[i] = cur_hosts[r];
-                cur_hosts.splice(r, 1);
-            }
-        }
-        else
-        {
-            for (let i = 0; i < max_hosts; i++)
-            {
-                const r = rng() % (cur_hosts.length - (max_hosts - i - 1));
-                host_idx[i] = cur_hosts[r];
-                cur_hosts.splice(0, r+1);
+                continue restart;
            }
+            host_idx[i] = start + rng() % (hosts.length-start);
        }
        let pg = host_idx.map(h => osds[hosts[h]][rng() % osds[hosts[h]].length]);
        while (pg.length < pg_size)
--- a/mon/make-osd.sh
+++ b/mon/make-osd.sh
@ -1,76 +0,0 @@
-#!/bin/bash
-# Very simple systemd unit generator for vitastor-osd services
-# Not the final solution yet, mostly for tests
-# Copyright (c) Vitaliy Filippov, 2019+
-# License: MIT
-
-# USAGE: ./make-osd.sh /dev/disk/by-partuuid/xxx [ /dev/disk/by-partuuid/yyy]...
-
-IP_SUBSTR="10.200.1."
-ETCD_HOSTS="etcd0=http://10.200.1.10:2380,etcd1=http://10.200.1.11:2380,etcd2=http://10.200.1.12:2380"
-
-set -e -x
-
-IP=`ip -json a s | jq -r '.[].addr_info[] | select(.local | startswith("'$IP_SUBSTR'")) | .local'`
-[ "$IP" != "" ] || exit 1
-ETCD_MON=$(echo $ETCD_HOSTS | perl -pe 's/:2380/:2379/g; s/etcd\d*=//g;')
-D=`dirname $0`
-
-# Create OSDs on all passed devices
-OSD_NUM=1
-for DEV in $*; do
-
-# Ugly :) -> node.js rework pending
-while true; do
-    ST=$(etcdctl --endpoints="$ETCD_MON" get --print-value-only /vitastor/osd/stats/$OSD_NUM)
-    if [ "$ST" = "" ]; then
-        break
-    fi
-    OSD_NUM=$((OSD_NUM+1))
-done
-etcdctl --endpoints="$ETCD_MON" put /vitastor/osd/stats/$OSD_NUM '{}'
-
-echo Creating OSD $OSD_NUM on $DEV
-
-OPT=`node $D/simple-offsets.js --device $DEV --format options | tr '\n' ' '`
-META=`echo $OPT | grep -Po '(?<=data_offset )\d+'`
-dd if=/dev/zero of=$DEV bs=1048576 count=$(((META+1048575)/1048576)) oflag=direct
-
-cat >/etc/systemd/system/vitastor-osd$OSD_NUM.service <<EOF
-[Unit]
-Description=Vitastor object storage daemon osd.$OSD_NUM
-After=network-online.target local-fs.target time-sync.target
-Wants=network-online.target local-fs.target time-sync.target
-PartOf=vitastor.target
-
-[Service]
-LimitNOFILE=1048576
-LimitNPROC=1048576
-LimitMEMLOCK=infinity
-ExecStart=/usr/bin/vitastor-osd \\
-    --etcd_address $IP:2379/v3 \\
-    --bind_address $IP \\
-    --osd_num $OSD_NUM \\
-    --disable_data_fsync 1 \\
-    --immediate_commit all \\
-    --flusher_count 256 \\
-    --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
-    --journal_no_same_sector_overwrites true \\
-    --journal_sector_buffer_count 1024 \\
-    $OPT
-WorkingDirectory=/
-ExecStartPre=+chown vitastor:vitastor $DEV
-User=vitastor
-PrivateTmp=false
-TasksMax=infinity
-Restart=always
-StartLimitInterval=0
-RestartSec=10
-
-[Install]
-WantedBy=vitastor.target
-EOF
-
-systemctl enable vitastor-osd$OSD_NUM
-
-done
--- a/mon/make-units.sh
+++ b/mon/make-units.sh
@ -1,25 +1,19 @@
 #!/bin/bash
-# Very simple systemd unit generator for etcd & vitastor-mon services
-# Not the final solution yet, mostly for tests
+# Example startup script generator
+# Of course this isn't a production solution yet, this is just for tests
 # Copyright (c) Vitaliy Filippov, 2019+
 # License: MIT

-# USAGE: ./make-units.sh
+IP=`ip -json a s | jq -r '.[].addr_info[] | select(.broadcast == "10.115.0.255") | .local'`

-IP_SUBSTR="10.200.1."
-ETCD_HOSTS="etcd0=http://10.200.1.10:2380,etcd1=http://10.200.1.11:2380,etcd2=http://10.200.1.12:2380"
-
-# determine IP
-IP=`ip -json a s | jq -r '.[].addr_info[] | select(.local | startswith("'$IP_SUBSTR'")) | .local'`
 [ "$IP" != "" ] || exit 1
-ETCD_NUM=${ETCD_HOSTS/$IP*/}
-[ "$ETCD_NUM" != "$ETCD_HOSTS" ] || exit 1
-ETCD_NUM=$(echo $ETCD_NUM | tr -d -c , | wc -c)

-# etcd
+BASE=${IP/*./}
+BASE=$((BASE-10))
+
 useradd etcd

-mkdir -p /var/lib/etcd$ETCD_NUM.etcd
+mkdir -p /var/lib/etcd$BASE.etcd
 cat >/etc/systemd/system/etcd.service <<EOF
 [Unit]
 Description=etcd for vitastor
@ -28,18 +22,19 @@ Wants=network-online.target local-fs.target time-sync.target

 [Service]
 Restart=always
-ExecStart=/usr/local/bin/etcd -name etcd$ETCD_NUM --data-dir /var/lib/etcd$ETCD_NUM.etcd \\
+ExecStart=/usr/local/bin/etcd -name etcd$BASE --data-dir /var/lib/etcd$BASE.etcd \\
    --advertise-client-urls http://$IP:2379 --listen-client-urls http://$IP:2379 \\
    --initial-advertise-peer-urls http://$IP:2380 --listen-peer-urls http://$IP:2380 \\
-    --initial-cluster-token vitastor-etcd-1 --initial-cluster $ETCD_HOSTS \\
+    --initial-cluster-token vitastor-etcd-1 --initial-cluster etcd0=http://10.115.0.10:2380,etcd1=http://10.115.0.11:2380,etcd2=http://10.115.0.12:2380,etcd3=http://10.115.0.13:2380 \\
    --initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
-WorkingDirectory=/var/lib/etcd$ETCD_NUM.etcd
-ExecStartPre=+chown -R etcd /var/lib/etcd$ETCD_NUM.etcd
+WorkingDirectory=/var/lib/etcd$BASE.etcd
+ExecStartPre=+chown -R etcd /var/lib/etcd$BASE.etcd
 User=etcd
 PrivateTmp=false
 TasksMax=infinity
 Restart=always
 StartLimitInterval=0
+StartLimitIntervalSec=0
 RestartSec=10

 [Install]
@ -53,7 +48,9 @@ systemctl start etcd
 useradd vitastor
 chmod 755 /root

-# Vitastor target
+BASE=${IP/*./}
+BASE=$(((BASE-10)*12))
+
 cat >/etc/systemd/system/vitastor.target <<EOF
 [Unit]
 Description=vitastor target
@ -61,25 +58,116 @@ Description=vitastor target
 WantedBy=multi-user.target
 EOF

-# Monitor unit
-ETCD_MON=$(echo $ETCD_HOSTS | perl -pe 's/:2380/:2379/g; s/etcd\d*=//g;')
-cat >/etc/systemd/system/vitastor-mon.service <<EOF
+i=1
+for DEV in `ls /dev/disk/by-id/ | grep ata-INTEL_SSDSC2KB`; do
+    dd if=/dev/zero of=/dev/disk/by-id/$DEV bs=1048576 count=$(((427814912+1048575)/1048576+2))
+    dd if=/dev/zero of=/dev/disk/by-id/$DEV bs=1048576 count=$(((427814912+1048575)/1048576+2)) seek=$((1920377991168/1048576))
+cat >/etc/systemd/system/vitastor-osd$((BASE+i)).service <<EOF
 [Unit]
-Description=Vitastor monitor
+Description=Vitastor object storage daemon osd.$((BASE+i))
 After=network-online.target local-fs.target time-sync.target
 Wants=network-online.target local-fs.target time-sync.target
+PartOf=vitastor.target

 [Service]
-Restart=always
-ExecStart=node /usr/lib/vitastor/mon/mon-main.js --etcd_url '$ETCD_MON' --etcd_prefix '/vitastor' --etcd_start_timeout 5
-WorkingDirectory=/
+LimitNOFILE=1048576
+LimitNPROC=1048576
+LimitMEMLOCK=infinity
+ExecStart=/root/vitastor/osd \\
+    --etcd_address $IP:2379/v3 \\
+    --bind_address $IP \\
+    --osd_num $((BASE+i)) \\
+    --disable_data_fsync 1 \\
+    --disable_device_lock 1 \\
+    --immediate_commit all \\
+    --flusher_count 8 \\
+    --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
+    --journal_no_same_sector_overwrites true \\
+    --journal_sector_buffer_count 1024 \\
+    --journal_offset 0 \\
+    --meta_offset 16777216 \\
+    --data_offset 427814912 \\
+    --data_size $((1920377991168-427814912)) \\
+    --data_device /dev/disk/by-id/$DEV
+WorkingDirectory=/root/vitastor
+ExecStartPre=+chown vitastor:vitastor /dev/disk/by-id/$DEV
 User=vitastor
 PrivateTmp=false
 TasksMax=infinity
 Restart=always
 StartLimitInterval=0
+StartLimitIntervalSec=0
 RestartSec=10

 [Install]
 WantedBy=vitastor.target
 EOF
+    systemctl enable vitastor-osd$((BASE+i))
+    i=$((i+1))
+cat >/etc/systemd/system/vitastor-osd$((BASE+i)).service <<EOF
+[Unit]
+Description=Vitastor object storage daemon osd.$((BASE+i))
+After=network-online.target local-fs.target time-sync.target
+Wants=network-online.target local-fs.target time-sync.target
+PartOf=vitastor.target
+
+[Service]
+LimitNOFILE=1048576
+LimitNPROC=1048576
+LimitMEMLOCK=infinity
+ExecStart=/root/vitastor/osd \\
+    --etcd_address $IP:2379/v3 \\
+    --bind_address $IP \\
+    --osd_num $((BASE+i)) \\
+    --disable_data_fsync 1 \\
+    --immediate_commit all \\
+    --flusher_count 8 \\
+    --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
+    --journal_no_same_sector_overwrites true \\
+    --journal_sector_buffer_count 1024 \\
+    --journal_offset 1920377991168 \\
+    --meta_offset $((1920377991168+16777216)) \\
+    --data_offset $((1920377991168+427814912)) \\
+    --data_size $((1920377991168-427814912)) \\
+    --data_device /dev/disk/by-id/$DEV
+WorkingDirectory=/root/vitastor
+ExecStartPre=+chown vitastor:vitastor /dev/disk/by-id/$DEV
+User=vitastor
+PrivateTmp=false
+TasksMax=infinity
+Restart=always
+StartLimitInterval=0
+StartLimitIntervalSec=0
+RestartSec=10
+
+[Install]
+WantedBy=vitastor.target
+EOF
+    systemctl enable vitastor-osd$((BASE+i))
+    i=$((i+1))
+done
+
+exit
+
+node mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5
+
+podman run -d --network host --restart always -v /var/lib/etcd0.etcd:/etcd0.etcd --name etcd quay.io/coreos/etcd:v3.4.13 etcd -name etcd0 \
+    -advertise-client-urls http://10.115.0.10:2379 -listen-client-urls http://10.115.0.10:2379 \
+    -initial-advertise-peer-urls http://10.115.0.10:2380 -listen-peer-urls http://10.115.0.10:2380 \
+    -initial-cluster-token vitastor-etcd-1 -initial-cluster etcd0=http://10.115.0.10:2380,etcd1=http://10.115.0.11:2380,etcd2=http://10.115.0.12:2380,etcd3=http://10.115.0.13:2380 \
+    -initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
+
+etcdctl --endpoints http://10.115.0.10:2379 put /vitastor/config/global '{"immediate_commit":"all"}'
+
+etcdctl --endpoints http://10.115.0.10:2379 put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":48,"failure_domain":"host"}}'
+
+#let pgs = {};
+#for (let n = 0; n < 48; n++) { let i = n/2 | 0; pgs[1+n] = { osd_set: [ (1+i%12+(i/12 | 0)*24), (1+12+i%12+(i/12 | 0)*24) ], primary: (1+(n%2)*12+i%12+(i/12 | 0)*24) }; };
+#console.log(JSON.stringify({ items: { 1: pgs } }));
+#etcdctl --endpoints http://10.115.0.10:2379 put /vitastor/config/pgs ...
+
+#    --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
+#    --data_offset 427814912 \\
+
+#    --disk_alignment 4096 --journal_block_size 512 --meta_block_size 512 \\
+#    --data_offset 433434624 \\
--- a/mon/merge.js
+++ b/mon/merge.js
@ -1,23 +0,0 @@
-const fsp = require('fs').promises;
-
-async function merge(file1, file2, out)
-{
-    if (!out)
-    {
-        console.error('USAGE: nodejs merge.js layer1 layer2 output');
-        process.exit();
-    }
-    const layer1 = await fsp.readFile(file1);
-    const layer2 = await fsp.readFile(file2);
-    const zero = Buffer.alloc(4096);
-    for (let i = 0; i < layer2.length; i += 4096)
-    {
-        if (zero.compare(layer2, i, i+4096) != 0)
-        {
-            layer2.copy(layer1, i, i, i+4096);
-        }
-    }
-    await fsp.writeFile(out, layer1);
-}
-
-merge(process.argv[2], process.argv[3], process.argv[4]);
--- a/mon/mon-main.js
+++ b/mon/mon-main.js
@ -1,7 +1,7 @@
 #!/usr/bin/node

 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 const Mon = require('./mon.js');

--- a/mon/mon.js
+++ b/mon/mon.js
--- a/mon/simple-offsets.js
+++ b/mon/simple-offsets.js
@ -4,7 +4,6 @@
 // Simple tool to calculate journal and metadata offsets for a single device
 // Will be replaced by smarter tools in the future

-const fs = require('fs').promises;
 const child_process = require('child_process');

 async function run()
@ -16,7 +15,6 @@ async function run()
        device_block_size: 4096,
        journal_offset: 0,
        device_size: 0,
-        format: 'text',
    };
    for (let i = 2; i < process.argv.length; i++)
    {
@ -26,22 +24,7 @@ async function run()
            i++;
        }
    }
-    if (!options.device)
-    {
-        process.stderr.write('USAGE: nodejs '+process.argv[1]+' --device /dev/sdXXX\n');
-        process.exit(1);
-    }
-    options.device_size = Number(options.device_size);
-    let device_size = options.device_size;
-    if (!device_size)
-    {
-        const st = await fs.stat(options.device);
-        options.device_block_size = st.blksize;
-        if (st.isBlockDevice())
-            device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device))
-        else
-            device_size = st.size;
-    }
+    const device_size = Number(options.device_size || await system("blockdev --getsize64 "+options.device));
    if (!device_size)
    {
        process.stderr.write('Failed to get device size\n');
@ -49,45 +32,25 @@ async function run()
    }
    options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
    const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
-    const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
+    const entries_per_block = Math.floor(options.device_block_size / (24 + options.object_size/options.bitmap_granularity/8));
    const object_count = Math.floor((device_size-meta_offset)/options.object_size);
    const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size;
    const data_offset = meta_offset + meta_size;
    const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB"
        : Math.round(meta_size/1024/1024*100)/100+" MB");
-    if (options.format == 'text' || options.format == 'options')
-    {
-        if (options.format == 'text')
-        {
-            process.stderr.write(
-                `Metadata size: ${meta_size_fmt}\n`+
-                `Options for the OSD:\n`
-            );
-        }
-        process.stdout.write(
-            `    --data_device ${options.device}\n`+
-            `    --journal_offset ${options.journal_offset}\n`+
-            `    --meta_offset ${meta_offset}\n`+
-            `    --data_offset ${data_offset}\n`+
-            (options.device_size ? `    --data_size ${device_size-data_offset}\n` : '')
-        );
-    }
-    else if (options.format == 'env')
-    {
-        process.stdout.write(
-            `journal_offset=${options.journal_offset}\n`+
-            `meta_offset=${meta_offset}\n`+
-            `data_offset=${data_offset}\n`+
-            `data_size=${device_size-data_offset}\n`
-        );
-    }
-    else
-        process.stdout.write('Unknown format: '+options.format);
+    process.stdout.write(
+        `Metadata size: ${meta_size_fmt}\n`+
+        `Options for the OSD:\n`+
+        `    --journal_offset ${options.journal_offset}\n`+
+        `    --meta_offset ${meta_offset}\n`+
+        `    --data_offset ${data_offset}\n`+
+        (options.device_size ? `    --data_size ${device_size-data_offset}\n` : '')
+    );
 }

 function system(cmd)
 {
-    return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err.message) : ok(stdout))));
+    return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err) : ok(stdout))));
 }

-run().catch(err => { console.error(err); process.exit(1); });
+run().catch(console.error);
--- a/mon/test-nonuniform.js
+++ b/mon/test-nonuniform.js
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 // Interesting real-world example coming from Ceph with EC and compression enabled.
 // EC parity chunks can't be compressed as efficiently as data chunks,
--- a/mon/test-optimize-simple.js
+++ b/mon/test-optimize-simple.js
@ -1,25 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-const LPOptimizer = require('./lp-optimizer.js');
-
-async function run()
-{
-    const osd_tree = { a: { 1: 1 }, b: { 2: 1 }, c: { 3: 1 } };
-    let res;
-
-    console.log('16 PGs, size=3');
-    res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16 });
-    LPOptimizer.print_change_stats(res, false);
-
-    console.log('\nReduce PG size to 2');
-    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs.map(pg => pg.slice(0, 2)), osd_tree, pg_size: 2 });
-    LPOptimizer.print_change_stats(res, false);
-
-    console.log('\nRemove OSD 3');
-    delete osd_tree['c'];
-    res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
-    LPOptimizer.print_change_stats(res, false);
-}
-
-run().catch(console.error);
--- a/mon/test-optimize-undersized.js
+++ b/mon/test-optimize-undersized.js
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 const LPOptimizer = require('./lp-optimizer.js');

--- a/mon/test-optimize.js
+++ b/mon/test-optimize.js
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 const LPOptimizer = require('./lp-optimizer.js');

--- a/src/msgr_receive.cpp
+++ b/src/msgr_receive.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #include "messenger.h"

@ -9,10 +9,6 @@ void osd_messenger_t::read_requests()
    {
        int peer_fd = read_ready_clients[i];
        osd_client_t *cl = clients[peer_fd];
-        if (cl->read_msg.msg_iovlen)
-        {
-            continue;
-        }
        if (cl->read_remaining < receive_buffer_size)
        {
            cl->read_iov.iov_base = cl->in_buf;
@ -33,7 +29,6 @@ void osd_messenger_t::read_requests()
            io_uring_sqe* sqe = ringloop->get_sqe();
            if (!sqe)
            {
-                cl->read_msg.msg_iovlen = 0;
                read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
                return;
            }
@ -57,7 +52,6 @@ void osd_messenger_t::read_requests()
 bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
 {
    bool ret = false;
-    cl->read_msg.msg_iovlen = 0;
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
@ -166,14 +160,8 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
    {
        if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
            return handle_reply_hdr(cl);
-        else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
-            handle_op_hdr(cl);
        else
-        {
-            printf("Received garbage: magic=%lx id=%lu opcode=%lx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
-            stop_client(cl->peer_fd);
-            return false;
-        }
+            handle_op_hdr(cl);
    }
    else if (cl->read_state == CL_READ_DATA)
    {
@ -202,34 +190,22 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
    osd_op_t *cur_op = cl->read_op;
    if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
    {
+        if (cur_op->req.sec_rw.len > 0)
+            cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
        cl->read_remaining = 0;
    }
    else if (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
    {
-        if (cur_op->req.sec_rw.attr_len > 0)
-        {
-            if (cur_op->req.sec_rw.attr_len > sizeof(unsigned))
-                cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len);
-            else
-                cur_op->bitmap = &cur_op->bmp_data;
-            cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
-        }
        if (cur_op->req.sec_rw.len > 0)
-        {
            cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
-            cl->recv_list.push_back(cur_op->buf, cur_op->req.sec_rw.len);
-        }
-        cl->read_remaining = cur_op->req.sec_rw.len + cur_op->req.sec_rw.attr_len;
+        cl->read_remaining = cur_op->req.sec_rw.len;
    }
    else if (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
    {
        if (cur_op->req.sec_stab.len > 0)
-        {
            cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_stab.len);
-            cl->recv_list.push_back(cur_op->buf, cur_op->req.sec_stab.len);
-        }
        cl->read_remaining = cur_op->req.sec_stab.len;
    }
    else if (cur_op->req.hdr.opcode == OSD_OP_READ)
@ -239,15 +215,13 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
    else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
    {
        if (cur_op->req.rw.len > 0)
-        {
            cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.rw.len);
-            cl->recv_list.push_back(cur_op->buf, cur_op->req.rw.len);
-        }
        cl->read_remaining = cur_op->req.rw.len;
    }
    if (cl->read_remaining > 0)
    {
        // Read data
+        cl->recv_list.push_back(cur_op->buf, cl->read_remaining);
        cl->read_state = CL_READ_DATA;
    }
    else
@ -273,12 +247,12 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
    osd_op_t *op = req_it->second;
    memcpy(op->reply.buf, cl->read_op->req.buf, OSD_PACKET_SIZE);
    cl->sent_ops.erase(req_it);
-    if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
+    if ((op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ) &&
+        op->reply.hdr.retval > 0)
    {
        // Read data. In this case we assume that the buffer is preallocated by the caller (!)
-        unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len);
-        if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len) ||
-            bmp_len > op->bitmap_len)
+        assert(op->iov.count > 0);
+        if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len))
        {
            // Check reply length to not overflow the buffer
            printf("Client %d read reply of different length\n", cl->peer_fd);
@ -286,23 +260,11 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
            stop_client(cl->peer_fd);
            return false;
        }
-        if (bmp_len > 0)
-        {
-            cl->recv_list.push_back(op->bitmap, bmp_len);
-        }
-        if (op->reply.hdr.retval > 0)
-        {
-            assert(op->iov.count > 0);
-            cl->recv_list.append(op->iov);
-        }
-        cl->read_remaining = op->reply.hdr.retval + bmp_len;
-        if (cl->read_remaining == 0)
-        {
-            goto reuse;
-        }
+        cl->recv_list.append(op->iov);
        delete cl->read_op;
        cl->read_op = op;
        cl->read_state = CL_READ_REPLY_DATA;
+        cl->read_remaining = op->reply.hdr.retval;
    }
    else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0)
    {
@ -326,7 +288,6 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
    }
    else
    {
-reuse:
        // It's fine to reuse cl->read_op for the next reply
        handle_reply_ready(op);
        cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
--- a/src/msgr_send.cpp
+++ b/src/msgr_send.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #define _XOPEN_SOURCE
 #include <limits.h>
@ -46,29 +46,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
        to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
        cl->sent_ops[cur_op->req.hdr.id] = cur_op;
    }
-    to_outbox.push_back(NULL);
-    // Bitmap
-    if (cur_op->op_type == OSD_OP_IN &&
-        cur_op->req.hdr.opcode == OSD_OP_SEC_READ &&
-        cur_op->reply.sec_rw.attr_len > 0)
-    {
-        to_send_list.push_back((iovec){
-            .iov_base = cur_op->bitmap,
-            .iov_len = cur_op->reply.sec_rw.attr_len,
-        });
-        to_outbox.push_back(NULL);
-    }
-    else if (cur_op->op_type == OSD_OP_OUT &&
-        (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
-        cur_op->req.sec_rw.attr_len > 0)
-    {
-        to_send_list.push_back((iovec){
-            .iov_base = cur_op->bitmap,
-            .iov_len = cur_op->req.sec_rw.attr_len,
-        });
-        to_outbox.push_back(NULL);
-    }
-    // Operation data
+    // Pre-defined send_lists
    if ((cur_op->op_type == OSD_OP_IN
        ? (cur_op->req.hdr.opcode == OSD_OP_READ ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
@ -80,17 +58,17 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
        cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
    {
+        to_outbox.push_back(NULL);
        for (int i = 0; i < cur_op->iov.count; i++)
        {
            assert(cur_op->iov.buf[i].iov_base);
            to_send_list.push_back(cur_op->iov.buf[i]);
-            to_outbox.push_back(NULL);
+            to_outbox.push_back(i == cur_op->iov.count-1 ? cur_op : NULL);
        }
    }
-    if (cur_op->op_type == OSD_OP_IN)
+    else
    {
-        // To free it later
-        to_outbox[to_outbox.size()-1] = cur_op;
+        to_outbox.push_back(cur_op);
    }
    if (!ringloop)
    {
@ -114,14 +92,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
 void osd_messenger_t::measure_exec(osd_op_t *cur_op)
 {
    // Measure execution latency
-    if (cur_op->req.hdr.opcode > OSD_OP_MAX)
-    {
-        return;
-    }
-    if (!cur_op->tv_end.tv_sec)
-    {
-        clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
-    }
+    timespec tv_end;
+    clock_gettime(CLOCK_REALTIME, &tv_end);
    stats.op_stat_count[cur_op->req.hdr.opcode]++;
    if (!stats.op_stat_count[cur_op->req.hdr.opcode])
    {
@ -130,8 +102,8 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
        stats.op_stat_bytes[cur_op->req.hdr.opcode] = 0;
    }
    stats.op_stat_sum[cur_op->req.hdr.opcode] += (
-        (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
-        (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
+        (tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
+        (tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
    );
    if (cur_op->req.hdr.opcode == OSD_OP_READ ||
        cur_op->req.hdr.opcode == OSD_OP_WRITE)
@ -226,8 +198,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
            {
                if (cl->outbox[done])
                {
-                    // Reply fully sent
-                    delete cl->outbox[done];
+                    // Operation fully sent
+                    if (cl->outbox[done]->op_type == OSD_OP_IN)
+                    {
+                        delete cl->outbox[done];
+                    }
                }
                result -= iov.iov_len;
                done++;
--- a/src/nbd_proxy.cpp
+++ b/src/nbd_proxy.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)
 // Similar to qemu-nbd, but sets timeout and uses io_uring

 #include <linux/nbd.h>
@ -17,10 +17,6 @@
 #include "epoll_manager.h"
 #include "cluster_client.h"

-#ifndef MSG_ZEROCOPY
-#define MSG_ZEROCOPY 0
-#endif
-
 const char *exe_name = NULL;

 class nbd_proxy
@ -111,7 +107,7 @@ public:
    {
        printf(
            "Vitastor NBD proxy\n"
-            "(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
+            "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
            "USAGE:\n"
            "  %s map --etcd_address <etcd_address> --pool <pool> --inode <inode> --size <size in bytes>\n"
            "  %s unmap /dev/nbd0\n"
--- a/src/object_id.h
+++ b/src/object_id.h
@ -1,19 +1,17 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

 #include <stdint.h>
 #include <functional>

-typedef uint64_t inode_t;
-
 // 16 bytes per object/stripe id
 // stripe = (start of the parity stripe + peer role)
 // i.e. for example (256KB + one of 0,1,2)
 struct __attribute__((__packed__)) object_id
 {
-    inode_t inode;
+    uint64_t inode;
    uint64_t stripe;
 };

--- a/src/osd.cpp
+++ b/src/osd.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include <sys/socket.h>
 #include <sys/poll.h>
@ -9,21 +9,15 @@

 #include "osd.h"

-osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
+osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
 {
-    bs_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
-    bs_bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
-    if (!bs_block_size)
-        bs_block_size = DEFAULT_BLOCK_SIZE;
-    if (!bs_bitmap_granularity)
-        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
-    clean_entry_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
-
    this->config = config;
+    this->bs = bs;
    this->ringloop = ringloop;

-    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
-    this->bs = new blockstore_t(config, ringloop);
+    this->bs_block_size = bs->get_block_size();
+    // FIXME: use bitmap granularity instead
+    this->bs_disk_alignment = bs->get_disk_alignment();

    parse_config(config);

@ -43,7 +37,6 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
    c_cli.ringloop = this->ringloop;
    c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
    c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
-    c_cli.init();

    init_cluster();

@ -55,7 +48,6 @@ osd_t::~osd_t()
 {
    ringloop->unregister_consumer(&consumer);
    delete epmgr;
-    delete bs;
    close(listen_fd);
 }

@ -63,7 +55,6 @@ void osd_t::parse_config(blockstore_config_t & config)
 {
    if (config.find("log_level") == config.end())
        config["log_level"] = "1";
-    log_level = strtoull(config["log_level"].c_str(), NULL, 10);
    // Initial startup configuration
    json11::Json json_config = json11::Json(config);
    st_cli.parse_config(json_config);
@ -75,8 +66,6 @@ void osd_t::parse_config(blockstore_config_t & config)
        throw std::runtime_error("osd_num is required in the configuration");
    c_cli.osd_num = osd_num;
    run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
-    no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
-    no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
    // Cluster configuration
    bind_address = config["bind_address"];
    if (bind_address == "")
@ -103,9 +92,6 @@ void osd_t::parse_config(blockstore_config_t & config)
    recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
-    recovery_sync_batch = strtoull(config["recovery_sync_batch"].c_str(), NULL, 10);
-    if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
-        recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
        readonly = true;
    print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
@ -114,7 +100,14 @@ void osd_t::parse_config(blockstore_config_t & config)
    slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10);
    if (!slow_log_interval)
        slow_log_interval = 10;
-    c_cli.parse_config(json_config);
+    c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
+    if (!c_cli.peer_connect_interval)
+        c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
+    c_cli.peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
+    if (!c_cli.peer_connect_timeout)
+        c_cli.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
+    log_level = strtoull(config["log_level"].c_str(), NULL, 10);
+    c_cli.log_level = log_level;
 }

 void osd_t::bind_socket()
@ -178,7 +171,7 @@ bool osd_t::shutdown()
    {
        return false;
    }
-    return !bs || bs->is_safe_to_stop();
+    return bs->is_safe_to_stop();
 }

 void osd_t::loop()
@ -205,25 +198,19 @@ void osd_t::exec_op(osd_op_t *cur_op)
            cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
            cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
            (cur_op->req.sec_rw.len > OSD_RW_MAX ||
-            cur_op->req.sec_rw.len % bs_bitmap_granularity ||
-            cur_op->req.sec_rw.offset % bs_bitmap_granularity)) ||
+            cur_op->req.sec_rw.len % bs_disk_alignment ||
+            cur_op->req.sec_rw.offset % bs_disk_alignment)) ||
        ((cur_op->req.hdr.opcode == OSD_OP_READ ||
            cur_op->req.hdr.opcode == OSD_OP_WRITE ||
            cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
            (cur_op->req.rw.len > OSD_RW_MAX ||
-            cur_op->req.rw.len % bs_bitmap_granularity ||
-            cur_op->req.rw.offset % bs_bitmap_granularity)))
+            cur_op->req.rw.len % bs_disk_alignment ||
+            cur_op->req.rw.offset % bs_disk_alignment)))
    {
        // Bad command
        finish_op(cur_op, -EINVAL);
        return;
    }
-    if (cur_op->req.hdr.opcode == OSD_OP_PING)
-    {
-        // Pong
-        finish_op(cur_op, 0);
-        return;
-    }
    if (readonly &&
        cur_op->req.hdr.opcode != OSD_OP_SEC_READ &&
        cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
@ -274,9 +261,9 @@ void osd_t::reset_stats()

 void osd_t::print_stats()
 {
-    for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
+    for (int i = 0; i <= OSD_OP_MAX; i++)
    {
-        if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
+        if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i])
        {
            uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
            uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
@ -297,7 +284,7 @@ void osd_t::print_stats()
            prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
        }
    }
-    for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
+    for (int i = 0; i <= OSD_OP_MAX; i++)
    {
        if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
        {
--- a/src/osd.h
+++ b/src/osd.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -37,7 +37,6 @@
 #define DEFAULT_AUTOSYNC_INTERVAL 5
 #define MAX_RECOVERY_QUEUE 2048
 #define DEFAULT_RECOVERY_QUEUE 4
-#define DEFAULT_RECOVERY_BATCH 16

 //#define OSD_STUB

@ -55,17 +54,6 @@ struct osd_recovery_op_t
    osd_op_t *osd_op = NULL;
 };

-// Posted as /osd/inodestats/$osd, then accumulated by the monitor
-#define INODE_STATS_READ 0
-#define INODE_STATS_WRITE 1
-#define INODE_STATS_DELETE 2
-struct inode_stats_t
-{
-    uint64_t op_sum[3] = { 0 };
-    uint64_t op_count[3] = { 0 };
-    uint64_t op_bytes[3] = { 0 };
-};
-
 class osd_t
 {
    // config
@ -76,8 +64,6 @@ class osd_t
    bool readonly = false;
    osd_num_t osd_num = 1; // OSD numbers start with 1
    bool run_primary = false;
-    bool no_rebalance = false;
-    bool no_recovery = false;
    std::string bind_address;
    int bind_port, listen_backlog;
    // FIXME: Implement client queue depth limit
@ -88,7 +74,6 @@ class osd_t
    int immediate_commit = IMMEDIATE_NONE;
    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
-    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    int log_level = 0;

    // cluster state
@ -110,11 +95,9 @@ class osd_t
    std::map<pool_pg_num_t, pg_t> pgs;
    std::set<pool_pg_num_t> dirty_pgs;
    std::set<osd_num_t> dirty_osds;
-    int copies_to_delete_after_sync_count = 0;
    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
    int peering_state = 0;
    std::map<object_id, osd_recovery_op_t> recovery_ops;
-    int recovery_done = 0;
    osd_op_t *autosync_op = NULL;

    // Unstable writes
@ -126,7 +109,7 @@ class osd_t
    bool stopping = false;
    int inflight_ops = 0;
    blockstore_t *bs;
-    uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
+    uint32_t bs_block_size, bs_disk_alignment;
    ring_loop_t *ringloop;
    timerfd_manager_t *tfd = NULL;
    epoll_manager_t *epmgr = NULL;
@ -137,7 +120,6 @@ class osd_t

    // op statistics
    osd_op_stats_t prev_stats;
-    std::map<uint64_t, inode_stats_t> inode_stats;
    const char* recovery_stat_names[2] = { "degraded", "misplaced" };
    uint64_t recovery_stat_count[2][2] = { 0 };
    uint64_t recovery_stat_bytes[2][2] = { 0 };
@ -178,7 +160,6 @@ class osd_t
    void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
    void discard_list_subop(osd_op_t *list_op);
    bool stop_pg(pg_t & pg);
-    void reset_pg(pg_t & pg);
    void finish_stop_pg(pg_t & pg);

    // flushing, recovery and backfill
@ -217,7 +198,6 @@ class osd_t
    void pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval);
    void submit_primary_subops(int submit_type, uint64_t op_version, int pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
    void submit_primary_del_subops(osd_op_t *cur_op, uint64_t *cur_set, uint64_t set_size, pg_osd_set_t & loc_set);
-    void submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count);
    void submit_primary_sync_subops(osd_op_t *cur_op);
    void submit_primary_stab_subops(osd_op_t *cur_op);

@ -230,7 +210,7 @@ class osd_t
    }

 public:
-    osd_t(blockstore_config_t & config, ring_loop_t *ringloop);
+    osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
    ~osd_t();
    void force_stop(int exitcode);
    bool shutdown();
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@ -1,10 +1,9 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "osd.h"
 #include "base64.h"
 #include "etcd_state_client.h"
-#include "osd_rmw.h"

 // Startup sequence:
 //   Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
@ -33,26 +32,12 @@ void osd_t::init_cluster()
            }
            pgs[{ 1, 1 }] = (pg_t){
                .state = PG_PEERING,
-                .scheme = POOL_SCHEME_XOR,
                .pg_cursize = 0,
-                .pg_size = 3,
-                .pg_minsize = 2,
-                .pg_data_size = 2,
                .pool_id = 1,
                .pg_num = 1,
                .target_set = { 1, 2, 3 },
                .cur_set = { 0, 0, 0 },
            };
-            st_cli.pool_config[1] = (pool_config_t){
-                .exists = true,
-                .id = 1,
-                .name = "testpool",
-                .scheme = POOL_SCHEME_XOR,
-                .pg_size = 3,
-                .pg_minsize = 2,
-                .pg_count = 1,
-                .real_pg_count = 1,
-            };
            report_pg_state(pgs[{ 1, 1 }]);
            pg_counts[1] = 1;
        }
@ -142,7 +127,7 @@ json11::Json osd_t::get_statistics()
    }
    st["host"] = self_state["host"];
    json11::Json::object op_stats, subop_stats;
-    for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
+    for (int i = 0; i <= OSD_OP_MAX; i++)
    {
        op_stats[osd_op_names[i]] = json11::Json::object {
            { "count", c_cli.stats.op_stat_count[i] },
@ -150,7 +135,7 @@ json11::Json osd_t::get_statistics()
            { "bytes", c_cli.stats.op_stat_bytes[i] },
        };
    }
-    for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
+    for (int i = 0; i <= OSD_OP_MAX; i++)
    {
        subop_stats[osd_op_names[i]] = json11::Json::object {
            { "count", c_cli.stats.subop_stat_count[i] },
@ -179,47 +164,11 @@ void osd_t::report_statistics()
        return;
    }
    etcd_reporting_stats = true;
-    // Report space usage statistics as a whole
-    // Maybe we'll report it using deltas if we tune for a lot of inodes at some point
-    json11::Json::object inode_space;
-    for (auto kv: bs->get_inode_space_stats())
-    {
-        inode_space[std::to_string(kv.first)] = kv.second;
-    }
-    json11::Json::object inode_ops;
-    for (auto kv: inode_stats)
-    {
-        inode_ops[std::to_string(kv.first)] = json11::Json::object {
-            { "read", json11::Json::object {
-                { "count", kv.second.op_count[INODE_STATS_READ] },
-                { "usec", kv.second.op_sum[INODE_STATS_READ] },
-                { "bytes", kv.second.op_bytes[INODE_STATS_READ] },
-            } },
-            { "write", json11::Json::object {
-                { "count", kv.second.op_count[INODE_STATS_WRITE] },
-                { "usec", kv.second.op_sum[INODE_STATS_WRITE] },
-                { "bytes", kv.second.op_bytes[INODE_STATS_WRITE] },
-            } },
-            { "delete", json11::Json::object {
-                { "count", kv.second.op_count[INODE_STATS_DELETE] },
-                { "usec", kv.second.op_sum[INODE_STATS_DELETE] },
-                { "bytes", kv.second.op_bytes[INODE_STATS_DELETE] },
-            } },
-        };
-    }
    json11::Json::array txn = { json11::Json::object {
        { "request_put", json11::Json::object {
            { "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
            { "value", base64_encode(get_statistics().dump()) },
-        } },
-        { "request_put", json11::Json::object {
-            { "key", base64_encode(st_cli.etcd_prefix+"/osd/space/"+std::to_string(osd_num)) },
-            { "value", base64_encode(json11::Json(inode_space).dump()) },
-        } },
-        { "request_put", json11::Json::object {
-            { "key", base64_encode(st_cli.etcd_prefix+"/osd/inodestats/"+std::to_string(osd_num)) },
-            { "value", base64_encode(json11::Json(inode_ops).dump()) },
-        } },
+        } }
    } };
    for (auto & p: pgs)
    {
@ -420,7 +369,6 @@ void osd_t::create_osd_state()
        {
            st_cli.load_pgs();
        }
-        report_statistics();
    });
 }

@ -531,11 +479,7 @@ void osd_t::apply_pg_count()
            }
            if (still_active > 0)
            {
-                printf(
-                    "[OSD %lu] PG count change detected for pool %u (new is %lu, old is %u),"
-                    " but %u PG(s) are still active. This is not allowed. Exiting\n",
-                    this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
-                );
+                printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
                force_stop(1);
                return;
            }
@ -629,10 +573,7 @@ void osd_t::apply_pg_config()
                    }
                    else
                    {
-                        throw std::runtime_error(
-                            "Unexpected PG "+std::to_string(pool_id)+"/"+std::to_string(pg_num)+
-                            " state: "+std::to_string(pg_it->second.state)
-                        );
+                        throw std::runtime_error("Unexpected PG "+std::to_string(pg_num)+" state: "+std::to_string(pg_it->second.state));
                    }
                }
                auto & pg = this->pgs[{ .pool_id = pool_id, .pg_num = pg_num }];
@ -642,8 +583,6 @@ void osd_t::apply_pg_config()
                    .pg_cursize = 0,
                    .pg_size = pool_item.second.pg_size,
                    .pg_minsize = pool_item.second.pg_minsize,
-                    .pg_data_size = pg.scheme == POOL_SCHEME_REPLICATED
-                         ? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
                    .pool_id = pool_id,
                    .pg_num = pg_num,
                    .reported_epoch = pg_cfg.epoch,
@ -651,10 +590,6 @@ void osd_t::apply_pg_config()
                    .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
                    .target_set = pg_cfg.target_set,
                };
-                if (pg.scheme == POOL_SCHEME_JERASURE)
-                {
-                    use_jerasure(pg.pg_size, pg.pg_data_size, true);
-                }
                this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
                pg.print_state();
                if (pg_cfg.cur_primary == this->osd_num)
@ -701,21 +636,7 @@ void osd_t::report_pg_states()
        auto & pg = pg_it->second;
        reporting_pgs.push_back({ *it, pg.history_changed });
        std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
-        bool pg_state_exists = false;
-        if (pg.state != PG_STARTING)
-        {
-            auto pool_it = st_cli.pool_config.find(pg.pool_id);
-            if (pool_it != st_cli.pool_config.end())
-            {
-                auto pg_it = pool_it->second.pg_config.find(pg.pg_num);
-                if (pg_it != pool_it->second.pg_config.end() &&
-                    pg_it->second.cur_state != 0)
-                {
-                    pg_state_exists = true;
-                }
-            }
-        }
-        if (!pg_state_exists)
+        if (pg.state == PG_STARTING)
        {
            // Check that the PG key does not exist
            // Failed check indicates an unsuccessful PG lock attempt in this case
@ -727,7 +648,9 @@ void osd_t::report_pg_states()
        }
        else
        {
-            // Check that the key is ours if it already exists
+            // Check that the key is ours
+            // Failed check indicates success for OFFLINE pgs (PG lock is already deleted)
+            // and an unexpected race condition for started pgs (PG lock is held by someone else)
            checks.push_back(json11::Json::object {
                { "target", "LEASE" },
                { "lease", etcd_lease_id },
@ -849,16 +772,13 @@ void osd_t::report_pg_states()
            for (auto pp: reporting_pgs)
            {
                auto pg_it = this->pgs.find(pp.first);
-                if (pg_it != this->pgs.end() &&
-                    pg_it->second.state == PG_OFFLINE &&
-                    pg_state_dirty.find(pp.first) == pg_state_dirty.end())
+                if (pg_it != this->pgs.end())
                {
-                    // Forget offline PGs after reporting their state
-                    if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
+                    if (pg_it->second.state == PG_OFFLINE)
                    {
-                        use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
+                        // Remove offline PGs after reporting their state
+                        this->pgs.erase(pg_it);
                    }
-                    this->pgs.erase(pg_it);
                }
            }
            // Push other PG state updates, if any
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "osd.h"

@ -95,7 +95,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
    {
        // This flush batch is done
        std::vector<osd_op_t*> continue_ops;
-        auto & pg = pgs.at(pg_id);
+        auto & pg = pgs[pg_id];
        auto it = pg.flush_actions.begin(), prev_it = it;
        auto erase_start = it;
        while (1)
@ -166,7 +166,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
    {
        // local
        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t((blockstore_op_t){
+        op->bs_op = new blockstore_op_t({
            .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
            .callback = [this, op, pool_id, pg_num, fb](blockstore_op_t *bs_op)
            {
@ -188,7 +188,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
        op->op_type = OSD_OP_OUT;
        op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
        op->peer_fd = peer_fd;
-        op->req = (osd_any_op_t){
+        op->req = {
            .sec_stab = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@ -209,38 +209,32 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t

 bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
 {
-    if (!no_recovery)
+    for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
    {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
+        if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
        {
-            if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
+            for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
            {
-                for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
+                if (recovery_ops.find(obj_it->first) == recovery_ops.end())
                {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
-                    {
-                        op.degraded = true;
-                        op.oid = obj_it->first;
-                        return true;
-                    }
+                    op.degraded = true;
+                    op.oid = obj_it->first;
+                    return true;
                }
            }
        }
    }
-    if (!no_rebalance)
+    for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
    {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
+        if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
        {
-            if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
+            for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
            {
-                for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
+                if (recovery_ops.find(obj_it->first) == recovery_ops.end())
                {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
-                    {
-                        op.degraded = false;
-                        op.oid = obj_it->first;
-                        return true;
-                    }
+                    op.degraded = false;
+                    op.oid = obj_it->first;
+                    return true;
                }
            }
        }
@ -252,7 +246,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
 {
    op->osd_op = new osd_op_t();
    op->osd_op->op_type = OSD_OP_OUT;
-    op->osd_op->req = (osd_any_op_t){
+    op->osd_op->req = {
        .rw = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
@ -270,6 +264,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
    }
    op->osd_op->callback = [this, op](osd_op_t *osd_op)
    {
+        // Don't sync the write, it will be synced by our regular sync coroutine
        if (osd_op->reply.hdr.retval < 0)
        {
            // Error recovering object
@ -291,17 +286,6 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
        op->osd_op = NULL;
        recovery_ops.erase(op->oid);
        delete osd_op;
-        if (immediate_commit != IMMEDIATE_ALL)
-        {
-            recovery_done++;
-            if (recovery_done >= recovery_sync_batch)
-            {
-                // Force sync every <recovery_sync_batch> operations
-                // This is required not to pile up an excessive amount of delete operations
-                autosync();
-                recovery_done = 0;
-            }
-        }
        continue_recovery();
    };
    exec_op(op->osd_op);
--- a/src/osd_id.h
+++ b/src/osd_id.h
@ -1,11 +1,10 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

 #define POOL_SCHEME_REPLICATED 1
 #define POOL_SCHEME_XOR 2
-#define POOL_SCHEME_JERASURE 3
 #define POOL_ID_MAX 0x10000
 #define POOL_ID_BITS 16
 #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
--- a/src/osd_main.cpp
+++ b/src/osd_main.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "osd.h"

@ -41,13 +41,16 @@ int main(int narg, char *args[])
    signal(SIGINT, handle_sigint);
    signal(SIGTERM, handle_sigint);
    ring_loop_t *ringloop = new ring_loop_t(512);
-    osd = new osd_t(config, ringloop);
+    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
+    blockstore_t *bs = new blockstore_t(config, ringloop);
+    osd = new osd_t(config, bs, ringloop);
    while (1)
    {
        ringloop->loop();
        ringloop->wait();
    }
    delete osd;
+    delete bs;
    delete ringloop;
    return 0;
 }
--- a/src/osd_ops.cpp
+++ b/src/osd_ops.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #include "osd_ops.h"

@ -19,5 +19,4 @@ const char* osd_op_names[] = {
    "primary_write",
    "primary_sync",
    "primary_delete",
-    "ping",
 };
--- a/src/osd_ops.h
+++ b/src/osd_ops.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

 #pragma once

@ -27,8 +27,7 @@
 #define OSD_OP_WRITE                12
 #define OSD_OP_SYNC                 13
 #define OSD_OP_DELETE               14
-#define OSD_OP_PING                 15
-#define OSD_OP_MAX                  15
+#define OSD_OP_MAX                  14
 // Alignment & limit for read/write operations
 #ifndef MEM_ALIGNMENT
 #define MEM_ALIGNMENT               512
@ -71,9 +70,6 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
    uint32_t offset;
    // length
    uint32_t len;
-    // bitmap/attribute length - bitmap comes after header, but before data
-    uint32_t attr_len;
-    uint32_t pad0;
 };

 struct __attribute__((__packed__)) osd_reply_secondary_rw_t
@ -81,9 +77,6 @@ struct __attribute__((__packed__)) osd_reply_secondary_rw_t
    osd_reply_header_t header;
    // for reads and writes: assigned or read version number
    uint64_t version;
-    // for reads: bitmap/attribute length (just to double-check)
-    uint32_t attr_len;
-    uint32_t pad0;
 };

 // delete object on the secondary OSD
@ -160,6 +153,7 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
 };

 // read or write to the primary OSD (must be within individual stripe)
+// FIXME: allow returning the used block bitmap (required for snapshots)
 struct __attribute__((__packed__)) osd_op_rw_t
 {
    osd_op_header_t header;
@ -174,9 +168,6 @@ struct __attribute__((__packed__)) osd_op_rw_t
 struct __attribute__((__packed__)) osd_reply_rw_t
 {
    osd_reply_header_t header;
-    // for reads: bitmap length
-    uint32_t bitmap_len;
-    uint32_t pad0;
 };

 // sync to the primary OSD
--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include <netinet/tcp.h>
 #include <sys/epoll.h>
@ -98,13 +98,15 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
    }
 }

-// Reset PG state (when peering or stopping)
-void osd_t::reset_pg(pg_t & pg)
+// Repeer on each connect/disconnect peer event
+void osd_t::start_pg_peering(pg_t & pg)
 {
+    pg.state = PG_PEERING;
+    this->peering_state |= OSD_PEERING_PGS;
+    report_pg_state(pg);
+    // Reset PG state
    pg.cur_peers.clear();
    pg.state_dict.clear();
-    copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
-    pg.copies_to_delete_after_sync.clear();
    incomplete_objects -= pg.incomplete_objects.size();
    misplaced_objects -= pg.misplaced_objects.size();
    degraded_objects -= pg.degraded_objects.size();
@ -133,15 +135,6 @@ void osd_t::reset_pg(pg_t & pg)
            it++;
    }
    dirty_pgs.erase({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
-}
-
-// Repeer on each connect/disconnect peer event
-void osd_t::start_pg_peering(pg_t & pg)
-{
-    pg.state = PG_PEERING;
-    this->peering_state |= OSD_PEERING_PGS;
-    reset_pg(pg);
-    report_pg_state(pg);
    // Drop connections of clients who have this PG in dirty_pgs
    if (immediate_commit != IMMEDIATE_ALL)
    {
@ -182,18 +175,13 @@ void osd_t::start_pg_peering(pg_t & pg)
        // (PG history is kept up to the latest active+clean state)
        for (auto & history_set: pg.target_history)
        {
-            bool found = true;
+            bool found = false;
            for (auto history_osd: history_set)
            {
-                if (history_osd != 0)
+                if (history_osd != 0 && c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
                {
-                    found = false;
-                    if (history_osd == this->osd_num ||
-                        c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
-                    {
-                        found = true;
-                        break;
-                    }
+                    found = true;
+                    break;
                }
            }
            if (!found)
@ -319,7 +307,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = cl->peer_fd;
-        op->req = (osd_any_op_t){
+        op->req = {
            .sec_sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@ -394,7 +382,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = c_cli.osd_peer_fds[role_osd];
-        op->req = (osd_any_op_t){
+        op->req = {
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@ -466,11 +454,11 @@ bool osd_t::stop_pg(pg_t & pg)
    if (pg.peering_state)
    {
        // Stop peering
-        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
+        for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
        {
            discard_list_subop(it->second);
        }
-        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
+        for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
        {
            if (it->second.buf)
            {
@ -480,19 +468,12 @@ bool osd_t::stop_pg(pg_t & pg)
        delete pg.peering_state;
        pg.peering_state = NULL;
    }
-    if (pg.state & (PG_STOPPING | PG_OFFLINE))
+    if (!(pg.state & PG_ACTIVE))
    {
        return false;
    }
-    if (!(pg.state & PG_ACTIVE))
-    {
-        finish_stop_pg(pg);
-        return true;
-    }
    pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
-    if (pg.inflight == 0 && !pg.flush_batch &&
-        // We must either forget all PG's unstable writes or wait for it to become clean
-        dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
+    if (pg.inflight == 0 && !pg.flush_batch)
    {
        finish_stop_pg(pg);
    }
@ -506,7 +487,6 @@ bool osd_t::stop_pg(pg_t & pg)
 void osd_t::finish_stop_pg(pg_t & pg)
 {
    pg.state = PG_OFFLINE;
-    reset_pg(pg);
    report_pg_state(pg);
 }

--- a/src/osd_peering_pg.cpp
+++ b/src/osd_peering_pg.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include <unordered_map>
 #include "osd_peering_pg.h"
@ -108,7 +108,7 @@ void pg_obj_state_check_t::start_object()

 void pg_obj_state_check_t::handle_version()
 {
-    if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
+    if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
    {
        // Version is either stable or recoverable
        target_ver = last_ver;
@ -171,7 +171,7 @@ void pg_obj_state_check_t::handle_version()

 void pg_obj_state_check_t::finish_object()
 {
-    if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_data_size))
+    if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize))
    {
        // Version is either stable or recoverable
        target_ver = last_ver;
@ -233,7 +233,7 @@ void pg_obj_state_check_t::finish_object()
    {
        return;
    }
-    if (!replicated && n_roles < pg->pg_data_size)
+    if (!replicated && n_roles < pg->pg_minsize)
    {
        if (log_level > 1)
        {
--- a/src/osd_peering_pg.h
+++ b/src/osd_peering_pg.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include <map>
 #include <vector>
@ -56,13 +56,6 @@ struct obj_piece_id_t
    uint64_t osd_num;
 };

-struct obj_ver_osd_t
-{
-    uint64_t osd_num;
-    object_id oid;
-    uint64_t version;
-};
-
 struct flush_action_t
 {
    bool rollback = false, make_stable = false;
@ -82,7 +75,7 @@ struct pg_t
 {
    int state = 0;
    uint64_t scheme = 0;
-    uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, pg_data_size = 0;
+    uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0;
    pool_id_t pool_id = 0;
    pg_num_t pg_num = 0;
    uint64_t clean_count = 0, total_count = 0;
@ -101,14 +94,13 @@ struct pg_t
    std::vector<osd_num_t> cur_set;
    // same thing in state_dict-like format
    pg_osd_set_t cur_loc_set;
-    // moved object map. by default, each object is considered to reside on cur_set.
+    // moved object map. by default, each object is considered to reside on the cur_set.
    // this map stores all objects that differ.
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
    std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
    btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
    std::map<obj_piece_id_t, flush_action_t> flush_actions;
-    std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
    btree::btree_map<object_id, uint64_t> ver_override;
    pg_peering_state_t *peering_state = NULL;
    pg_flush_batch_t *flush_batch = NULL;
--- a/src/osd_peering_pg_test.cpp
+++ b/src/osd_peering_pg_test.cpp
@ -1,9 +1,8 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #define _LARGEFILE64_SOURCE

-#include "malloc_or_die.h"
 #include "osd_peering_pg.h"
 #define STRIPE_SHIFT 12

--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@ -1,8 +1,7 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "osd_primary.h"
-#include "allocator.h"

 // read: read directly or read paired stripe(s), reconstruct, return
 // write: read paired stripe(s), reconstruct, modify, calculate parity, write
@ -17,9 +16,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
 {
    // PG number is calculated from the offset
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
-    // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
+    // K = pg_minsize in case of EC/XOR, or 1 for replicated pools
    pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
-    // FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
    auto pool_cfg_it = st_cli.pool_config.find(pool_id);
    if (pool_cfg_it == st_cli.pool_config.end())
    {
@ -28,8 +26,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        return false;
    }
    auto & pool_cfg = pool_cfg_it->second;
-    uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
-    uint64_t pg_block_size = bs_block_size * pg_data_size;
+    uint64_t pg_block_size = bs_block_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize);
    object_id oid = {
        .inode = cur_op->req.rw.inode,
        // oid.stripe = starting offset of the parity stripe
@ -40,33 +37,26 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
    {
        // This OSD is not primary for this PG or the PG is inactive
-        // FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
        finish_op(cur_op, -EPIPE);
        return false;
    }
    if ((cur_op->req.rw.offset + cur_op->req.rw.len) > (oid.stripe + pg_block_size) ||
-        (cur_op->req.rw.offset % bs_bitmap_granularity) != 0 ||
-        (cur_op->req.rw.len % bs_bitmap_granularity) != 0)
+        (cur_op->req.rw.offset % bs_disk_alignment) != 0 ||
+        (cur_op->req.rw.len % bs_disk_alignment) != 0)
    {
        finish_op(cur_op, -EINVAL);
        return false;
    }
-    int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
    osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die(
-        1, sizeof(osd_primary_op_data_t) + (clean_entry_bitmap_size + sizeof(osd_rmw_stripe_t)) * stripe_count
+        1, sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size)
    );
    op_data->pg_num = pg_num;
    op_data->oid = oid;
    op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
    op_data->scheme = pool_cfg.scheme;
-    op_data->pg_data_size = pg_data_size;
    cur_op->op_data = op_data;
-    split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
-    // Allocate bitmaps along with stripes to avoid extra allocations and fragmentation
-    for (int i = 0; i < stripe_count; i++)
-    {
-        op_data->stripes[i].bmp_buf = (void*)(op_data->stripes+stripe_count) + clean_entry_bitmap_size*i;
-    }
+    split_stripes((pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_minsize),
+        bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
    pg_it->second.inflight++;
    return true;
 }
@ -106,13 +96,12 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
    {
        return;
    }
-    cur_op->reply.rw.bitmap_len = 0;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == 1)      goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    {
-        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
-        for (int role = 0; role < op_data->pg_data_size; role++)
+        auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
+        for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize); role++)
        {
            op_data->stripes[role].read_start = op_data->stripes[role].req_start;
            op_data->stripes[role].read_end = op_data->stripes[role].req_end;
@ -123,23 +112,24 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
-            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
+            cur_op->buf = alloc_read_buffer(op_data->stripes,
+                (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize), 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver,
-                (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op);
+                (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : pg.pg_minsize), pg.cur_set.data(), cur_op);
            op_data->st = 1;
        }
        else
        {
            // PG may be degraded or have misplaced objects
            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-            if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
+            if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
            {
                finish_op(cur_op, -EIO);
                return;
            }
            // Submit reads
+            op_data->pg_minsize = pg.pg_minsize;
            op_data->pg_size = pg.pg_size;
-            op_data->scheme = pg.scheme;
            op_data->degraded = 1;
            cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
@ -154,22 +144,17 @@ resume_2:
        finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
-    cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
    if (op_data->degraded)
    {
        // Reconstruct missing stripes
+        // FIXME: Always EC(k+1) for now. Add different coding schemes
        osd_rmw_stripe_t *stripes = op_data->stripes;
-        if (op_data->scheme == POOL_SCHEME_XOR)
-        {
-            reconstruct_stripes_xor(stripes, op_data->pg_size, clean_entry_bitmap_size);
-        }
-        else if (op_data->scheme == POOL_SCHEME_JERASURE)
-        {
-            reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
-        }
-        cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
-        for (int role = 0; role < op_data->pg_size; role++)
+        for (int role = 0; role < op_data->pg_minsize; role++)
        {
+            if (stripes[role].read_end != 0 && stripes[role].missing)
+            {
+                reconstruct_stripe_xor(stripes, op_data->pg_size, role);
+            }
            if (stripes[role].req_end != 0)
            {
                // Send buffer in parts to avoid copying
@ -182,7 +167,6 @@ resume_2:
    }
    else
    {
-        cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
        cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
    }
    finish_op(cur_op, cur_op->req.rw.len);
@ -222,7 +206,7 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+    auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
    if (op_data->st == 1)      goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
@ -249,7 +233,6 @@ resume_1:
        op_data->stripes[0].write_start = op_data->stripes[0].req_start;
        op_data->stripes[0].write_end = op_data->stripes[0].req_end;
        op_data->stripes[0].write_buf = cur_op->buf;
-        op_data->stripes[0].bmp_buf = (void*)(op_data->stripes+1);
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
@ -262,7 +245,7 @@ resume_1:
    else
    {
        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
-            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
+            pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
        if (!cur_op->rmw_buf)
        {
            // Refuse partial overwrite of an incomplete object
@ -285,9 +268,7 @@ resume_3:
    pg.ver_override[op_data->oid] = op_data->fact_ver;
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
-        // Set bitmap bits
-        bitmap_set(op_data->stripes[0].bmp_buf, op_data->stripes[0].write_start, op_data->stripes[0].write_end, bs_bitmap_granularity);
-        // Possibly copy new data from the request into the recovery buffer
+        // Only (possibly) copy new data from the request into the recovery buffer
        if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
            op_data->stripes[0].write_end != bs_block_size))
        {
@ -304,14 +285,7 @@ resume_3:
    else
    {
        // Recover missing stripes, calculate parity
-        if (pg.scheme == POOL_SCHEME_XOR)
-        {
-            calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
-        }
-        else if (pg.scheme == POOL_SCHEME_JERASURE)
-        {
-            calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
-        }
+        calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
    }
    // Send writes
    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
@ -379,34 +353,9 @@ resume_7:
                recovery_stat_bytes[0][recovery_type] += op_data->stripes[role].write_end - op_data->stripes[role].write_start;
            }
        }
-        // Any kind of a non-clean object can have extra chunks, because we don't record objects
-        // as degraded & misplaced or incomplete & misplaced at the same time. So try to remove extra chunks
-        if (immediate_commit != IMMEDIATE_ALL)
-        {
-            // We can't remove extra chunks yet if fsyncs are explicit, because
-            // new copies may not be committed to stable storage yet
-            // We can only remove extra chunks after a successful SYNC for this PG
-            for (auto & chunk: op_data->object_state->osd_set)
-            {
-                // Check is the same as in submit_primary_del_subops()
-                if (op_data->scheme == POOL_SCHEME_REPLICATED
-                    ? !contains_osd(pg.cur_set.data(), pg.pg_size, chunk.osd_num)
-                    : (chunk.osd_num != pg.cur_set[chunk.role]))
-                {
-                    pg.copies_to_delete_after_sync.push_back((obj_ver_osd_t){
-                        .osd_num = chunk.osd_num,
-                        .oid = {
-                            .inode = op_data->oid.inode,
-                            .stripe = op_data->oid.stripe | (op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role),
-                        },
-                        .version = op_data->fact_ver,
-                    });
-                    copies_to_delete_after_sync_count++;
-                }
-            }
-        }
-        else
+        if (op_data->object_state->state & OBJ_MISPLACED)
        {
+            // Remove extra chunks
            submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
            if (op_data->n_subops > 0)
            {
@ -430,19 +379,19 @@ continue_others:
    // Remove version override
    pg.ver_override.erase(op_data->oid);
    object_id oid = op_data->oid;
-    // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
-    auto next_it = pg.write_queue.find(oid);
-    if (next_it != pg.write_queue.end() && next_it->second == cur_op)
-    {
-        pg.write_queue.erase(next_it++);
-    }
-    // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
    finish_op(cur_op, cur_op->reply.hdr.retval);
    // Continue other write operations to the same object
-    if (next_it != pg.write_queue.end() && next_it->first == oid)
+    auto next_it = pg.write_queue.find(oid);
+    auto this_it = next_it;
+    if (this_it != pg.write_queue.end() && this_it->second == cur_op)
    {
-        osd_op_t *next_op = next_it->second;
-        continue_primary_write(next_op);
+        next_it++;
+        pg.write_queue.erase(this_it);
+        if (next_it != pg.write_queue.end() && next_it->first == oid)
+        {
+            osd_op_t *next_op = next_it->second;
+            continue_primary_write(next_op);
+        }
    }
 }

@ -528,11 +477,7 @@ resume_7:
        }
        // Remember PG as dirty to drop the connection when PG goes offline
        // (this is required because of the "lazy sync")
-        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
-        if (cl_it != c_cli.clients.end())
-        {
-            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
-        }
+        c_cli.clients[cur_op->peer_fd]->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    }
    return true;
@ -552,8 +497,6 @@ void osd_t::continue_primary_sync(osd_op_t *cur_op)
    else if (op_data->st == 4) goto resume_4;
    else if (op_data->st == 5) goto resume_5;
    else if (op_data->st == 6) goto resume_6;
-    else if (op_data->st == 7) goto resume_7;
-    else if (op_data->st == 8) goto resume_8;
    assert(op_data->st == 0);
    if (syncs_in_progress.size() > 0)
    {
@ -615,38 +558,15 @@ resume_2:
        this->unstable_writes.clear();
    }
    {
-        void *dirty_buf = malloc_or_die(
-            sizeof(pool_pg_num_t)*dirty_pgs.size() +
-            sizeof(osd_num_t)*dirty_osds.size() +
-            sizeof(obj_ver_osd_t)*this->copies_to_delete_after_sync_count
-        );
+        void *dirty_buf = malloc_or_die(sizeof(pool_pg_num_t)*dirty_pgs.size() + sizeof(osd_num_t)*dirty_osds.size());
        op_data->dirty_pgs = (pool_pg_num_t*)dirty_buf;
        op_data->dirty_osds = (osd_num_t*)(dirty_buf + sizeof(pool_pg_num_t)*dirty_pgs.size());
        op_data->dirty_pg_count = dirty_pgs.size();
        op_data->dirty_osd_count = dirty_osds.size();
-        if (this->copies_to_delete_after_sync_count)
-        {
-            op_data->copies_to_delete_count = 0;
-            op_data->copies_to_delete = (obj_ver_osd_t*)(op_data->dirty_osds + op_data->dirty_osd_count);
-            for (auto dirty_pg_num: dirty_pgs)
-            {
-                auto & pg = pgs.at(dirty_pg_num);
-                assert(pg.copies_to_delete_after_sync.size() <= this->copies_to_delete_after_sync_count);
-                memcpy(
-                    op_data->copies_to_delete + op_data->copies_to_delete_count,
-                    pg.copies_to_delete_after_sync.data(),
-                    sizeof(obj_ver_osd_t)*pg.copies_to_delete_after_sync.size()
-                );
-                op_data->copies_to_delete_count += pg.copies_to_delete_after_sync.size();
-                this->copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
-                pg.copies_to_delete_after_sync.clear();
-            }
-            assert(this->copies_to_delete_after_sync_count == 0);
-        }
        int dpg = 0;
        for (auto dirty_pg_num: dirty_pgs)
        {
-            pgs.at(dirty_pg_num).inflight++;
+            pgs[dirty_pg_num].inflight++;
            op_data->dirty_pgs[dpg++] = dirty_pg_num;
        }
        dirty_pgs.clear();
@ -703,7 +623,7 @@ resume_6:
                        .pool_id = INODE_POOL(w.oid.inode),
                        .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
                    };
-                    if (pgs.at(wpg).state & PG_ACTIVE)
+                    if (pgs[wpg].state & PG_ACTIVE)
                    {
                        uint64_t & dest = this->unstable_writes[(osd_object_id_t){
                            .osd_num = unstable_osd.osd_num,
@ -715,44 +635,12 @@ resume_6:
                }
            }
        }
-        if (op_data->copies_to_delete)
-        {
-            // Return 'copies to delete' back into respective PGs
-            for (int i = 0; i < op_data->copies_to_delete_count; i++)
-            {
-                auto & w = op_data->copies_to_delete[i];
-                auto & pg = pgs.at((pool_pg_num_t){
-                    .pool_id = INODE_POOL(w.oid.inode),
-                    .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
-                });
-                if (pg.state & PG_ACTIVE)
-                {
-                    pg.copies_to_delete_after_sync.push_back(w);
-                    copies_to_delete_after_sync_count++;
-                }
-            }
-        }
-    }
-    else if (op_data->copies_to_delete)
-    {
-        // Actually delete copies which we wanted to delete
-        submit_primary_del_batch(cur_op, op_data->copies_to_delete, op_data->copies_to_delete_count);
-resume_7:
-        op_data->st = 7;
-        return;
-resume_8:
-        if (op_data->errors > 0)
-        {
-            goto resume_6;
-        }
    }
    for (int i = 0; i < op_data->dirty_pg_count; i++)
    {
        auto & pg = pgs.at(op_data->dirty_pgs[i]);
        pg.inflight--;
-        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
-            // We must either forget all PG's unstable writes or wait for it to become clean
-            dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
+        if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
        {
            finish_stop_pg(pg);
        }
@ -846,7 +734,7 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
        return;
    }
    osd_primary_op_data_t *op_data = cur_op->op_data;
-    auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
+    auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
    if (op_data->st == 1)      goto resume_1;
    else if (op_data->st == 2) goto resume_2;
    else if (op_data->st == 3) goto resume_3;
--- a/src/osd_primary.h
+++ b/src/osd_primary.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -25,7 +25,7 @@ struct osd_primary_op_data_t
    uint64_t fact_ver = 0;
    uint64_t scheme = 0;
    int n_subops = 0, done = 0, errors = 0, epipe = 0;
-    int degraded = 0, pg_size, pg_data_size;
+    int degraded = 0, pg_size, pg_minsize;
    osd_rmw_stripe_t *stripes;
    osd_op_t *subops = NULL;
    uint64_t *prev_set = NULL;
@ -38,8 +38,4 @@ struct osd_primary_op_data_t
    osd_num_t *dirty_osds = NULL;
    int dirty_osd_count = 0;
    obj_ver_id *unstable_writes = NULL;
-    obj_ver_osd_t *copies_to_delete = NULL;
-    int copies_to_delete_count = 0;
 };
-
-bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num);
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "osd_primary.h"

@ -11,7 +11,7 @@ void osd_t::autosync()
    {
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
-        autosync_op->req = (osd_any_op_t){
+        autosync_op->req = {
            .sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@ -36,39 +36,14 @@ void osd_t::autosync()
 void osd_t::finish_op(osd_op_t *cur_op, int retval)
 {
    inflight_ops--;
-    if (cur_op->req.hdr.opcode == OSD_OP_READ ||
-        cur_op->req.hdr.opcode == OSD_OP_WRITE ||
-        cur_op->req.hdr.opcode == OSD_OP_DELETE)
-    {
-        // Track inode statistics
-        if (!cur_op->tv_end.tv_sec)
-        {
-            clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
-        }
-        uint64_t usec = (
-            (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
-            (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
-        );
-        int inode_st_op = cur_op->req.hdr.opcode == OSD_OP_DELETE
-            ? INODE_STATS_DELETE
-            : (cur_op->req.hdr.opcode == OSD_OP_READ ? INODE_STATS_READ : INODE_STATS_WRITE);
-        inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
-        inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
-        if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
-            inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
-        else
-            inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
-    }
    if (cur_op->op_data)
    {
        if (cur_op->op_data->pg_num > 0)
        {
-            auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num });
+            auto & pg = pgs[{ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num }];
            pg.inflight--;
            assert(pg.inflight >= 0);
-            if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
-                // We must either forget all PG's unstable writes or wait for it to become clean
-                dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
+            if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
            {
                finish_stop_pg(pg);
            }
@ -87,7 +62,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
    }
    else
    {
-        // FIXME add separate magic number for primary ops
+        // FIXME add separate magic number
        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
        if (cl_it != c_cli.clients.end())
        {
@ -152,8 +127,6 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
            {
                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
                subops[i].op_type = (uint64_t)cur_op;
-                subops[i].bitmap = stripes[stripe_num].bmp_buf;
-                subops[i].bitmap_len = clean_entry_bitmap_size;
                subops[i].bs_op = new blockstore_op_t({
                    .opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
@ -168,7 +141,6 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
                    .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
                    .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
                    .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
-                    .bitmap = stripes[stripe_num].bmp_buf,
                });
 #ifdef OSD_DEBUG
                printf(
@ -183,8 +155,6 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
            {
                subops[i].op_type = OSD_OP_OUT;
                subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
-                subops[i].bitmap = stripes[stripe_num].bmp_buf;
-                subops[i].bitmap_len = clean_entry_bitmap_size;
                subops[i].req.sec_rw = {
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
@ -198,7 +168,6 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
                    .version = op_version,
                    .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
                    .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
-                    .attr_len = wr ? clean_entry_bitmap_size : 0,
                };
 #ifdef OSD_DEBUG
                printf(
@ -326,7 +295,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
            uint64_t version = subop->reply.sec_rw.version;
 #ifdef OSD_DEBUG
            uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
-                ? c_cli.clients[subop->peer_fd]->osd_num : osd_num;
+                ? c_cli.clients[subop->peer_fd].osd_num : osd_num;
            printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
 #endif
            if (op_data->fact_ver != 0 && op_data->fact_ver != version)
@ -384,7 +353,7 @@ void osd_t::cancel_primary_write(osd_op_t *cur_op)
    }
 }

-bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num)
+static bool contains_osd(osd_num_t *osd_set, uint64_t size, osd_num_t osd_num)
 {
    for (uint64_t i = 0; i < size; i++)
    {
@ -400,82 +369,78 @@ void osd_t::submit_primary_del_subops(osd_op_t *cur_op, osd_num_t *cur_set, uint
 {
    osd_primary_op_data_t *op_data = cur_op->op_data;
    bool rep = op_data->scheme == POOL_SCHEME_REPLICATED;
-    obj_ver_osd_t extra_chunks[loc_set.size()];
-    int chunks_to_del = 0;
+    int extra_chunks = 0;
+    // ordered comparison for EC/XOR, unordered for replicated pools
    for (auto & chunk: loc_set)
    {
-        // ordered comparison for EC/XOR, unordered for replicated pools
-        if (!cur_set || (rep
-            ? !contains_osd(cur_set, set_size, chunk.osd_num)
-            : (chunk.osd_num != cur_set[chunk.role])))
+        if (!cur_set || (rep ? !contains_osd(cur_set, set_size, chunk.osd_num) : chunk.osd_num != cur_set[chunk.role]))
        {
-            extra_chunks[chunks_to_del++] = (obj_ver_osd_t){
-                .osd_num = chunk.osd_num,
-                .oid = {
-                    .inode = op_data->oid.inode,
-                    .stripe = op_data->oid.stripe | (rep ? 0 : chunk.role),
-                },
-                // Same version as write
-                .version = op_data->fact_ver,
-            };
+            extra_chunks++;
        }
    }
-    submit_primary_del_batch(cur_op, extra_chunks, chunks_to_del);
-}
-
-void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_delete, int chunks_to_delete_count)
-{
-    osd_primary_op_data_t *op_data = cur_op->op_data;
-    op_data->n_subops = chunks_to_delete_count;
+    op_data->n_subops = extra_chunks;
    op_data->done = op_data->errors = 0;
-    if (!op_data->n_subops)
+    if (!extra_chunks)
    {
        return;
    }
-    osd_op_t *subops = new osd_op_t[chunks_to_delete_count];
+    osd_op_t *subops = new osd_op_t[extra_chunks];
    op_data->subops = subops;
-    for (int i = 0; i < chunks_to_delete_count; i++)
+    int i = 0;
+    for (auto & chunk: loc_set)
    {
-        auto & chunk = chunks_to_delete[i];
-        if (chunk.osd_num == this->osd_num)
+        if (!cur_set || (rep ? !contains_osd(cur_set, set_size, chunk.osd_num) : chunk.osd_num != cur_set[chunk.role]))
        {
-            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
-            subops[i].op_type = (uint64_t)cur_op;
-            subops[i].bs_op = new blockstore_op_t({
-                .opcode = BS_OP_DELETE,
-                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
-                {
-                    handle_primary_bs_subop(subop);
-                },
-                .oid = chunk.oid,
-                .version = chunk.version,
-            });
-            bs->enqueue_op(subops[i].bs_op);
-        }
-        else
-        {
-            subops[i].op_type = OSD_OP_OUT;
-            subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
-            subops[i].req.sec_del = {
-                .header = {
-                    .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
-                    .opcode = OSD_OP_SEC_DELETE,
-                },
-                .oid = chunk.oid,
-                .version = chunk.version,
-            };
-            subops[i].callback = [cur_op, this](osd_op_t *subop)
+            int stripe_num = op_data->scheme == POOL_SCHEME_REPLICATED ? 0 : chunk.role;
+            if (chunk.osd_num == this->osd_num)
            {
-                int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
-                handle_primary_subop(subop, cur_op);
-                if (fail_fd >= 0)
+                clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
+                subops[i].op_type = (uint64_t)cur_op;
+                subops[i].bs_op = new blockstore_op_t({
+                    .opcode = BS_OP_DELETE,
+                    .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
+                    {
+                        handle_primary_bs_subop(subop);
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | stripe_num,
+                    },
+                    // Same version as write
+                    .version = op_data->fact_ver,
+                });
+                bs->enqueue_op(subops[i].bs_op);
+            }
+            else
+            {
+                subops[i].op_type = OSD_OP_OUT;
+                subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
+                subops[i].req.sec_del = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = c_cli.next_subop_id++,
+                        .opcode = OSD_OP_SEC_DELETE,
+                    },
+                    .oid = {
+                        .inode = op_data->oid.inode,
+                        .stripe = op_data->oid.stripe | stripe_num,
+                    },
+                    // Same version as write
+                    .version = op_data->fact_ver,
+                };
+                subops[i].callback = [cur_op, this](osd_op_t *subop)
                {
-                    // delete operation failed, drop the connection
-                    c_cli.stop_client(fail_fd);
-                }
-            };
-            c_cli.outbox_push(&subops[i]);
+                    int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
+                    handle_primary_subop(subop, cur_op);
+                    if (fail_fd >= 0)
+                    {
+                        // delete operation failed, drop the connection
+                        c_cli.stop_client(fail_fd);
+                    }
+                };
+                c_cli.outbox_push(&subops[i]);
+            }
+            i++;
        }
    }
 }
@ -545,7 +510,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
        {
            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
            subops[i].op_type = (uint64_t)cur_op;
-            subops[i].bs_op = new blockstore_op_t((blockstore_op_t){
+            subops[i].bs_op = new blockstore_op_t({
                .opcode = BS_OP_STABLE,
                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
                {
--- a/src/osd_rmw.cpp
+++ b/src/osd_rmw.cpp
@ -1,19 +1,12 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

-#include <stdexcept>
 #include <string.h>
 #include <assert.h>
-#include <jerasure/reed_sol.h>
-#include <jerasure.h>
-#include <map>
-#include "allocator.h"
 #include "xor.h"
 #include "osd_rmw.h"
 #include "malloc_or_die.h"

-#define OSD_JERASURE_W 8
-
 static inline void extend_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & stripe)
 {
    if (stripe.read_end == 0)
@ -82,203 +75,44 @@ void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start,
    }
 }

-void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size, uint32_t bitmap_size)
+void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role)
 {
-    for (int role = 0; role < pg_size; role++)
+    int prev = -2;
+    for (int other = 0; other < pg_size; other++)
    {
-        if (stripes[role].read_end != 0 && stripes[role].missing)
+        if (other != role)
        {
-            // Reconstruct missing stripe (XOR k+1)
-            int prev = -2;
-            for (int other = 0; other < pg_size; other++)
+            if (prev == -2)
            {
-                if (other != role)
-                {
-                    if (prev == -2)
-                    {
-                        prev = other;
-                    }
-                    else if (prev >= 0)
-                    {
-                        assert(stripes[role].read_start >= stripes[prev].read_start &&
-                            stripes[role].read_start >= stripes[other].read_start);
-                        memxor(
-                            stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
-                            stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
-                            stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
-                        );
-                        memxor(stripes[prev].bmp_buf, stripes[other].bmp_buf, stripes[role].bmp_buf, bitmap_size);
-                        prev = -1;
-                    }
-                    else
-                    {
-                        assert(stripes[role].read_start >= stripes[other].read_start);
-                        memxor(
-                            stripes[role].read_buf,
-                            stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
-                            stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
-                        );
-                        memxor(stripes[role].bmp_buf, stripes[other].bmp_buf, stripes[role].bmp_buf, bitmap_size);
-                    }
-                }
+                prev = other;
+            }
+            else if (prev >= 0)
+            {
+                assert(stripes[role].read_start >= stripes[prev].read_start &&
+                    stripes[role].read_start >= stripes[other].read_start);
+                memxor(
+                    stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
+                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
+                );
+                prev = -1;
+            }
+            else
+            {
+                assert(stripes[role].read_start >= stripes[other].read_start);
+                memxor(
+                    stripes[role].read_buf,
+                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
+                );
            }
        }
    }
 }

-struct reed_sol_erased_t
+int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size)
 {
-    int *data;
-    int size;
-};
-
-inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
-{
-    for (int i = 0; i < a.size && i < b.size; i++)
-    {
-        if (a.data[i] < b.data[i])
-            return -1;
-        else if (a.data[i] > b.data[i])
-            return 1;
-    }
-    return 0;
-}
-
-struct reed_sol_matrix_t
-{
-    int refs = 0;
-    int *data;
-    std::map<reed_sol_erased_t, int*> decodings;
-};
-
-std::map<uint64_t, reed_sol_matrix_t> matrices;
-
-void use_jerasure(int pg_size, int pg_minsize, bool use)
-{
-    uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
-    auto rs_it = matrices.find(key);
-    if (rs_it == matrices.end())
-    {
-        if (!use)
-        {
-            return;
-        }
-        int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W);
-        matrices[key] = (reed_sol_matrix_t){
-            .refs = 0,
-            .data = matrix,
-        };
-        rs_it = matrices.find(key);
-    }
-    rs_it->second.refs += (!use ? -1 : 1);
-    if (rs_it->second.refs <= 0)
-    {
-        free(rs_it->second.data);
-        for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
-        {
-            int *data = dec_it->second;
-            rs_it->second.decodings.erase(dec_it++);
-            free(data);
-        }
-        matrices.erase(rs_it);
-    }
-}
-
-reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize)
-{
-    uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
-    auto rs_it = matrices.find(key);
-    if (rs_it == matrices.end())
-    {
-        throw std::runtime_error("jerasure matrix not initialized");
-    }
-    return &rs_it->second;
-}
-
-// jerasure_matrix_decode() decodes all chunks at once and tries to reencode all missing coding chunks.
-// we don't need it. also it makes an extra allocation of int *erased on every call and doesn't cache
-// the decoding matrix.
-// all these flaws are fixed in this function:
-int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
-{
-    int edd = 0;
-    int erased[pg_size] = { 0 };
-    for (int i = 0; i < pg_size; i++)
-        if (stripes[i].read_end == 0 || stripes[i].missing)
-            erased[i] = 1;
-    for (int i = 0; i < pg_minsize; i++)
-        if (stripes[i].read_end != 0 && stripes[i].missing)
-            edd++;
-    if (edd == 0)
-        return NULL;
-    reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
-    auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size });
-    if (dec_it == matrix->decodings.end())
-    {
-        int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
-        int *decoding_matrix = dm_ids + pg_minsize;
-        if (!dm_ids)
-            throw std::bad_alloc();
-        // we always use row_k_ones=1 and w=8 (OSD_JERASURE_W)
-        if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data, erased, decoding_matrix, dm_ids) < 0)
-        {
-            free(dm_ids);
-            throw std::runtime_error("jerasure_make_decoding_matrix() failed");
-        }
-        int *erased_copy = dm_ids + pg_minsize + pg_minsize*pg_minsize;
-        memcpy(erased_copy, erased, pg_size*sizeof(int));
-        matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, dm_ids);
-        return dm_ids;
-    }
-    return dec_it->second;
-}
-
-void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
-{
-    int *dm_ids = get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
-    if (!dm_ids)
-    {
-        return;
-    }
-    int *decoding_matrix = dm_ids + pg_minsize;
-    char *data_ptrs[pg_size] = { 0 };
-    for (int role = 0; role < pg_minsize; role++)
-    {
-        if (stripes[role].read_end != 0 && stripes[role].missing)
-        {
-            for (int other = 0; other < pg_size; other++)
-            {
-                if (stripes[other].read_end != 0 && !stripes[other].missing)
-                {
-                    assert(stripes[other].read_start <= stripes[role].read_start);
-                    assert(stripes[other].read_end >= stripes[role].read_end);
-                    data_ptrs[other] = (char*)(stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start));
-                }
-            }
-            data_ptrs[role] = (char*)stripes[role].read_buf;
-            jerasure_matrix_dotprod(
-                pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
-                data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
-            );
-            for (int other = 0; other < pg_size; other++)
-            {
-                if (stripes[other].read_end != 0 && !stripes[other].missing)
-                {
-                    data_ptrs[other] = (char*)(stripes[other].bmp_buf);
-                }
-            }
-            data_ptrs[role] = (char*)stripes[role].bmp_buf;
-            jerasure_matrix_dotprod(
-                pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
-                data_ptrs, data_ptrs+pg_minsize, bitmap_size
-            );
-        }
-    }
-}
-
-int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size)
-{
-    for (int role = 0; role < pg_minsize; role++)
+    for (int role = 0; role < minsize; role++)
    {
        if (stripes[role].read_end != 0 && osd_set[role] == 0)
        {
@ -287,21 +121,21 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg
            // We need at least pg_minsize stripes to recover the lost part.
            // FIXME: LRC EC and similar don't require to read all other stripes.
            int exist = 0;
-            for (int j = 0; j < pg_size; j++)
+            for (int j = 0; j < size; j++)
            {
                if (osd_set[j] != 0)
                {
                    extend_read(stripes[role].read_start, stripes[role].read_end, stripes[j]);
                    exist++;
-                    if (exist >= pg_minsize)
+                    if (exist >= minsize)
                    {
                        break;
                    }
                }
            }
-            if (exist < pg_minsize)
+            if (exist < minsize)
            {
-                // Less than pg_minsize stripes are available for this object
+                // Fewer than minsize stripes are available for this object
                return -1;
            }
        }
@ -335,8 +169,7 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
 }

 void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
-    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set,
-    uint64_t chunk_size, uint32_t bitmap_size)
+    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size)
 {
    // Generic parity modification (read-modify-write) algorithm
    // Read -> Reconstruct missing chunks -> Calc parity chunks -> Write
@ -536,13 +369,22 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
    }
 }

-static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
-    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_granularity,
-    uint32_t &start, uint32_t &end)
+void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
 {
+    int pg_minsize = pg_size-1;
+    for (int role = 0; role < pg_size; role++)
+    {
+        if (stripes[role].read_end != 0 && stripes[role].missing)
+        {
+            // Reconstruct missing stripe (XOR k+1)
+            reconstruct_stripe_xor(stripes, pg_size, role);
+            break;
+        }
+    }
+    uint32_t start = 0, end = 0;
    if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
    {
-        // start & end are required for calc_rmw_parity
+        // start & end are required for the next two if()s
        for (int role = 0; role < pg_minsize; role++)
        {
            if (stripes[role].req_end != 0)
@ -560,20 +402,6 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
            }
        }
    }
-    // Set bitmap bits accordingly
-    if (bitmap_granularity > 0)
-    {
-        for (int role = 0; role < pg_minsize; role++)
-        {
-            if (stripes[role].req_end != 0)
-            {
-                bitmap_set(
-                    stripes[role].bmp_buf, stripes[role].req_start,
-                    stripes[role].req_end-stripes[role].req_start, bitmap_granularity
-                );
-            }
-        }
-    }
    if (write_osd_set != read_osd_set)
    {
        for (int role = 0; role < pg_minsize; role++)
@ -593,11 +421,34 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int
            }
        }
    }
-}
-
-static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
-    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end)
-{
+    if (write_osd_set[pg_minsize] != 0 && end != 0)
+    {
+        // Calculate new parity (XOR k+1)
+        int parity = pg_minsize, prev = -2;
+        for (int other = 0; other < pg_minsize; other++)
+        {
+            if (prev == -2)
+            {
+                prev = other;
+            }
+            else
+            {
+                int n1 = 0, n2 = 0;
+                buf_len_t xor1[3], xor2[3];
+                if (prev == -1)
+                {
+                    xor1[n1++] = { .buf = stripes[parity].write_buf, .len = end-start };
+                }
+                else
+                {
+                    get_old_new_buffers(stripes[prev], start, end, xor1, n1);
+                    prev = -1;
+                }
+                get_old_new_buffers(stripes[other], start, end, xor2, n2);
+                xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, end-start);
+            }
+        }
+    }
    if (write_osd_set != read_osd_set)
    {
        for (int role = pg_minsize; role < pg_size; role++)
@ -617,7 +468,7 @@ static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size,
        }
    }
 #ifdef RMW_DEBUG
-    printf("calc_rmw_parity:\n");
+    printf("calc_rmw_xor:\n");
    for (int role = 0; role < pg_size; role++)
    {
        auto & s = stripes[role];
@ -633,119 +484,3 @@ static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size,
    }
 #endif
 }
-
-void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set,
-    uint32_t chunk_size, uint32_t bitmap_size)
-{
-    uint32_t bitmap_granularity = bitmap_size > 0 ? chunk_size / bitmap_size / 8 : 0;
-    int pg_minsize = pg_size-1;
-    reconstruct_stripes_xor(stripes, pg_size, bitmap_size);
-    uint32_t start = 0, end = 0;
-    calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, bitmap_granularity, start, end);
-    if (write_osd_set[pg_minsize] != 0 && end != 0)
-    {
-        // Calculate new parity (XOR k+1)
-        int parity = pg_minsize, prev = -2;
-        for (int other = 0; other < pg_minsize; other++)
-        {
-            if (prev == -2)
-            {
-                prev = other;
-            }
-            else
-            {
-                int n1 = 0, n2 = 0;
-                buf_len_t xor1[3], xor2[3];
-                if (prev == -1)
-                {
-                    xor1[n1++] = { .buf = stripes[parity].write_buf, .len = end-start };
-                    memxor(stripes[parity].bmp_buf, stripes[other].bmp_buf, stripes[parity].bmp_buf, bitmap_size);
-                }
-                else
-                {
-                    memxor(stripes[prev].bmp_buf, stripes[other].bmp_buf, stripes[parity].bmp_buf, bitmap_size);
-                    get_old_new_buffers(stripes[prev], start, end, xor1, n1);
-                    prev = -1;
-                }
-                get_old_new_buffers(stripes[other], start, end, xor2, n2);
-                xor_multiple_buffers(xor1, n1, xor2, n2, stripes[parity].write_buf, end-start);
-            }
-        }
-    }
-    calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
-}
-
-void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
-    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size)
-{
-    uint32_t bitmap_granularity = bitmap_size > 0 ? chunk_size / bitmap_size / 8 : 0;
-    reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
-    reconstruct_stripes_jerasure(stripes, pg_size, pg_minsize, bitmap_size);
-    uint32_t start = 0, end = 0;
-    calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, bitmap_granularity, start, end);
-    if (end != 0)
-    {
-        int i;
-        for (i = pg_minsize; i < pg_size; i++)
-        {
-            if (write_osd_set[i] != 0)
-                break;
-        }
-        if (i < pg_size)
-        {
-            // Calculate new coding chunks
-            buf_len_t bufs[pg_size][3];
-            int nbuf[pg_size] = { 0 }, curbuf[pg_size] = { 0 };
-            uint32_t positions[pg_size];
-            void *data_ptrs[pg_size] = { 0 };
-            for (int i = 0; i < pg_minsize; i++)
-            {
-                get_old_new_buffers(stripes[i], start, end, bufs[i], nbuf[i]);
-                positions[i] = start;
-            }
-            for (int i = pg_minsize; i < pg_size; i++)
-            {
-                bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
-                positions[i] = start;
-            }
-            uint32_t pos = start;
-            while (pos < end)
-            {
-                uint32_t next_end = end;
-                for (int i = 0; i < pg_size; i++)
-                {
-                    assert(curbuf[i] < nbuf[i]);
-                    assert(bufs[i][curbuf[i]].buf);
-                    data_ptrs[i] = bufs[i][curbuf[i]].buf + pos-positions[i];
-                    uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
-                    if (next_end > this_end)
-                        next_end = this_end;
-                }
-                assert(next_end > pos);
-                for (int i = 0; i < pg_size; i++)
-                {
-                    uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
-                    if (next_end >= this_end)
-                    {
-                        positions[i] += bufs[i][curbuf[i]].len;
-                        curbuf[i]++;
-                    }
-                }
-                jerasure_matrix_encode(
-                    pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data,
-                    (char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
-                );
-                pos = next_end;
-            }
-            for (int i = 0; i < pg_size; i++)
-            {
-                data_ptrs[i] = stripes[i].bmp_buf;
-            }
-            jerasure_matrix_encode(
-                pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data,
-                (char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
-            );
-        }
-    }
-    calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
-}
--- a/src/osd_rmw.h
+++ b/src/osd_rmw.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -20,33 +20,21 @@ struct buf_len_t
 struct osd_rmw_stripe_t
 {
    void *read_buf, *write_buf;
-    void *bmp_buf;
    uint32_t req_start, req_end;
    uint32_t read_start, read_end;
    uint32_t write_start, write_end;
    bool missing;
 };

-// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
-
 void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t len, osd_rmw_stripe_t *stripes);

-void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size, uint32_t bitmap_size);
+void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role);

-int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size);
+int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size);

 void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);

 void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_set,
-    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set,
-    uint64_t chunk_size, uint32_t bitmap_size);
+    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);

-void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set,
-    uint32_t chunk_size, uint32_t bitmap_size);
-
-void use_jerasure(int pg_size, int pg_minsize, bool use);
-
-void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size);
-
-void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
-    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size);
+void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
--- a/src/osd_rmw_test.cpp
+++ b/src/osd_rmw_test.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #define RMW_DEBUG

@ -18,8 +18,107 @@ void test9();
 void test10();
 void test11();
 void test12();
-void test13();
-void test14();
+
+/***
+
+Cases:
+
+1. split(offset=128K-4K, len=8K)
+   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
+
+2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
+
+3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
+   = { read: [ 0, 128K-4K ] }
+
+4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = {
+     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   + check write2 buffer
+
+5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+
+6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+
+7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity_xor(): {
+     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write1==read1,
+   }
+   + check write1 buffer
+   + check write2 buffer
+
+8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+   + check write2 buffer
+
+9. object recovery case:
+   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     input buffer: NULL,
+     rmw buffer: [ read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity_xor(): {
+     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write0==read0,
+   }
+   + check write0 buffer
+
+10. full overwrite/recovery case:
+   calc_rmw(offset=0, len=256K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer
+
+11. partial recovery case:
+   calc_rmw(offset=128K, len=128K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 0, 0 ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write1 ],
+     rmw buffer: [ write2, read0 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer
+
+***/

 int main(int narg, char *args[])
 {
@ -43,10 +142,6 @@ int main(int narg, char *args[])
    test11();
    // Test 12
    test12();
-    // Test 13
-    test13();
-    // Test 14
-    test14();
    // End
    printf("all ok\n");
    return 0;
@ -74,19 +169,6 @@ void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
    printf("\n");
 }

-/***
-
-1. split(offset=128K-4K, len=8K)
-   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
-
-   read(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
-
-   cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
-   = { read: [ 0, 128K-4K ] }
-
-***/
-
 void test1()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@ -106,36 +188,19 @@ void test1()
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
    // Test 1.3
-    stripes[0] = (osd_rmw_stripe_t){ .req_start = 128*1024-4096, .req_end = 128*1024 };
+    stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
    cover_read(0, 128*1024, stripes[0]);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
 }

-/***
-
-4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = {
-     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   + check write2 buffer
-
-***/
-
 void test4()
 {
-    const uint32_t bmp = 4;
-    unsigned bitmaps[3] = { 0 };
    osd_num_t osd_set[3] = { 1, 0, 3 };
    osd_rmw_stripe_t stripes[3] = { 0 };
    // Test 4.1
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
-    for (int i = 0; i < 3; i++)
-        stripes[i].bmp_buf = bitmaps+i;
    void* write_buf = malloc(8192);
-    void* rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024, bmp);
+    void* rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 4096 && stripes[2].read_end == 128*1024);
@ -153,13 +218,7 @@ void test4()
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
    set_pattern(stripes[1].read_buf, 128*1024-4096, UINT64_MAX); // didn't read it, it's missing
    set_pattern(stripes[2].read_buf, 128*1024-4096, 0); // old parity = 0
-    memset(stripes[0].bmp_buf, 0, bmp);
-    memset(stripes[1].bmp_buf, 0, bmp);
-    memset(stripes[2].bmp_buf, 0, bmp);
-    calc_rmw_parity_xor(stripes, 3, osd_set, osd_set, 128*1024, bmp);
-    assert(*(uint32_t*)stripes[0].bmp_buf == 0x80000000);
-    assert(*(uint32_t*)stripes[1].bmp_buf == 0x00000001);
-    assert(*(uint32_t*)stripes[2].bmp_buf == 0x80000001); // XOR
+    calc_rmw_parity_xor(stripes, 3, osd_set, osd_set, 128*1024);
    check_pattern(stripes[2].write_buf, 4096, PATTERN0^PATTERN1); // new parity
    check_pattern(stripes[2].write_buf+4096, 128*1024-4096*2, 0); // new parity
    check_pattern(stripes[2].write_buf+128*1024-4096, 4096, PATTERN0^PATTERN1); // new parity
@ -167,19 +226,6 @@ void test4()
    free(write_buf);
 }

-/***
-
-5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-
-***/
-
 void test5()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@ -191,7 +237,7 @@ void test5()
    assert(stripes[2].req_end == 0);
    // Test 5.2
    void *write_buf = malloc(64*1024*3);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024, 0);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, osd_set, 128*1024);
    assert(stripes[0].read_start == 64*1024 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 64*1024 && stripes[2].read_end == 128*1024);
@ -208,19 +254,6 @@ void test5()
    free(write_buf);
 }

-/***
-
-6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-
-***/
-
 void test6()
 {
    osd_num_t osd_set[3] = { 1, 2, 3 };
@ -228,7 +261,7 @@ void test6()
    // Test 6.1
    split_stripes(2, 128*1024, 0, 64*1024*3, stripes);
    void *write_buf = malloc(64*1024*3);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, osd_set, 128*1024, 0);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, osd_set, 128*1024);
    assert(stripes[0].read_end == 0);
    assert(stripes[1].read_start == 64*1024 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_end == 0);
@ -245,24 +278,6 @@ void test6()
    free(write_buf);
 }

-/***
-
-7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity_xor(): {
-     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write1==read1,
-   }
-   + check write1 buffer
-   + check write2 buffer
-
-***/
-
 void test7()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@ -271,7 +286,7 @@ void test7()
    // Test 7.1
    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
    void *write_buf = malloc(8192);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024, 0);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
@ -289,7 +304,7 @@ void test7()
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); // old data
    set_pattern(stripes[1].read_buf, 128*1024, UINT64_MAX); // didn't read it, it's missing
    set_pattern(stripes[2].read_buf, 128*1024, 0); // old parity = 0
-    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
@ -303,19 +318,6 @@ void test7()
    free(write_buf);
 }

-/***
-
-8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-   + check write2 buffer
-
-***/
-
 void test8()
 {
    osd_num_t osd_set[3] = { 0, 2, 3 };
@ -324,7 +326,7 @@ void test8()
    // Test 8.1
    split_stripes(2, 128*1024, 0, 128*1024+4096, stripes);
    void *write_buf = malloc(128*1024+4096);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024, 0);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 2, write_osd_set, 128*1024);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
@ -340,7 +342,7 @@ void test8()
    // Test 8.2
    set_pattern(write_buf, 128*1024+4096, PATTERN0);
    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN1);
-    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); // recheck again
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);     // recheck again
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024); // recheck again
@ -353,24 +355,6 @@ void test8()
    free(write_buf);
 }

-/***
-
-9. object recovery case:
-   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
-     input buffer: NULL,
-     rmw buffer: [ read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity_xor(): {
-     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
-     write0==read0,
-   }
-   + check write0 buffer
-
-***/
-
 void test9()
 {
    osd_num_t osd_set[3] = { 0, 2, 3 };
@ -383,7 +367,7 @@ void test9()
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    // Test 9.1
    void *write_buf = NULL;
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
@ -399,7 +383,7 @@ void test9()
    // Test 9.2
    set_pattern(stripes[1].read_buf, 128*1024, 0);
    set_pattern(stripes[2].read_buf, 128*1024, PATTERN1);
-    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 0);
@ -411,21 +395,6 @@ void test9()
    free(rmw_buf);
 }

-/***
-
-10. full overwrite/recovery case:
-   calc_rmw(offset=0, len=256K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2 ],
-   }
-   then, after calc_rmw_parity_xor(): all the same
-   + check write2 buffer
-
-***/
-
 void test10()
 {
    osd_num_t osd_set[3] = { 1, 0, 0 };
@ -438,7 +407,7 @@ void test10()
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    // Test 10.1
    void *write_buf = malloc(256*1024);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
    assert(rmw_buf);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
@ -455,7 +424,7 @@ void test10()
    // Test 10.2
    set_pattern(stripes[0].write_buf, 128*1024, PATTERN1);
    set_pattern(stripes[1].write_buf, 128*1024, PATTERN2);
-    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
@ -467,21 +436,6 @@ void test10()
    free(write_buf);
 }

-/***
-
-11. partial recovery case:
-   calc_rmw(offset=128K, len=128K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
-     write: [ [ 0, 0 ], [ 0, 128K ], [ 0, 128K ] ],
-     input buffer: [ write1 ],
-     rmw buffer: [ write2, read0 ],
-   }
-   then, after calc_rmw_parity_xor(): all the same
-   + check write2 buffer
-
-***/
-
 void test11()
 {
    osd_num_t osd_set[3] = { 1, 0, 0 };
@ -494,7 +448,7 @@ void test11()
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    // Test 11.1
    void *write_buf = malloc(256*1024);
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
    assert(rmw_buf);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
@ -511,7 +465,7 @@ void test11()
    // Test 11.2
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
    set_pattern(stripes[1].write_buf, 128*1024, PATTERN2);
-    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
@ -523,33 +477,18 @@ void test11()
    free(write_buf);
 }

-/***
-
-12. parity recovery case:
-   calc_rmw(offset=0, len=0, read_osd_set=[1,2,0], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 128K ] ],
-     input buffer: [],
-     rmw buffer: [ write2, read0, read1 ],
-   }
-   then, after calc_rmw_parity_xor(): all the same
-   + check write2 buffer
-
-***/
-
 void test12()
 {
    osd_num_t osd_set[3] = { 1, 2, 0 };
    osd_num_t write_osd_set[3] = { 1, 2, 3 };
    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 12.0
+    // Test 12.0
    split_stripes(2, 128*1024, 0, 0, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    // Test 12.1
-    void *rmw_buf = calc_rmw(NULL, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
+    // Test 12.1
+    void *rmw_buf = calc_rmw(NULL, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
    assert(rmw_buf);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
@ -563,10 +502,10 @@ void test12()
    assert(stripes[0].write_buf == NULL);
    assert(stripes[1].write_buf == NULL);
    assert(stripes[2].write_buf == rmw_buf);
-    // Test 12.2
+    // Test 12.2
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
    set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
-    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
@ -576,229 +515,3 @@ void test12()
    check_pattern(stripes[2].write_buf, 128*1024, PATTERN1^PATTERN2);
    free(rmw_buf);
 }
-
-/***
-
-13. basic jerasure 2+2 test
-   calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0,0], write_set=[1,2,3,4])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, write3, read0, read1 ],
-   }
-   then, after calc_rmw_parity_jerasure(): all the same
-   then simulate read with read_osd_set=[0,0,3,4] and check read0,read1 buffers
-
-***/
-
-void test13()
-{
-    use_jerasure(4, 2, true);
-    osd_num_t osd_set[4] = { 1, 2, 0, 0 };
-    osd_num_t write_osd_set[4] = { 1, 2, 3, 4 };
-    osd_rmw_stripe_t stripes[4] = { 0 };
-    // Test 13.0
-    void *write_buf = malloc_or_die(8192);
-    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
-    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
-    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
-    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
-    // Test 13.1
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 4, write_osd_set, 128*1024, 0);
-    assert(rmw_buf);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
-    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
-    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
-    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
-    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
-    assert(stripes[0].read_buf == rmw_buf+2*128*1024);
-    assert(stripes[1].read_buf == rmw_buf+3*128*1024-4096);
-    assert(stripes[2].read_buf == NULL);
-    assert(stripes[3].read_buf == NULL);
-    assert(stripes[0].write_buf == write_buf);
-    assert(stripes[1].write_buf == write_buf+4096);
-    assert(stripes[2].write_buf == rmw_buf);
-    assert(stripes[3].write_buf == rmw_buf+128*1024);
-    // Test 13.2 - encode
-    set_pattern(write_buf, 8192, PATTERN3);
-    set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
-    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
-    calc_rmw_parity_jerasure(stripes, 4, 2, osd_set, write_osd_set, 128*1024, 0);
-    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
-    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
-    assert(stripes[0].write_buf == write_buf);
-    assert(stripes[1].write_buf == write_buf+4096);
-    assert(stripes[2].write_buf == rmw_buf);
-    assert(stripes[3].write_buf == rmw_buf+128*1024);
-    // Test 13.3 - full decode and verify
-    osd_num_t read_osd_set[4] = { 0, 0, 3, 4 };
-    memset(stripes, 0, sizeof(stripes));
-    split_stripes(2, 128*1024, 0, 256*1024, stripes);
-    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
-    assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
-    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
-    for (int role = 0; role < 4; role++)
-    {
-        stripes[role].read_start = stripes[role].req_start;
-        stripes[role].read_end = stripes[role].req_end;
-    }
-    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
-    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
-    assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
-    void *read_buf = alloc_read_buffer(stripes, 4, 0);
-    assert(read_buf);
-    assert(stripes[0].read_buf == read_buf);
-    assert(stripes[1].read_buf == read_buf+128*1024);
-    assert(stripes[2].read_buf == read_buf+2*128*1024);
-    assert(stripes[3].read_buf == read_buf+3*128*1024);
-    memcpy(read_buf+2*128*1024, rmw_buf, 128*1024);
-    memcpy(read_buf+3*128*1024, rmw_buf+128*1024, 128*1024);
-    reconstruct_stripes_jerasure(stripes, 4, 2, 0);
-    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
-    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
-    check_pattern(stripes[1].read_buf, 4096, PATTERN3);
-    check_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
-    free(read_buf);
-    // Test 13.4 - partial decode (only 1st chunk) and verify
-    memset(stripes, 0, sizeof(stripes));
-    split_stripes(2, 128*1024, 0, 128*1024, stripes);
-    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
-    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
-    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
-    for (int role = 0; role < 4; role++)
-    {
-        stripes[role].read_start = stripes[role].req_start;
-        stripes[role].read_end = stripes[role].req_end;
-    }
-    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
-    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
-    assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
-    read_buf = alloc_read_buffer(stripes, 4, 0);
-    assert(read_buf);
-    assert(stripes[0].read_buf == read_buf);
-    assert(stripes[1].read_buf == NULL);
-    assert(stripes[2].read_buf == read_buf+128*1024);
-    assert(stripes[3].read_buf == read_buf+2*128*1024);
-    memcpy(read_buf+128*1024, rmw_buf, 128*1024);
-    memcpy(read_buf+2*128*1024, rmw_buf+128*1024, 128*1024);
-    reconstruct_stripes_jerasure(stripes, 4, 2, 0);
-    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
-    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
-    free(read_buf);
-    // Huh done
-    free(rmw_buf);
-    free(write_buf);
-    use_jerasure(4, 2, false);
-}
-
-/***
-
-13. basic jerasure 2+1 test
-   calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1 ],
-   }
-   then, after calc_rmw_parity_jerasure(): all the same
-   then simulate read with read_osd_set=[0,2,3] and check read0 buffer
-
-***/
-
-void test14()
-{
-    const int bmp = 4;
-    use_jerasure(3, 2, true);
-    osd_num_t osd_set[3] = { 1, 2, 0 };
-    osd_num_t write_osd_set[3] = { 1, 2, 3 };
-    osd_rmw_stripe_t stripes[3] = { 0 };
-    unsigned bitmaps[3] = { 0 };
-    // Test 13.0
-    void *write_buf = malloc_or_die(8192);
-    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
-    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
-    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
-    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    // Test 13.1
-    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, bmp);
-    for (int i = 0; i < 3; i++)
-        stripes[i].bmp_buf = bitmaps+i;
-    assert(rmw_buf);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
-    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
-    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
-    assert(stripes[0].read_buf == rmw_buf+128*1024);
-    assert(stripes[1].read_buf == rmw_buf+2*128*1024-4096);
-    assert(stripes[2].read_buf == NULL);
-    assert(stripes[0].write_buf == write_buf);
-    assert(stripes[1].write_buf == write_buf+4096);
-    assert(stripes[2].write_buf == rmw_buf);
-    // Test 13.2 - encode
-    set_pattern(write_buf, 8192, PATTERN3);
-    set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
-    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
-    memset(stripes[0].bmp_buf, 0, bmp);
-    memset(stripes[1].bmp_buf, 0, bmp);
-    memset(stripes[2].bmp_buf, 0, bmp);
-    calc_rmw_parity_jerasure(stripes, 3, 2, osd_set, write_osd_set, 128*1024, bmp);
-    assert(*(uint32_t*)stripes[0].bmp_buf == 0x80000000);
-    assert(*(uint32_t*)stripes[1].bmp_buf == 0x00000001);
-    assert(*(uint32_t*)stripes[2].bmp_buf == 0x80000001); // jerasure 2+1 is still just XOR
-    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
-    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
-    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
-    assert(stripes[0].write_buf == write_buf);
-    assert(stripes[1].write_buf == write_buf+4096);
-    assert(stripes[2].write_buf == rmw_buf);
-    // Test 13.3 - decode and verify
-    osd_num_t read_osd_set[4] = { 0, 2, 3 };
-    memset(stripes, 0, sizeof(stripes));
-    split_stripes(2, 128*1024, 0, 128*1024, stripes);
-    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
-    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
-    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    for (int role = 0; role < 3; role++)
-    {
-        stripes[role].read_start = stripes[role].req_start;
-        stripes[role].read_end = stripes[role].req_end;
-    }
-    assert(extend_missing_stripes(stripes, read_osd_set, 2, 3) == 0);
-    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
-    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
-    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
-    void *read_buf = alloc_read_buffer(stripes, 3, 0);
-    for (int i = 0; i < 3; i++)
-        stripes[i].bmp_buf = bitmaps+i;
-    assert(read_buf);
-    assert(stripes[0].read_buf == read_buf);
-    assert(stripes[1].read_buf == read_buf+128*1024);
-    assert(stripes[2].read_buf == read_buf+2*128*1024);
-    set_pattern(stripes[1].read_buf, 4096, PATTERN3);
-    set_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
-    memcpy(stripes[2].read_buf, rmw_buf, 128*1024);
-    reconstruct_stripes_jerasure(stripes, 3, 2, bmp);
-    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
-    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
-    free(read_buf);
-    // Huh done
-    free(rmw_buf);
-    free(write_buf);
-    use_jerasure(3, 2, false);
-}
--- a/src/osd_secondary.cpp
+++ b/src/osd_secondary.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include "osd.h"

@ -17,14 +17,10 @@ void osd_t::secondary_op_callback(osd_op_t *op)
    {
        op->reply.sec_del.version = op->bs_op->version;
    }
-    if (op->req.hdr.opcode == OSD_OP_SEC_READ)
+    if (op->req.hdr.opcode == OSD_OP_SEC_READ &&
+        op->bs_op->retval > 0)
    {
-        if (op->bs_op->retval >= 0)
-            op->reply.sec_rw.attr_len = clean_entry_bitmap_size;
-        else
-            op->reply.sec_rw.attr_len = 0;
-        if (op->bs_op->retval > 0)
-            op->iov.push_back(op->buf, op->bs_op->retval);
+        op->iov.push_back(op->buf, op->bs_op->retval);
    }
    else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
    {
@ -59,22 +55,11 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
    {
-        if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
-        {
-            // Allocate memory for the read operation
-            if (clean_entry_bitmap_size > sizeof(unsigned))
-                cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(clean_entry_bitmap_size);
-            else
-                cur_op->bitmap = &cur_op->bmp_data;
-            if (cur_op->req.sec_rw.len > 0)
-                cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
-        }
        cur_op->bs_op->oid = cur_op->req.sec_rw.oid;
        cur_op->bs_op->version = cur_op->req.sec_rw.version;
        cur_op->bs_op->offset = cur_op->req.sec_rw.offset;
        cur_op->bs_op->len = cur_op->req.sec_rw.len;
        cur_op->bs_op->buf = cur_op->buf;
-        cur_op->bs_op->bitmap = cur_op->bitmap;
 #ifdef OSD_STUB
        cur_op->bs_op->retval = cur_op->bs_op->len;
 #endif
--- a/src/osd_test.cpp
+++ b/src/osd_test.cpp
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #include <sys/types.h>
 #include <sys/socket.h>
--- a/Show More
+++ b/Show More