Release v0.5.4

- Fix a rare hang, more or less reproducible with very slow drives
- Fix a hang with the no_same_sector_overwrites mode
Attempt forced trim when stopping an overrun flusher
2021-02-24 01:40:30 +03:00 · 2021-02-24 01:33:01 +03:00 · 2021-02-23 18:50:51 +03:00 · 2021-02-03 00:38:57 +03:00 · 2021-02-02 02:05:41 +03:00 · 2021-02-02 01:32:23 +03:00
72 changed files with 3160 additions and 790 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+.git
+build
+mon/node_modules
+*.o
+*.so
+osd
+stub_osd
+stub_uring_osd
+stub_bench
+osd_test
+dump_journal
+nbd_proxy
+rm_inode
+fio
+qemu
+rpm/*.Dockerfile
+debian/*.Dockerfile
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,28 @@
+BINDIR ?= /usr/bin
+LIBDIR ?= /usr/lib/x86_64-linux-gnu
+QEMU_PLUGINDIR ?= /usr/lib/x86_64-linux-gnu/qemu
+
 BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
 	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
 # -fsanitize=address
-CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
+CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always -I/usr/include/jerasure
 all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
 clean:
-	rm -f *.o
+	rm -f *.o libblockstore.so libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
+
+# Install libraries, tools and the QEMU block driver; honours DESTDIR, LIBDIR, BINDIR and QEMU_PLUGINDIR
+.PHONY: install
+install: all
+	mkdir -p $(DESTDIR)$(LIBDIR)/vitastor $(DESTDIR)$(BINDIR) $(DESTDIR)$(QEMU_PLUGINDIR)
+	install -m 0755 libfio_sec_osd.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 libfio_cluster.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 libfio_blockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 libblockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 osd $(DESTDIR)$(BINDIR)/vitastor-osd
+	install -m 0755 dump_journal $(DESTDIR)$(BINDIR)/vitastor-dump-journal
+	install -m 0755 nbd_proxy $(DESTDIR)$(BINDIR)/vitastor-nbd
+	install -m 0755 rm_inode $(DESTDIR)$(BINDIR)/vitastor-rm
+	install -m 0755 qemu_driver.so $(DESTDIR)$(QEMU_PLUGINDIR)/block-vitastor.so

 dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
 	g++ $(CXXFLAGS) -o $@ $< crc32c.o
@@ -12,19 +30,19 @@ dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
 libblockstore.so: $(BLOCKSTORE_OBJS)
 	g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
 libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
-	g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
+	g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor',-rpath,'$$ORIGIN' -shared -o $@ fio_engine.o json11.o libblockstore.so -ltcmalloc_minimal -luring

 OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
 	osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o osd_ops.o pg_states.o \
 	osd_rmw.o json11.o base64.o timerfd_manager.o epoll_manager.o
 osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
-	g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
+	g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor',-rpath,'$$ORIGIN' -o $@ osd_main.cpp $(OSD_OBJS) libblockstore.so -ltcmalloc_minimal -luring -lJerasure

 stub_osd: stub_osd.o rw_blocking.o
 	g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal

 osd_rmw_test: osd_rmw_test.o
-	g++ $(CXXFLAGS) -o $@ osd_rmw_test.o
+	g++ $(CXXFLAGS) -o $@ osd_rmw_test.o -lJerasure -fsanitize=address

 STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
 stub_uring_osd: $(STUB_URING_OSD_OBJS)
@@ -55,12 +73,12 @@ qemu_driver.o: qemu_driver.c qemu_proxy.h
 		-I qemu/include $(CXXFLAGS) -c -o $@ $<

 qemu_driver.so: qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS)
-	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $< $(FIO_CLUSTER_OBJS) qemu_driver.o qemu_proxy.o -luring
+	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) qemu_driver.o qemu_proxy.o -luring

 test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
-	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
-test: test.cpp osd_peering_pg.o
-	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
+	g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor',-rpath,'$$ORIGIN' -o test_blockstore test_blockstore.cpp timerfd_interval.o libblockstore.so -ltcmalloc_minimal -luring
+test_shit: test_shit.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o test_shit test_shit.cpp -luring -lm
 test_allocator: test_allocator.cpp allocator.o
 	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o

@@ -165,12 +183,12 @@ stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
 test_allocator.o: test_allocator.cpp allocator.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
 	g++ $(CXXFLAGS) -c -o $@ $<
+test_shit.o: test_shit.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
 timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
--- a/README.md
+++ b/README.md
@@ -16,7 +16,8 @@ breaking changes in the future. However, the following is implemented:

 - Basic part: highly-available block storage with symmetric clustering and no SPOF
 - Performance ;-D
-- Two redundancy schemes: Replication and XOR n+1 (simplest case of EC)
+- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
+  based on jerasure library with any number of data and parity drives in a group
 - Configuration via simple JSON data structures in etcd
 - Automatic data distribution over OSDs, with support for:
  - Mathematical optimization for better uniformity and less data movement
@@ -31,16 +32,14 @@ breaking changes in the future. However, the following is implemented:
 - QEMU driver (built out-of-tree)
 - Loadable fio engine for benchmarks (also built out-of-tree)
 - NBD proxy for kernel mounts
-- Inode removal tool (./rm_inode)
+- Inode removal tool (vitastor-rm)
+- Packaging for Debian and CentOS

 ## Roadmap

-- Packaging for Debian and, probably, CentOS too
 - OSD creation tool (OSDs currently have to be created by hand)
 - Other administrative tools
 - Per-inode I/O and space usage statistics
-- jerasure EC support with any number of data and parity drives in a group
-- Parallel usage of multiple network interfaces
 - Proxmox and OpenNebula plugins
 - iSCSI proxy
 - Inode metadata storage in etcd
@@ -50,6 +49,7 @@ breaking changes in the future. However, the following is implemented:
 - Checksums
 - SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
 - RDMA and NVDIMM support
+- Web GUI
 - Compression (possibly)
 - Read caching using system page cache (possibly)

@@ -280,7 +280,34 @@ Vitastor with single-thread NBD on the same hardware:
 - Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio)
 - Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio)

-## Building
+## Installation
+
+### Debian
+
+- Trust Vitastor package signing key:
+  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
+- Add Vitastor package repository to your /etc/apt/sources.list:
+  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
+  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+- For Debian 10 (Buster) also enable backports repository:
+  `deb http://deb.debian.org/debian buster-backports main`
+- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64`
+
+### CentOS
+
+- Add Vitastor package repository:
+  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
+  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
+- Enable EPEL: `yum/dnf install epel-release`
+- Enable additional CentOS repositories:
+  - CentOS 7: `yum install centos-release-scl`
+  - CentOS 8: `dnf install centos-release-advanced-virtualization`
+- Enable elrepo-kernel:
+  - CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
+  - CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
+- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
+
+### Building from Source

 - Install Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly recommended because
  there is at least one known io_uring hang with 5.4 and an HP SmartArray controller.
@@ -290,10 +317,10 @@ Vitastor with single-thread NBD on the same hardware:
  branch release-3.4, because there is a bug in upstream etcd which makes Vitastor OSDs fail to
  move PGs out of "starting" state if you have at least around ~500 PGs or so. The custom build
  will be unnecessary when etcd merges the fix: https://github.com/etcd-io/etcd/pull/12402.
-- Install node.js 12 or newer.
-- Install gcc and g++ 9.x or later.
+- Install node.js 10 or newer.
+- Install gcc and g++ 8.x or newer.
 - Clone https://yourcmc.ru/git/vitalif/vitastor/ with submodules.
-- Install QEMU 4.x or 5.x, get its source, begin to build it, stop the build and copy headers:
+- Install QEMU 3.0+, get its source, begin to build it, stop the build and copy headers:
   - `<qemu>/include` &rarr; `<vitastor>/qemu/include`
   - Debian:
      * Use qemu packages from the main repository
@@ -303,11 +330,15 @@ Vitastor with single-thread NBD on the same hardware:
      * Use qemu packages from the Advanced-Virtualization repository. To enable it, run
        `yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
      * `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
-      * `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
+      * For QEMU 3.0+: `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
+      * For QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
   - `config-host.h` and `qapi` are required because they contain generated headers
-- Install fio 3.16 or later, get its source and symlink it into `<vitastor>/fio`.
+- You can also rebuild QEMU with a patch that makes LD_PRELOAD unnecessary to load vitastor driver.
+  See `qemu-*.*-vitastor.patch`.
+- Install fio 3.7 or later, get its source and symlink it into `<vitastor>/fio`.
 - Build Vitastor with `make -j8`.
-- Copy binaries somewhere.
+- Run `make install` (optionally with `LIBDIR=/usr/lib64 QEMU_PLUGINDIR=/usr/lib64/qemu-kvm`
+  if you're using an RPM-based distro).

 ## Running

@@ -322,8 +353,9 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
 - Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
  (if all your drives have capacitors).
 - Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
-- Calculate offsets for your drives with `node ./mon/simple-offsets.js --device /dev/sdX`.
-- Make systemd units for your OSDs. Look at `./mon/make-units.sh` for example.
+  For jerasure pools the configuration should look like the following: `"2":{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
+- Calculate offsets for your drives with `node /usr/lib/vitastor/mon/simple-offsets.js --device /dev/sdX`.
+- Make systemd units for your OSDs. Look at `/usr/lib/vitastor/mon/make-units.sh` for example.
  Notable configuration variables from the example:
  - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
  - `immediate_commit all` - use this if all your drives are server-grade.
@@ -342,35 +374,32 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
    setting is set, it is also required to raise `journal_sector_buffer_count` setting, which is the
    number of dirty journal sectors that may be written to at the same time.
 - `systemctl start vitastor.target` everywhere.
-- Start any number of monitors: `cd mon; node mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
+- Start any number of monitors: `node /usr/lib/vitastor/mon/mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
 - At this point, one of the monitors will configure PGs and OSDs will start them.
 - You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
-- Run tests with (for example): `fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
+- Run tests with (for example): `fio -thread -ioengine=/usr/lib/x86_64-linux-gnu/vitastor/libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
 - Upload VM disk image with qemu-img (for example):
  ```
-  LD_PRELOAD=./qemu_driver.so qemu-img convert -f qcow2 debian10.qcow2 -p
+  LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img convert -f qcow2 debian10.qcow2 -p
    -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
  ```
 - Run QEMU with (for example):
  ```
-  LD_PRELOAD=./qemu_driver.so qemu-system-x86_64 -enable-kvm -m 1024
+  LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-system-x86_64 -enable-kvm -m 1024
    -drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
    -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
    -vnc 0.0.0.0:0
  ```
 - Remove inode with (for example):
  ```
-  ./rm_inode --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
+  vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
  ```

 ## Known Problems

 - Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
  deletion because proper handling of object cleanup in a cluster should be "three-phase"
-  and it's currently not implemented. Inode removal tool currently can't handle unclean
-  objects, so incomplete objects become undeletable. This will be fixed in near future
-  by allowing the inode removal tool to delete unclean objects. With this problem fixed
-  you'll be able just to repeat the removal again.
+  and it's currently not implemented. Just repeat the removal in this case.

 ## Implementation Principles

--- a/blockstore_flush.cpp
+++ b/blockstore_flush.cpp
@@ -76,6 +76,9 @@ void journal_flusher_t::loop()

 void journal_flusher_t::enqueue_flush(obj_ver_id ov)
 {
+#ifdef BLOCKSTORE_DEBUG
+    printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+#endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
    {
@@ -94,8 +97,11 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
    }
 }

-void journal_flusher_t::unshift_flush(obj_ver_id ov)
+void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
 {
+#ifdef BLOCKSTORE_DEBUG
+    printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+#endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
    {
@@ -105,15 +111,38 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
    else
    {
        flush_versions[ov.oid] = ov.version;
-        flush_queue.push_front(ov.oid);
+        if (!force)
+            flush_queue.push_front(ov.oid);
    }
-    if (!dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
+    if (force)
+        flush_queue.push_front(ov.oid);
+    if (force || !dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
    {
        dequeuing = true;
        bs->ringloop->wakeup();
    }
 }

+// Cancels a queued flush of <oid>: forgets its version and drops its first flush_queue entry.
+// Apart from the debug trace, does nothing when <oid> has no entry in flush_versions.
+void journal_flusher_t::remove_flush(object_id oid)
+{
+#ifdef BLOCKSTORE_DEBUG
+    printf("remove_flush %lx:%lx\n", oid.inode, oid.stripe);
+#endif
+    auto v_it = flush_versions.find(oid);
+    if (v_it != flush_versions.end())
+    {
+        flush_versions.erase(v_it);
+        // The queue holds plain object ids, so it has to be scanned linearly
+        auto q_it = flush_queue.begin();
+        while (q_it != flush_queue.end() && !(*q_it == oid))
+            q_it++;
+        if (q_it != flush_queue.end())
+            flush_queue.erase(q_it);
+    }
+}
+
 void journal_flusher_t::request_trim()
 {
    dequeuing = true;
@@ -194,6 +223,7 @@ bool journal_flusher_co::loop()
 resume_0:
    if (!flusher->flush_queue.size() || !flusher->dequeuing)
    {
+stop_flusher:
        if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
        {
            // Attempt forced trim
@@ -298,9 +328,7 @@ resume_0:
 #ifdef BLOCKSTORE_DEBUG
                    printf("No older flushes, stopping\n");
 #endif
-                    flusher->dequeuing = false;
-                    wait_state = 0;
-                    return true;
+                    goto stop_flusher;
                }
            }
        }
@@ -319,8 +347,8 @@ resume_1:
            return false;
        }
        // Writes and deletes shouldn't happen at the same time
-        assert(!(copy_count > 0 || has_writes) || !has_delete);
-        if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
+        assert(!has_writes || !has_delete);
+        if (!has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
        {
            // Nothing to flush
            bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
@@ -445,8 +473,8 @@ resume_1:
            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
            if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
            {
-                printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lx (%lx:%lx) with %lx:%lx\n",
-                    clean_loc, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
+                printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx) with %lx:%lx\n",
+                    clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
                exit(1);
            }
            new_entry->oid = cur.oid;
@@ -513,7 +541,7 @@ resume_1:
        if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
        {
            // Requeue version
-            flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
+            flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second }, false);
        }
        flusher->sync_to_repeat.erase(repeat_it);
    trim_journal:
@@ -602,7 +630,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        {
            char err[1024];
            snprintf(
-                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu state during flush: %d",
+                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: %d",
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
            );
            throw std::runtime_error(err);
--- a/blockstore_flush.h
+++ b/blockstore_flush.h
@@ -107,5 +107,6 @@ public:
    void request_trim();
    void release_trim();
    void enqueue_flush(obj_ver_id oid);
-    void unshift_flush(obj_ver_id oid);
+    void unshift_flush(obj_ver_id oid, bool force);
+    void remove_flush(object_id oid);
 };
--- a/blockstore_impl.cpp
+++ b/blockstore_impl.cpp
@@ -287,7 +287,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
    {
        int next = ((journal.cur_sector + 1) % journal.sector_count);
-        if (journal.sector_info[next].usage_count > 0 ||
+        if (journal.sector_info[next].flush_count > 0 ||
            journal.sector_info[next].dirty)
        {
            // do not submit
--- a/blockstore_impl.h
+++ b/blockstore_impl.h
@@ -30,12 +30,13 @@
 #define BS_ST_BIG_WRITE 0x02
 #define BS_ST_DELETE 0x03

-#define BS_ST_WAIT_BIG 0x10
-#define BS_ST_IN_FLIGHT 0x20
-#define BS_ST_SUBMITTED 0x30
-#define BS_ST_WRITTEN 0x40
-#define BS_ST_SYNCED 0x50
-#define BS_ST_STABLE 0x60
+#define BS_ST_WAIT_DEL 0x10
+#define BS_ST_WAIT_BIG 0x20
+#define BS_ST_IN_FLIGHT 0x30
+#define BS_ST_SUBMITTED 0x40
+#define BS_ST_WRITTEN 0x50
+#define BS_ST_SYNCED 0x60
+#define BS_ST_STABLE 0x70

 #define BS_ST_INSTANT 0x100

@@ -153,6 +154,8 @@ struct blockstore_op_private_t

    // Write
    struct iovec iov_zerofill[3];
+    // Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
+    uint64_t real_version;

    // Sync
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@@ -111,7 +111,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
                {
                    // free the previous block
 #ifdef BLOCKSTORE_DEBUG
-                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i >> block_order);
+                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i);
 #endif
                    bs->data_alloc->set(clean_it->second.location >> block_order, false);
                }
@@ -557,11 +557,56 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            {
 #ifdef BLOCKSTORE_DEBUG
                printf(
-                    "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n",
+                    "je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
                    je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
-                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
+                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order
                );
 #endif
+                auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
+                    .oid = je->big_write.oid,
+                    .version = UINT64_MAX,
+                });
+                if (dirty_it != bs->dirty_db.begin() && bs->dirty_db.size() > 0)
+                {
+                    dirty_it--;
+                    if (dirty_it->first.oid == je->big_write.oid &&
+                        dirty_it->first.version >= je->big_write.version &&
+                        (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE)
+                    {
+                        // It is allowed to overwrite a deleted object with a
+                        // version number smaller than deletion version number,
+                        // because the presence of a BIG_WRITE entry means that
+                        // its data and metadata are already flushed.
+                        // We don't know if newer versions are flushed, but
+                        // the previous delete definitely is.
+                        // So we flush previous dirty entries, but retain the clean one.
+                        // This feature is required for writes happening shortly
+                        // after deletes.
+                        auto dirty_end = dirty_it;
+                        dirty_end++;
+                        while (1)
+                        {
+                            if (dirty_it == bs->dirty_db.begin())
+                            {
+                                break;
+                            }
+                            dirty_it--;
+                            if (dirty_it->first.oid != je->big_write.oid)
+                            {
+                                dirty_it++;
+                                break;
+                            }
+                        }
+                        auto clean_it = bs->clean_db.find(je->big_write.oid);
+                        bs->erase_dirty(
+                            dirty_it, dirty_end,
+                            clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
+                        );
+                        // Remove it from the flusher's queue, too
+                        // Otherwise it may end up referring to a small unstable write after reading the rest of the journal
+                        bs->flusher->remove_flush(je->big_write.oid);
+                    }
+                }
                auto clean_it = bs->clean_db.find(je->big_write.oid);
                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->big_write.version)
--- a/blockstore_journal.cpp
+++ b/blockstore_journal.cpp
@@ -6,7 +6,7 @@
 blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
 {
    this->bs = bs;
-    sectors_required = 0;
+    sectors_to_write = 0;
    next_pos = bs->journal.next_free;
    next_sector = bs->journal.cur_sector;
    first_sector = -1;
@@ -20,23 +20,26 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    int required = entries_required;
    while (1)
    {
-        int fits = bs->journal.no_same_sector_overwrites && bs->journal.sector_info[next_sector].written
+        int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
            ? 0
            : (bs->journal.block_size - next_in_pos) / size;
        if (fits > 0)
        {
+            if (fits > required)
+            {
+                fits = required;
+            }
            if (first_sector == -1)
            {
                first_sector = next_sector;
            }
            required -= fits;
            next_in_pos += fits * size;
-            sectors_required++;
+            sectors_to_write++;
        }
        else if (bs->journal.sector_info[next_sector].dirty)
        {
-            // sectors_required is more like "sectors to write"
-            sectors_required++;
+            sectors_to_write++;
        }
        if (required <= 0)
        {
@@ -59,7 +62,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
                " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
            );
        }
-        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
+        if (bs->journal.sector_info[next_sector].flush_count > 0 ||
            bs->journal.sector_info[next_sector].dirty)
        {
            // No memory buffer available. Wait for it.
@@ -71,17 +74,18 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
                    dirty++;
                    used++;
                }
-                if (bs->journal.sector_info[i].usage_count > 0)
+                if (bs->journal.sector_info[i].flush_count > 0)
                {
                    used++;
                }
            }
            // In fact, it's even more rare than "ran out of journal space", so print a warning
            printf(
-                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld) is %s and flushed %lu times\n",
+                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)"
+                " is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
                used, bs->journal.sector_count, dirty, next_sector,
                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
-                bs->journal.sector_info[next_sector].usage_count
+                bs->journal.sector_info[next_sector].flush_count
            );
            PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
            return 0;
@@ -100,11 +104,8 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    {
        // No space in the journal. Wait until used_start changes.
        printf(
-            "Ran out of journal space (free space: %lu bytes, sectors to write: %d)\n",
-            (bs->journal.next_free >= bs->journal.used_start
-                ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
-                : bs->journal.used_start - bs->journal.next_free),
-            sectors_required
+            "Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n",
+            bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
        );
        PRIV(op)->wait_for = WAIT_JOURNAL;
        bs->flusher->request_trim();
@@ -116,22 +117,21 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries

 journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
 {
-    if (journal.block_size - journal.in_sector_pos < size ||
-        journal.no_same_sector_overwrites && journal.sector_info[journal.cur_sector].written)
+    if (!journal.entry_fits(size))
    {
        assert(!journal.sector_info[journal.cur_sector].dirty);
        // Move to the next journal sector
-        journal.sector_info[journal.cur_sector].written = false;
-        if (journal.sector_info[journal.cur_sector].usage_count > 0)
+        if (journal.sector_info[journal.cur_sector].flush_count > 0)
        {
            // Also select next sector buffer in memory
            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
-            assert(!journal.sector_info[journal.cur_sector].usage_count);
+            assert(!journal.sector_info[journal.cur_sector].flush_count);
        }
        else
        {
            journal.dirty_start = journal.next_free;
        }
+        journal.sector_info[journal.cur_sector].written = false;
        journal.sector_info[journal.cur_sector].offset = journal.next_free;
        journal.in_sector_pos = 0;
        journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
@@ -157,7 +157,7 @@ void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_
 {
    journal.sector_info[cur_sector].dirty = false;
    journal.sector_info[cur_sector].written = true;
-    journal.sector_info[cur_sector].usage_count++;
+    journal.sector_info[cur_sector].flush_count++;
    ring_data_t *data = ((ring_data_t*)sqe->user_data);
    data->iov = (struct iovec){
        (journal.inmemory
--- a/blockstore_journal.h
+++ b/blockstore_journal.h
@@ -133,7 +133,7 @@ inline uint32_t je_crc32(journal_entry *je)
 struct journal_sector_info_t
 {
    uint64_t offset;
-    uint64_t usage_count;
+    uint64_t flush_count;
    bool written;
    bool dirty;
 };
@@ -170,13 +170,18 @@ struct journal_t
    ~journal_t();
    bool trim();
    uint64_t get_trim_pos();
+    inline bool entry_fits(int size)
+    {
+        // Fits when the current sector has >= size bytes left and, in no_same_sector_overwrites mode, was not written yet
+        return block_size - in_sector_pos >= size && !(no_same_sector_overwrites && sector_info[cur_sector].written);
+    }
 };

 struct blockstore_journal_check_t
 {
    blockstore_impl_t *bs;
    uint64_t next_pos, next_sector, next_in_pos;
-    int sectors_required, first_sector;
+    int sectors_to_write, first_sector;
    bool right_dir; // writing to the end or the beginning of the ring buffer

    blockstore_journal_check_t(blockstore_impl_t *bs);
--- a/blockstore_rollback.cpp
+++ b/blockstore_rollback.cpp
@@ -75,44 +75,35 @@ skip_ov:
        return 0;
    }
    // There is sufficient space. Get SQEs
-    struct io_uring_sqe *sqe[space_check.sectors_required];
-    for (i = 0; i < space_check.sectors_required; i++)
+    struct io_uring_sqe *sqe[space_check.sectors_to_write];
+    for (i = 0; i < space_check.sectors_to_write; i++)
    {
        BS_SUBMIT_GET_SQE_DECL(sqe[i]);
    }
    // Prepare and submit journal entries
    auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
    int s = 0, cur_sector = -1;
-    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_rollback) &&
-        journal.sector_info[journal.cur_sector].dirty)
-    {
-        if (cur_sector == -1)
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-        cur_sector = journal.cur_sector;
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
+        if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
+            journal.sector_info[journal.cur_sector].dirty)
+        {
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+            cur_sector = journal.cur_sector;
+        }
        journal_entry_rollback *je = (journal_entry_rollback*)
            prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
-        journal.sector_info[journal.cur_sector].dirty = false;
        je->oid = v->oid;
        je->version = v->version;
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
-        if (cur_sector != journal.cur_sector)
-        {
-            // Write previous sector. We should write the sector only after filling it,
-            // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
-            if (cur_sector != -1)
-                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-            else
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-            cur_sector = journal.cur_sector;
-        }
    }
-    if (cur_sector != -1)
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+    assert(s == space_check.sectors_to_write);
+    if (cur_sector == -1)
+        PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
    PRIV(op)->op_state = 1;
@@ -234,10 +225,38 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t

 void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
 {
-    auto dirty_it = dirty_end;
-    while (dirty_it != dirty_start)
+    if (dirty_end == dirty_start)
    {
+        return;
+    }
+    auto dirty_it = dirty_end;
+    dirty_it--;
+    if (IS_DELETE(dirty_it->second.state))
+    {
+        object_id oid = dirty_it->first.oid;
+#ifdef BLOCKSTORE_DEBUG
+        printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
+#endif
+        dirty_it = dirty_end;
+        // Unblock operations blocked by delete flushing
+        uint32_t next_state = BS_ST_IN_FLIGHT;
+        while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
+        {
+            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
+            {
+                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
+                if (IS_BIG_WRITE(dirty_it->second.state))
+                {
+                    next_state = BS_ST_WAIT_BIG;
+                }
+            }
+            dirty_it++;
+        }
+        dirty_it = dirty_end;
        dirty_it--;
+    }
+    while (1)
+    {
        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
        {
 #ifdef BLOCKSTORE_DEBUG
@@ -256,6 +275,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
        }
+        if (dirty_it == dirty_start)
+        {
+            break;
+        }
+        dirty_it--;
    }
    dirty_db.erase(dirty_start, dirty_end);
 }
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@@ -98,45 +98,36 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        return 0;
    }
    // There is sufficient space. Get SQEs
-    struct io_uring_sqe *sqe[space_check.sectors_required];
-    for (i = 0; i < space_check.sectors_required; i++)
+    struct io_uring_sqe *sqe[space_check.sectors_to_write];
+    for (i = 0; i < space_check.sectors_to_write; i++)
    {
        BS_SUBMIT_GET_SQE_DECL(sqe[i]);
    }
    // Prepare and submit journal entries
    auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
    int s = 0, cur_sector = -1;
-    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_stable) &&
-        journal.sector_info[journal.cur_sector].dirty)
-    {
-        if (cur_sector == -1)
-            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-        cur_sector = journal.cur_sector;
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-    }
    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
    {
        // FIXME: Only stabilize versions that aren't stable yet
+        if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
+            journal.sector_info[journal.cur_sector].dirty)
+        {
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+            cur_sector = journal.cur_sector;
+        }
        journal_entry_stable *je = (journal_entry_stable*)
            prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
-        journal.sector_info[journal.cur_sector].dirty = false;
        je->oid = v->oid;
        je->version = v->version;
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
-        if (cur_sector != journal.cur_sector)
-        {
-            // Write previous sector. We should write the sector only after filling it,
-            // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
-            if (cur_sector != -1)
-                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-            else
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-            cur_sector = journal.cur_sector;
-        }
    }
-    if (cur_sector != -1)
-        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+    assert(s == space_check.sectors_to_write);
+    if (cur_sector == -1)
+        PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
    PRIV(op)->op_state = 1;
@@ -213,9 +204,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v)
                break;
            }
        }
-#ifdef BLOCKSTORE_DEBUG
-        printf("enqueue_flush %lx:%lx v%lu\n", v.oid.inode, v.oid.stripe, v.version);
-#endif
        flusher->enqueue_flush(v);
    }
    auto unstab_it = unstable_writes.find(v.oid);
--- a/blockstore_sync.cpp
+++ b/blockstore_sync.cpp
@@ -112,30 +112,29 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            return 0;
        }
        // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
-        struct io_uring_sqe *sqe[space_check.sectors_required];
-        for (int i = 0; i < space_check.sectors_required; i++)
+        struct io_uring_sqe *sqe[space_check.sectors_to_write];
+        for (int i = 0; i < space_check.sectors_to_write; i++)
        {
            BS_SUBMIT_GET_SQE_DECL(sqe[i]);
        }
        // Prepare and submit journal entries
        auto it = PRIV(op)->sync_big_writes.begin();
        int s = 0, cur_sector = -1;
-        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
-            journal.sector_info[journal.cur_sector].dirty)
-        {
-            if (cur_sector == -1)
-                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-            cur_sector = journal.cur_sector;
-            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-        }
        while (it != PRIV(op)->sync_big_writes.end())
        {
+            if (!journal.entry_fits(sizeof(journal_entry_big_write)) &&
+                journal.sector_info[journal.cur_sector].dirty)
+            {
+                if (cur_sector == -1)
+                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+                prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+                cur_sector = journal.cur_sector;
+            }
            journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
                journal, (dirty_db[*it].state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
                sizeof(journal_entry_big_write)
            );
            dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
-            journal.sector_info[journal.cur_sector].dirty = false;
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
            printf(
@@ -152,19 +151,11 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            je->crc32 = je_crc32((journal_entry*)je);
            journal.crc32_last = je->crc32;
            it++;
-            if (cur_sector != journal.cur_sector)
-            {
-                // Write previous sector. We should write the sector only after filling it,
-                // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
-                if (cur_sector != -1)
-                    prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
-                else
-                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
-                cur_sector = journal.cur_sector;
-            }
        }
-        if (cur_sector != -1)
-            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+        prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
+        assert(s == space_check.sectors_to_write);
+        if (cur_sector == -1)
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops = s;
        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -7,7 +7,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
    // Check or assign version number
    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
-    bool is_inflight_big = false;
+    bool wait_big = false, wait_del = false;
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -21,7 +21,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
-            is_inflight_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
+            wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
+            wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
                ? !IS_SYNCED(dirty_it->second.state)
                : ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
        }
@@ -38,23 +39,43 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            deleted = true;
        }
    }
-    if (op->version == 0)
-    {
-        op->version = version;
-    }
-    else if (op->version < version)
-    {
-        // Invalid version requested
-        op->retval = -EEXIST;
-        return false;
-    }
    if (deleted && is_del)
    {
        // Already deleted
        op->retval = 0;
        return false;
    }
-    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
+    PRIV(op)->real_version = 0;
+    if (op->version == 0)
+    {
+        op->version = version;
+    }
+    else if (op->version < version)
+    {
+        // Implicit operations must be added like that: DEL [FLUSH] BIG [SYNC] SMALL SMALL
+        if (deleted || wait_del)
+        {
+            // It's allowed to write versions with low numbers over deletes
+            // However, we have to flush those deletes first as we use version number for ordering
+#ifdef BLOCKSTORE_DEBUG
+            printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
+#endif
+            wait_del = true;
+            PRIV(op)->real_version = op->version;
+            op->version = version;
+            flusher->unshift_flush((obj_ver_id){
+                .oid = op->oid,
+                .version = version-1,
+            }, true);
+        }
+        else
+        {
+            // Invalid version requested
+            op->retval = -EEXIST;
+            return false;
+        }
+    }
+    if (wait_big && !is_del && !deleted && op->len < block_size &&
        immediate_commit != IMMEDIATE_ALL)
    {
        // Issue an additional sync so that the previous big write can reach the journal
@@ -69,22 +90,31 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 #ifdef BLOCKSTORE_DEBUG
    if (is_del)
        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
-    else
+    else if (!wait_del)
        printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
-    // No strict need to add it into dirty_db here, it's just left
+    // FIXME No strict need to add it into dirty_db here, it's just left
    // from the previous implementation where reads waited for writes
+    uint32_t state;
+    if (is_del)
+        state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
+    else
+    {
+        state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
+        if (wait_del)
+            state |= BS_ST_WAIT_DEL;
+        else if (state == BS_ST_SMALL_WRITE && wait_big)
+            state |= BS_ST_WAIT_BIG;
+        else
+            state |= BS_ST_IN_FLIGHT;
+        if (op->opcode == BS_OP_WRITE_STABLE)
+            state |= BS_ST_INSTANT;
+    }
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    }, (dirty_entry){
-        .state = (uint32_t)(
-            is_del
-                ? (BS_ST_DELETE | BS_ST_IN_FLIGHT)
-                : (op->opcode == BS_OP_WRITE_STABLE ? BS_ST_INSTANT : 0) | (op->len == block_size || deleted
-                    ? (BS_ST_BIG_WRITE | BS_ST_IN_FLIGHT)
-                    : (is_inflight_big ? (BS_ST_SMALL_WRITE | BS_ST_WAIT_BIG) : (BS_ST_SMALL_WRITE | BS_ST_IN_FLIGHT)))
-        ),
+        .state = state,
        .flags = 0,
        .location = 0,
        .offset = is_del ? 0 : op->offset,
@@ -106,12 +136,38 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        .version = op->version,
    });
    assert(dirty_it != dirty_db.end());
-    if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
+    if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) < BS_ST_IN_FLIGHT)
    {
        // Don't dequeue
        return 0;
    }
-    else if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+    if (PRIV(op)->real_version != 0)
+    {
+        // Restore original low version number for unblocked operations
+#ifdef BLOCKSTORE_DEBUG
+        printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
+#endif
+        auto prev_it = dirty_it;
+        // Only step back when a predecessor exists: decrementing begin() is undefined behaviour
+        if (prev_it != dirty_db.begin() && (--prev_it)->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
+        {
+            // Original version is still invalid
+            // FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
+            dirty_db.erase(dirty_it);
+            op->retval = -EEXIST;
+            FINISH_OP(op);
+            return 1;
+        }
+        op->version = PRIV(op)->real_version;
+        PRIV(op)->real_version = 0;
+        dirty_entry e = dirty_it->second;
+        dirty_db.erase(dirty_it);
+        dirty_it = dirty_db.emplace((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        }, e).first;
+    }
+    if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@@ -129,6 +185,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                PRIV(op)->wait_for = WAIT_FREE;
                return 0;
            }
+            // FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
+            dirty_db.erase(dirty_it);
            op->retval = -ENOSPC;
            FINISH_OP(op);
            return 1;
@@ -319,7 +377,6 @@ resume_2:
        sizeof(journal_entry_big_write)
    );
    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
-    journal.sector_info[journal.cur_sector].dirty = false;
    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
    printf(
@@ -344,7 +401,7 @@ resume_2:
 resume_4:
    // Switch object state
 #ifdef BLOCKSTORE_DEBUG
-    printf("Ack write %lx:%lx v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+    printf("Ack write %lx:%lx v%lu = state %x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
 #endif
    bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
        ? (immediate_commit == IMMEDIATE_ALL)
@@ -411,8 +468,8 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
        uint64_t s = PRIV(op)->min_flushed_journal_sector;
        while (1)
        {
-            journal.sector_info[s-1].usage_count--;
-            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
+            journal.sector_info[s-1].flush_count--;
+            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0)
            {
                // We know for sure that we won't write into this sector anymore
                uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
@@ -492,7 +549,10 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops++;
-        // Remember small write as unsynced
+    }
+    else
+    {
+        // Remember delete as unsynced
        unsynced_small_writes.push_back((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@@ -473,7 +473,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
    // Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
    uint64_t pg_block_size = bs_block_size * (
-        pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize
+        pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
    );
    uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
    uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
@@ -488,7 +488,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
        uint64_t begin = (op->offset < stripe ? stripe : op->offset);
        uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
            ? (stripe + pg_block_size) : (op->offset + op->len);
-        op->parts[i] = {
+        op->parts[i] = (cluster_op_part_t){
            .parent = op,
            .offset = begin,
            .len = (uint32_t)(end - begin),
@@ -533,7 +533,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
            part->osd_num = primary_osd;
            part->sent = true;
            op->sent_count++;
-            part->op = {
+            part->op = (osd_op_t){
                .op_type = OSD_OP_OUT,
                .peer_fd = peer_fd,
                .req = { .rw = {
@@ -694,7 +694,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
    assert(peer_it != msgr.osd_peer_fds.end());
    part->sent = true;
    op->sent_count++;
-    part->op = {
+    part->op = (osd_op_t){
        .op_type = OSD_OP_OUT,
        .peer_fd = peer_it->second,
        .req = {
--- a/copy-fio-includes.sh
+++ b/copy-fio-includes.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Copy every fio/ header that fio_headers.h pulls in into fio-copy/, keeping the directory layout
+
+# Stop here if preprocessing fails, before the previous fio-copy is thrown away
+gcc -E -o fio_headers.i fio_headers.h || exit 1
+rm -rf fio-copy
+for i in $(grep -Po 'fio/[^"]+' fio_headers.i | sort -u); do
+    j=${i##fio/}
+    mkdir -p "fio-copy/$(dirname "$j")"
+    cp "$i" "fio-copy/$j"
+done
+
+rm fio_headers.i
--- a/copy-qemu-includes.sh
+++ b/copy-qemu-includes.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#cd qemu
+#debian/rules b/configure-stamp
+#cd b/qemu; make qapi
+
+gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
+    -I qemu/include -E -o qemu_driver.i qemu_driver.c
+
+rm -rf qemu-copy
+for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
+    j=${i##qemu/}
+    p=$(dirname $j)
+    mkdir -p qemu-copy/$p
+    cp $i qemu-copy/$j
+done
+
+rm qemu_driver.i
--- a/debian/changelog
+++ b/debian/changelog
@@ -0,0 +1,17 @@
+vitastor (0.5.4-1) unstable; urgency=medium
+
+  * Bugfixes
+
+ -- Vitaliy Filippov <vitalif@yourcmc.ru>  Tue, 02 Feb 2021 23:01:24 +0300
+
+vitastor (0.5.1-1) unstable; urgency=medium
+
+  * Add jerasure support
+
+ -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sat, 05 Dec 2020 17:02:26 +0300
+
+vitastor (0.5-1) unstable; urgency=medium
+
+  * First packaging for Debian
+
+ -- Vitaliy Filippov <vitalif@yourcmc.ru>  Thu, 05 Nov 2020 02:20:59 +0300
--- a/debian/compat
+++ b/debian/compat
@@ -0,0 +1 @@
+13
--- a/debian/control
+++ b/debian/control
@@ -0,0 +1,17 @@
+Source: vitastor
+Section: admin
+Priority: optional
+Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev
+Standards-Version: 4.5.0
+Homepage: https://vitastor.io/
+Rules-Requires-Root: no
+
+Package: vitastor
+Architecture: amd64
+Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 10), node-sprintf-js, node-ws (>= 7), libjerasure2, lp-solve
+Description: Vitastor, a fast software-defined clustered block storage
+ Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
+ architecturally similar to Ceph which means strong consistency, primary-replication,
+ symmetric clustering and automatic data distribution over any number of drives of any
+ size with configurable redundancy (replication or erasure codes/XOR).
--- a/debian/copyright
+++ b/debian/copyright
@@ -0,0 +1,20 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: vitastor
+Upstream-Contact: Vitaliy Filippov <vitalif@yourcmc.ru>
+Source: https://vitastor.io
+
+Files: *
+Copyright: 2019+ Vitaliy Filippov <vitalif@yourcmc.ru>
+License: Multiple licenses VNPL-1.0 and/or GPL-2.0+
+ All server-side code (OSD, Monitor and so on) is licensed under the terms of
+ Vitastor Network Public License 1.0 (VNPL 1.0), a copyleft license based on
+ GNU GPLv3.0 with the additional "Network Interaction" clause which requires
+ opensourcing all programs directly or indirectly interacting with Vitastor
+ through a computer network ("Proxy Programs"). Proxy Programs may be made public
+ not only under the terms of the same license, but also under the terms of any
+ GPL-Compatible Free Software License, as listed by the Free Software Foundation.
+ This is a stricter copyleft license than the Affero GPL.
+ .
+ Client libraries (cluster_client and so on) are dual-licensed under the same
+ VNPL 1.0 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
+ software like QEMU and fio.
--- a/debian/install
+++ b/debian/install
@@ -0,0 +1,3 @@
+VNPL-1.0.txt usr/share/doc/vitastor
+GPL-2.0.txt usr/share/doc/vitastor
+mon usr/lib/vitastor
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@@ -0,0 +1,49 @@
+# Build patched QEMU for Debian Buster or Bullseye/Sid inside a container
+# cd ..; podman build --build-arg REL=bullseye -v `pwd`/build:/root/build -f debian/patched-qemu.Dockerfile .
+
+ARG REL=bullseye
+
+FROM debian:$REL
+
+# ARG must be repeated: a value declared before FROM is not visible to instructions after FROM
+ARG REL=bullseye
+
+WORKDIR /root
+
+RUN if [ "$REL" = "buster" ]; then \
+        echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
+        echo >> /etc/apt/preferences; \
+        echo 'Package: *' >> /etc/apt/preferences; \
+        echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
+        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
+    fi; \
+    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
+    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
+    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
+
+# Keep "apt-get update" in the same layer as the commands relying on it, so cached package lists can't go stale
+RUN apt-get update && \
+    apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts && \
+    apt-get -y build-dep qemu && \
+    apt-get -y build-dep fio && \
+    apt-get --download-only source qemu fio
+
+ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
+RUN set -e; \
+    mkdir -p /root/build/qemu-$REL; \
+    rm -rf /root/build/qemu-$REL/*; \
+    cd /root/build/qemu-$REL; \
+    dpkg-source -x /root/qemu*.dsc; \
+    if [ -d /root/build/qemu-$REL/qemu-5.0 ]; then \
+        cp /root/vitastor/qemu-5.0-vitastor.patch /root/build/qemu-$REL/qemu-5.0/debian/patches; \
+        echo qemu-5.0-vitastor.patch >> /root/build/qemu-$REL/qemu-5.0/debian/patches/series; \
+    else \
+        cp /root/vitastor/qemu-5.1-vitastor.patch /root/build/qemu-$REL/qemu-*/debian/patches; \
+        P=`ls -d /root/build/qemu-$REL/qemu-*/debian/patches`; \
+        echo qemu-5.1-vitastor.patch >> $P/series; \
+    fi; \
+    cd /root/build/qemu-$REL/qemu-*/; \
+    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
+    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
+    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
+    rm -rf /root/build/qemu-$REL/qemu-*/
--- a/debian/rules
+++ b/debian/rules
@@ -0,0 +1,9 @@
+#!/usr/bin/make -f
+export DH_VERBOSE = 1
+# Catch-all rule: hand every requested target over to the debhelper sequencer (dh)
+%:
+	dh $@
+# Append debian/substvars (it defines dep:fio and dep:qemu) to the package substvars, then run the stock dh_installdeb
+override_dh_installdeb:
+	cat debian/substvars >> debian/vitastor.substvars
+	dh_installdeb
--- a/debian/source/format
+++ b/debian/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
--- a/debian/substvars
+++ b/debian/substvars
@@ -0,0 +1,2 @@
+dep:fio=3.16-1
+dep:qemu=1:5.1+dfsg-4+vitastor1
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -0,0 +1,72 @@
+# Build Vitastor packages for Debian Buster or Bullseye/Sid inside a container
+# cd ..; podman build --build-arg REL=bullseye -v `pwd`/build:/root/build -f debian/vitastor.Dockerfile .
+
+ARG REL=bullseye
+
+FROM debian:$REL
+
+# ARG must be repeated: a value declared before FROM is not visible to instructions after FROM
+ARG REL=bullseye
+
+WORKDIR /root
+
+RUN if [ "$REL" = "buster" ]; then \
+        echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
+        echo >> /etc/apt/preferences; \
+        echo 'Package: *' >> /etc/apt/preferences; \
+        echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
+        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
+    fi; \
+    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
+    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
+    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
+
+# Keep "apt-get update" in the same layer as the commands relying on it, so cached package lists can't go stale
+RUN apt-get update && \
+    apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts && \
+    apt-get -y build-dep qemu && \
+    apt-get -y build-dep fio && \
+    apt-get --download-only source qemu fio && \
+    apt-get -y install libjerasure-dev
+
+ADD . /root/vitastor
+RUN set -e -x; \
+    mkdir -p /root/fio-build/; \
+    cd /root/fio-build/; \
+    rm -rf /root/fio-build/*; \
+    dpkg-source -x /root/fio*.dsc; \
+    cd /root/build/qemu-$REL/; \
+    rm -rf qemu*/; \
+    dpkg-source -x qemu*.dsc; \
+    cd /root/build/qemu-$REL/qemu*/; \
+    debian/rules b/configure-stamp; \
+    cd b/qemu; \
+    make -j8 qapi; \
+    mkdir -p /root/build/vitastor-$REL; \
+    rm -rf /root/build/vitastor-$REL/*; \
+    cd /root/build/vitastor-$REL; \
+    cp -r /root/vitastor vitastor-0.5.4; \
+    ln -s /root/build/qemu-$REL/qemu-*/ vitastor-0.5.4/qemu; \
+    ln -s /root/fio-build/fio-*/ vitastor-0.5.4/fio; \
+    cd vitastor-0.5.4; \
+    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
+    QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
+    sh copy-qemu-includes.sh; \
+    sh copy-fio-includes.sh; \
+    rm qemu fio; \
+    mkdir -p a b debian/patches; \
+    mv qemu-copy b/qemu; \
+    mv fio-copy b/fio; \
+    diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
+    echo qemu-fio-headers.patch >> debian/patches/series; \
+    rm -rf a b; \
+    rm -rf /root/build/qemu-$REL/qemu*/; \
+    echo "dep:fio=$FIO" > debian/substvars; \
+    echo "dep:qemu=$QEMU" >> debian/substvars; \
+    cd /root/build/vitastor-$REL; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.4.orig.tar.xz vitastor-0.5.4; \
+    cd vitastor-0.5.4; \
+    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
+    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
+    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
+    rm -rf /root/build/vitastor-$REL/vitastor-*/
--- a/dump_journal.cpp
+++ b/dump_journal.cpp
@@ -26,23 +26,32 @@ struct journal_dump_t
    uint64_t journal_offset;
    uint64_t journal_len;
    uint64_t journal_pos;
+    bool all;
+    bool started;
    int fd;
+    uint32_t crc32_last;

-    void dump_block(void *buf);
+    int dump_block(void *buf);
 };

 int main(int argc, char *argv[])
 {
-    if (argc < 5)
+    journal_dump_t self = { 0 };
+    int b = 1;
+    if (argc >= 2 && !strcmp(argv[1], "--all"))
    {
-        printf("USAGE: %s <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
+        self.all = true;
+        b = 2;
+    }
+    if (argc < b+4)
+    {
+        printf("USAGE: %s [--all] <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
        return 1;
    }
-    journal_dump_t self;
-    self.journal_device = argv[1];
-    self.journal_block = strtoul(argv[2], NULL, 10);
-    self.journal_offset = strtoull(argv[3], NULL, 10);
-    self.journal_len = strtoull(argv[4], NULL, 10);
+    self.journal_device = argv[b];
+    self.journal_block = strtoul(argv[b+1], NULL, 10);
+    self.journal_offset = strtoull(argv[b+2], NULL, 10);
+    self.journal_len = strtoull(argv[b+3], NULL, 10);
    if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
        self.journal_block > 128*1024)
    {
@@ -57,30 +66,64 @@ int main(int argc, char *argv[])
    }
    void *data = memalign(MEM_ALIGNMENT, self.journal_block);
    self.journal_pos = 0;
-    while (self.journal_pos < self.journal_len)
+    if (self.all)
+    {
+        while (self.journal_pos < self.journal_len)
+        {
+            int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
+            assert(r == self.journal_block);
+            uint64_t s;
+            for (s = 0; s < self.journal_block; s += 8)
+            {
+                if (*((uint64_t*)(data+s)) != 0)
+                    break;
+            }
+            if (s == self.journal_block)
+            {
+                printf("offset %08lx: zeroes\n", self.journal_pos);
+                self.journal_pos += self.journal_block;
+            }
+            else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
+            {
+                printf("offset %08lx:\n", self.journal_pos);
+                self.dump_block(data);
+            }
+            else
+            {
+                printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
+                self.journal_pos += self.journal_block;
+            }
+        }
+    }
+    else
    {
        int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
        assert(r == self.journal_block);
-        uint64_t s;
-        for (s = 0; s < self.journal_block; s += 8)
+        journal_entry *je = (journal_entry*)(data);
+        if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
        {
-            if (*((uint64_t*)(data+s)) != 0)
-                break;
-        }
-        if (s == self.journal_block)
-        {
-            printf("offset %08lx: zeroes\n", self.journal_pos);
-            self.journal_pos += self.journal_block;
-        }
-        else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
-        {
-            printf("offset %08lx:\n", self.journal_pos);
-            self.dump_block(data);
+            printf("offset %08lx: journal superblock is invalid\n", self.journal_pos);
        }
        else
        {
-            printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
-            self.journal_pos += self.journal_block;
+            printf("offset %08lx:\n", self.journal_pos);
+            self.dump_block(data);
+            self.started = false;
+            self.journal_pos = je->start.journal_start;
+            while (1)
+            {
+                if (self.journal_pos >= self.journal_len)
+                    self.journal_pos = self.journal_block;
+                r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
+                assert(r == self.journal_block);
+                printf("offset %08lx:\n", self.journal_pos);
+                r = self.dump_block(data);
+                if (r <= 0)
+                {
+                    printf("end of the journal\n");
+                    break;
+                }
+            }
        }
    }
    free(data);
@@ -88,7 +131,7 @@ int main(int argc, char *argv[])
    return 0;
 }

-void journal_dump_t::dump_block(void *buf)
+int journal_dump_t::dump_block(void *buf)
 {
    uint32_t pos = 0;
    journal_pos += journal_block;
@@ -97,12 +140,19 @@ void journal_dump_t::dump_block(void *buf)
    while (pos < journal_block)
    {
        journal_entry *je = (journal_entry*)(buf + pos);
-        if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX)
+        if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
+            !all && started && je->crc32_prev != crc32_last)
        {
            break;
        }
-        const char *crc32_valid = je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)";
-        printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, crc32_valid, je->crc32_prev);
+        bool crc32_valid = je_crc32(je) == je->crc32;
+        if (!all && !crc32_valid)
+        {
+            break;
+        }
+        started = true;
+        crc32_last = je->crc32;
+        printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, (crc32_valid ? "(valid)" : "(invalid)"), je->crc32_prev);
        if (je->type == JE_START)
        {
            printf("je_start start=%08lx\n", je->start.journal_start);
@@ -170,4 +220,5 @@ void journal_dump_t::dump_block(void *buf)
    {
        journal_pos = journal_len;
    }
+    return entry;
 }
--- a/epoll_manager.cpp
+++ b/epoll_manager.cpp
@@ -84,8 +84,12 @@ void epoll_manager_t::handle_epoll_events()
        nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
        for (int i = 0; i < nfds; i++)
        {
-            auto & cb = epoll_handlers[events[i].data.fd];
-            cb(events[i].data.fd, events[i].events);
+            auto cb_it = epoll_handlers.find(events[i].data.fd);
+            if (cb_it != epoll_handlers.end())
+            {
+                auto & cb = cb_it->second;
+                cb(events[i].data.fd, events[i].events);
+            }
        }
    } while (nfds == MAX_EPOLL_EVENTS);
 }
--- a/etcd_state_client.cpp
+++ b/etcd_state_client.cpp
@@ -173,6 +173,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/config0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_CONFIG_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -181,6 +182,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_OSD_STATE_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -189,6 +191,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_PG_STATE_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -197,6 +200,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_PG_HISTORY_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
 }
@@ -315,67 +319,99 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
        }
        for (auto & pool_item: value.object_items())
        {
+            pool_config_t pc;
+            // ID
            pool_id_t pool_id = stoull_full(pool_item.first);
            if (!pool_id || pool_id >= POOL_ID_MAX)
            {
                printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
                continue;
            }
-            if (pool_item.second["pg_size"].uint64_value() < 1 ||
-                pool_item.second["scheme"] == "xor" && pool_item.second["pg_size"].uint64_value() < 3)
-            {
-                printf("Pool %u has invalid pg_size, skipping pool\n", pool_id);
-                continue;
-            }
-            if (pool_item.second["pg_minsize"].uint64_value() < 1 ||
-                pool_item.second["pg_minsize"].uint64_value() > pool_item.second["pg_size"].uint64_value() ||
-                pool_item.second["pg_minsize"].uint64_value() < (pool_item.second["pg_size"].uint64_value() - 1))
-            {
-                printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
-                continue;
-            }
-            if (pool_item.second["pg_count"].uint64_value() < 1)
-            {
-                printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
-                continue;
-            }
-            if (pool_item.second["name"].string_value() == "")
+            pc.id = pool_id;
+            // Pool Name
+            pc.name = pool_item.second["name"].string_value();
+            if (pc.name == "")
            {
                printf("Pool %u has empty name, skipping pool\n", pool_id);
                continue;
            }
-            if (pool_item.second["scheme"] != "replicated" && pool_item.second["scheme"] != "xor")
+            // Failure Domain
+            pc.failure_domain = pool_item.second["failure_domain"].string_value();
+            // Coding Scheme
+            if (pool_item.second["scheme"] == "replicated")
+                pc.scheme = POOL_SCHEME_REPLICATED;
+            else if (pool_item.second["scheme"] == "xor")
+                pc.scheme = POOL_SCHEME_XOR;
+            else if (pool_item.second["scheme"] == "jerasure")
+                pc.scheme = POOL_SCHEME_JERASURE;
+            else
            {
-                printf("Pool %u has invalid coding scheme (only \"xor\" and \"replicated\" are allowed), skipping pool\n", pool_id);
+                printf("Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id);
                continue;
            }
-            if (pool_item.second["max_osd_combinations"].uint64_value() > 0 &&
-                pool_item.second["max_osd_combinations"].uint64_value() < 100)
+            // PG Size
+            pc.pg_size = pool_item.second["pg_size"].uint64_value();
+            if (pc.pg_size < 1 ||
+                pool_item.second["pg_size"].uint64_value() < 3 &&
+                (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) ||
+                pool_item.second["pg_size"].uint64_value() > 256)
+            {
+                printf("Pool %u has invalid pg_size, skipping pool\n", pool_id);
+                continue;
+            }
+            // Parity Chunks
+            pc.parity_chunks = pool_item.second["parity_chunks"].uint64_value();
+            if (pc.scheme == POOL_SCHEME_XOR)
+            {
+                if (pc.parity_chunks > 1)
+                {
+                    printf("Pool %u has invalid parity_chunks (must be 1), skipping pool\n", pool_id);
+                    continue;
+                }
+                pc.parity_chunks = 1;
+            }
+            if (pc.scheme == POOL_SCHEME_JERASURE &&
+                (pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
+            {
+                printf("Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
+                continue;
+            }
+            // PG MinSize
+            pc.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
+            if (pc.pg_minsize < 1 || pc.pg_minsize > pc.pg_size ||
+                (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) &&
+                pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
+            {
+                printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
+                continue;
+            }
+            // PG Count
+            pc.pg_count = pool_item.second["pg_count"].uint64_value();
+            if (pc.pg_count < 1)
+            {
+                printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
+                continue;
+            }
+            // Max OSD Combinations
+            pc.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
+            if (!pc.max_osd_combinations)
+                pc.max_osd_combinations = 10000;
+            if (pc.max_osd_combinations > 0 && pc.max_osd_combinations < 100)
            {
                printf("Pool %u has invalid max_osd_combinations (must be at least 100), skipping pool\n", pool_id);
                continue;
            }
+            // PG Stripe Size
+            pc.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
+            uint64_t min_stripe_size = bs_block_size * (pc.scheme == POOL_SCHEME_REPLICATED ? 1 : (pc.pg_size-pc.parity_chunks));
+            if (pc.pg_stripe_size < min_stripe_size)
+                pc.pg_stripe_size = min_stripe_size;
+            // Save
+            pc.real_pg_count = this->pool_config[pool_id].real_pg_count;
+            std::swap(pc.pg_config, this->pool_config[pool_id].pg_config);
+            std::swap(this->pool_config[pool_id], pc);
            auto & parsed_cfg = this->pool_config[pool_id];
            parsed_cfg.exists = true;
-            parsed_cfg.id = pool_id;
-            parsed_cfg.name = pool_item.second["name"].string_value();
-            parsed_cfg.scheme = pool_item.second["scheme"] == "replicated" ? POOL_SCHEME_REPLICATED : POOL_SCHEME_XOR;
-            parsed_cfg.pg_size = pool_item.second["pg_size"].uint64_value();
-            parsed_cfg.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
-            parsed_cfg.pg_count = pool_item.second["pg_count"].uint64_value();
-            parsed_cfg.failure_domain = pool_item.second["failure_domain"].string_value();
-            parsed_cfg.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
-            uint64_t min_stripe_size = bs_block_size *
-                (parsed_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : parsed_cfg.pg_minsize);
-            if (parsed_cfg.pg_stripe_size < min_stripe_size)
-            {
-                parsed_cfg.pg_stripe_size = min_stripe_size;
-            }
-            parsed_cfg.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
-            if (!parsed_cfg.max_osd_combinations)
-            {
-                parsed_cfg.max_osd_combinations = 10000;
-            }
            for (auto & pg_item: parsed_cfg.pg_config)
            {
                if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
--- a/etcd_state_client.h
+++ b/etcd_state_client.h
@@ -43,7 +43,7 @@ struct pool_config_t
    pool_id_t id;
    std::string name;
    uint64_t scheme;
-    uint64_t pg_size, pg_minsize;
+    uint64_t pg_size, pg_minsize, parity_chunks;
    uint64_t pg_count;
    uint64_t real_pg_count;
    std::string failure_domain;
--- a/fio_cluster.cpp
+++ b/fio_cluster.cpp
@@ -93,7 +93,7 @@ static struct fio_option options[] = {
    {
        .name   = "cluster_log_level",
        .lname  = "cluster log level",
-        .type   = FIO_OPT_BOOL,
+        .type   = FIO_OPT_INT,
        .off1   = offsetof(struct sec_options, cluster_log),
        .help   = "Set log level for the Vitastor client",
        .def    = "0",
@@ -145,9 +145,7 @@ static void sec_cleanup(struct thread_data *td)
        delete bsd->cli;
        delete bsd->epmgr;
        delete bsd->ringloop;
-        bsd->cli = NULL;
-        bsd->epmgr = NULL;
-        bsd->ringloop = NULL;
+        delete bsd;
    }
 }

--- a/fio_sec_osd.cpp
+++ b/fio_sec_osd.cpp
@@ -140,6 +140,7 @@ static void sec_cleanup(struct thread_data *td)
    if (bsd)
    {
        close(bsd->connect_fd);
+        delete bsd;
    }
 }

@@ -312,6 +313,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
            exit(1);
        }
        io_u* io = it->second;
+        bsd->queue.erase(it);
        if (io->ddir == DDIR_READ)
        {
            if (reply.hdr.retval != io->xfer_buflen)
--- a/messenger.cpp
+++ b/messenger.cpp
@@ -30,7 +30,7 @@ osd_messenger_t::~osd_messenger_t()
 {
    while (clients.size() > 0)
    {
-        stop_client(clients.begin()->first);
+        stop_client(clients.begin()->first, true);
    }
 }

@@ -111,12 +111,12 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
        {
            osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
-            stop_client(peer_fd);
+            stop_client(peer_fd, true);
            on_connect_peer(peer_osd, -EIO);
            return;
        });
    }
-    clients[peer_fd] = new osd_client_t({
+    clients[peer_fd] = new osd_client_t((osd_client_t){
        .peer_addr = addr,
        .peer_port = peer_port,
        .peer_fd = peer_fd,
@@ -149,7 +149,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
    }
    if (result != 0)
    {
-        stop_client(peer_fd);
+        stop_client(peer_fd, true);
        on_connect_peer(peer_osd, -result);
        return;
    }
@@ -171,7 +171,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
    {
        // Stop client
        printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
-        stop_client(peer_fd);
+        stop_client(peer_fd, true);
    }
    else if (epoll_events & EPOLLIN)
    {
@@ -233,7 +233,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
    op->peer_fd = cl->peer_fd;
-    op->req = {
+    op->req = (osd_any_op_t){
        .show_conf = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
@@ -309,7 +309,7 @@ void osd_messenger_t::cancel_op(osd_op_t *op)
    }
 }

-void osd_messenger_t::stop_client(int peer_fd)
+void osd_messenger_t::stop_client(int peer_fd, bool force)
 {
    assert(peer_fd != 0);
    auto it = clients.find(peer_fd);
@@ -334,6 +334,10 @@ void osd_messenger_t::stop_client(int peer_fd)
                printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
        }
    }
+    else if (!force)
+    {
+        return;
+    }
    cl->peer_state = PEER_STOPPED;
    clients.erase(it);
    tfd->set_fd_handler(peer_fd, false, NULL);
@@ -348,7 +352,14 @@ void osd_messenger_t::stop_client(int peer_fd)
    }
    if (cl->read_op)
    {
-        delete cl->read_op;
+        if (cl->read_op->callback)
+        {
+            cancel_op(cl->read_op);
+        }
+        else
+        {
+            delete cl->read_op;
+        }
        cl->read_op = NULL;
    }
    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
@@ -402,7 +413,7 @@ void osd_messenger_t::accept_connections(int listen_fd)
        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
        int one = 1;
        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        clients[peer_fd] = new osd_client_t({
+        clients[peer_fd] = new osd_client_t((osd_client_t){
            .peer_addr = addr,
            .peer_port = ntohs(addr.sin_port),
            .peer_fd = peer_fd,
--- a/messenger.h
+++ b/messenger.h
@@ -275,7 +275,7 @@ struct osd_messenger_t

 public:
    void connect_peer(uint64_t osd_num, json11::Json peer_state);
-    void stop_client(int peer_fd);
+    void stop_client(int peer_fd, bool force = false);
    void outbox_push(osd_op_t *cur_op);
    std::function<void(osd_op_t*)> exec_op;
    std::function<void(osd_num_t)> repeer_pgs;
--- a/mon/mon-main.js
+++ b/mon/mon-main.js
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -9,212 +9,215 @@ const LPOptimizer = require('./lp-optimizer.js');
 const stableStringify = require('./stable-stringify.js');
 const PGUtil = require('./PGUtil.js');

+// FIXME document all etcd keys and config variables in the form of JSON schema or similar
+// Each listed key pattern is anchored on both sides, so a key matches only when
+// it equals one of the patterns as a whole; the anchored patterns are OR-ed together.
+const etcd_allow = new RegExp([
+    'config/global',
+    'config/node_placement',
+    'config/pools',
+    'config/osd/[1-9]\\d*',
+    'config/pgs',
+    'osd/state/[1-9]\\d*',
+    'osd/stats/[1-9]\\d*',
+    'mon/master',
+    'pg/state/[1-9]\\d*/[1-9]\\d*',
+    'pg/stats/[1-9]\\d*/[1-9]\\d*',
+    'pg/history/[1-9]\\d*/[1-9]\\d*',
+    'stats',
+].map(key_re => '^'+key_re+'$').join('|'));
+
+// Skeleton of the etcd key hierarchy. Only the nested (mostly empty) objects below are
+// real data; the commented-out parts describe the expected shape of the values stored
+// under each key.
+// NOTE(review): presumably used as the template for the monitor's in-memory state -
+// confirm against the Mon class.
+const etcd_tree = {
+    config: {
+        /* global: {
+            // mon
+            etcd_mon_ttl: 30, // min: 10
+            etcd_mon_timeout: 1000, // ms. min: 0
+            etcd_mon_retries: 5, // min: 0
+            mon_change_timeout: 1000, // ms. min: 100
+            mon_stats_timeout: 1000, // ms. min: 100
+            osd_out_time: 1800, // seconds. min: 0
+            placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
+            // client and osd
+            use_sync_send_recv: false,
+            log_level: 0,
+            block_size: 131072,
+            disk_alignment: 4096,
+            bitmap_granularity: 4096,
+            immediate_commit: false, // 'all' or 'small'
+            client_dirty_limit: 33554432,
+            peer_connect_interval: 5, // seconds. min: 1
+            peer_connect_timeout: 5, // seconds. min: 1
+            up_wait_retry_interval: 500, // ms. min: 50
+            // osd
+            etcd_report_interval: 30, // min: 10
+            run_primary: true,
+            bind_address: "0.0.0.0",
+            bind_port: 0,
+            autosync_interval: 5,
+            client_queue_depth: 128, // unused
+            recovery_queue_depth: 4,
+            readonly: false,
+            print_stats_interval: 3,
+            // blockstore - fixed in superblock
+            block_size,
+            disk_alignment,
+            journal_block_size,
+            meta_block_size,
+            bitmap_granularity,
+            journal_device,
+            journal_offset,
+            journal_size,
+            disable_journal_fsync,
+            data_device,
+            data_offset,
+            data_size,
+            disable_data_fsync,
+            meta_device,
+            meta_offset,
+            disable_meta_fsync,
+            disable_device_lock,
+            // blockstore - configurable
+            flusher_count,
+            inmemory_metadata,
+            inmemory_journal,
+            journal_sector_buffer_count,
+            journal_no_same_sector_overwrites,
+        }, */
+        global: {},
+        /* node_placement: {
+            host1: { level: 'host', parent: 'rack1' },
+            ...
+        }, */
+        node_placement: {},
+        /* pools: {
+            <id>: {
+                name: 'testpool',
+                // jerasure uses Reed-Solomon-Vandermonde codes
+                scheme: 'replicated' | 'xor' | 'jerasure',
+                pg_size: 3,
+                pg_minsize: 2,
+                // number of parity chunks, required for jerasure
+                parity_chunks?: 1,
+                pg_count: 100,
+                failure_domain: 'host',
+                max_osd_combinations: 10000,
+                pg_stripe_size: 4194304,
+                root_node?: 'rack1',
+                // restrict pool to OSDs having all of these tags
+                osd_tags?: 'nvme' | [ 'nvme', ... ],
+            },
+            ...
+        }, */
+        pools: {},
+        osd: {
+            /* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
+        },
+        /* pgs: {
+            hash: string,
+            items: {
+                <pool_id>: {
+                    <pg_id>: {
+                        osd_set: [ 1, 2, 3 ],
+                        primary: 1,
+                        pause: false,
+                    }
+                }
+            }
+        }, */
+        pgs: {},
+    },
+    osd: {
+        state: {
+            /* <osd_num_t>: {
+                state: "up",
+                addresses: string[],
+                host: string,
+                port: uint16_t,
+                primary_enabled: boolean,
+                blockstore_enabled: boolean,
+            }, */
+        },
+        stats: {
+            /* <osd_num_t>: {
+                time: number, // unix time
+                blockstore_ready: boolean,
+                size: uint64_t, // bytes
+                free: uint64_t, // bytes
+                host: string,
+                op_stats: {
+                    <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+                },
+                subop_stats: {
+                    <string>: { count: uint64_t, usec: uint64_t },
+                },
+                recovery_stats: {
+                    degraded: { count: uint64_t, bytes: uint64_t },
+                    misplaced: { count: uint64_t, bytes: uint64_t },
+                },
+            }, */
+        },
+    },
+    mon: {
+        master: {
+            /* ip: [ string ], */
+        },
+    },
+    pg: {
+        state: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    primary: osd_num_t,
+                    state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
+                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
+                        "has_invalid"|"left_on_dead")[],
+                }
+            }, */
+        },
+        stats: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    object_count: uint64_t,
+                    clean_count: uint64_t,
+                    misplaced_count: uint64_t,
+                    degraded_count: uint64_t,
+                    incomplete_count: uint64_t,
+                    write_osd_set: osd_num_t[],
+                },
+            }, */
+        },
+        history: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    osd_sets: osd_num_t[][],
+                    all_peers: osd_num_t[],
+                    epoch: uint32_t,
+                },
+            }, */
+        },
+    },
+    stats: {
+        /* op_stats: {
+            <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+        },
+        subop_stats: {
+            <string>: { count: uint64_t, usec: uint64_t },
+        },
+        recovery_stats: {
+            degraded: { count: uint64_t, bytes: uint64_t },
+            misplaced: { count: uint64_t, bytes: uint64_t },
+        },
+        object_counts: {
+            object: uint64_t,
+            clean: uint64_t,
+            misplaced: uint64_t,
+            degraded: uint64_t,
+            incomplete: uint64_t,
+        }, */
+    },
+};
+
 // FIXME Split into several files
 class Mon
 {
-    // FIXME document all etcd keys and config variables in the form of JSON schema or similar
-    static etcd_allow = new RegExp('^'+[
-        'config/global',
-        'config/node_placement',
-        'config/pools',
-        'config/osd/[1-9]\\d*',
-        'config/pgs',
-        'osd/state/[1-9]\\d*',
-        'osd/stats/[1-9]\\d*',
-        'mon/master',
-        'pg/state/[1-9]\\d*/[1-9]\\d*',
-        'pg/stats/[1-9]\\d*/[1-9]\\d*',
-        'pg/history/[1-9]\\d*/[1-9]\\d*',
-        'stats',
-    ].join('$|^')+'$')
-
-    static etcd_tree = {
-        config: {
-            /* global: {
-                // mon
-                etcd_mon_ttl: 30, // min: 10
-                etcd_mon_timeout: 1000, // ms. min: 0
-                etcd_mon_retries: 5, // min: 0
-                mon_change_timeout: 1000, // ms. min: 100
-                mon_stats_timeout: 1000, // ms. min: 100
-                osd_out_time: 1800, // seconds. min: 0
-                placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
-                // client and osd
-                use_sync_send_recv: false,
-                log_level: 0,
-                block_size: 131072,
-                disk_alignment: 4096,
-                bitmap_granularity: 4096,
-                immediate_commit: false, // 'all' or 'small'
-                client_dirty_limit: 33554432,
-                peer_connect_interval: 5, // seconds. min: 1
-                peer_connect_timeout: 5, // seconds. min: 1
-                up_wait_retry_interval: 500, // ms. min: 50
-                // osd
-                etcd_report_interval: 30, // min: 10
-                run_primary: true,
-                bind_address: "0.0.0.0",
-                bind_port: 0,
-                autosync_interval: 5,
-                client_queue_depth: 128, // unused
-                recovery_queue_depth: 4,
-                readonly: false,
-                print_stats_interval: 3,
-                // blockstore - fixed in superblock
-                block_size,
-                disk_alignment,
-                journal_block_size,
-                meta_block_size,
-                bitmap_granularity,
-                journal_device,
-                journal_offset,
-                journal_size,
-                disable_journal_fsync,
-                data_device,
-                data_offset,
-                data_size,
-                disable_data_fsync,
-                meta_device,
-                meta_offset,
-                disable_meta_fsync,
-                disable_device_lock,
-                // blockstore - configurable
-                flusher_count,
-                inmemory_metadata,
-                inmemory_journal,
-                journal_sector_buffer_count,
-                journal_no_same_sector_overwrites,
-            }, */
-            global: {},
-            /* node_placement: {
-                host1: { level: 'host', parent: 'rack1' },
-                ...
-            }, */
-            node_placement: {},
-            /* pools: {
-                <id>: {
-                    name: 'testpool',
-                    scheme: 'xor',
-                    pg_size: 3,
-                    pg_minsize: 2,
-                    pg_count: 100,
-                    failure_domain: 'host',
-                    max_osd_combinations: 10000,
-                    pg_stripe_size: 4194304,
-                    root_node?: 'rack1',
-                    // restrict pool to OSDs having all of these tags
-                    osd_tags?: 'nvme' | [ 'nvme', ... ],
-                },
-                ...
-            }, */
-            pools: {},
-            osd: {
-                /* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
-            },
-            /* pgs: {
-                hash: string,
-                items: {
-                    <pool_id>: {
-                        <pg_id>: {
-                            osd_set: [ 1, 2, 3 ],
-                            primary: 1,
-                            pause: false,
-                        }
-                    }
-                }
-            }, */
-            pgs: {},
-        },
-        osd: {
-            state: {
-                /* <osd_num_t>: {
-                    state: "up",
-                    addresses: string[],
-                    host: string,
-                    port: uint16_t,
-                    primary_enabled: boolean,
-                    blockstore_enabled: boolean,
-                }, */
-            },
-            stats: {
-                /* <osd_num_t>: {
-                    time: number, // unix time
-                    blockstore_ready: boolean,
-                    size: uint64_t, // bytes
-                    free: uint64_t, // bytes
-                    host: string,
-                    op_stats: {
-                        <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
-                    },
-                    subop_stats: {
-                        <string>: { count: uint64_t, usec: uint64_t },
-                    },
-                    recovery_stats: {
-                        degraded: { count: uint64_t, bytes: uint64_t },
-                        misplaced: { count: uint64_t, bytes: uint64_t },
-                    },
-                }, */
-            },
-        },
-        mon: {
-            master: {
-                /* ip: [ string ], */
-            },
-        },
-        pg: {
-            state: {
-                /* <pool_id>: {
-                    <pg_id>: {
-                        primary: osd_num_t,
-                        state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
-                            "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
-                            "has_invalid"|"left_on_dead")[],
-                    }
-                }, */
-            },
-            stats: {
-                /* <pool_id>: {
-                    <pg_id>: {
-                        object_count: uint64_t,
-                        clean_count: uint64_t,
-                        misplaced_count: uint64_t,
-                        degraded_count: uint64_t,
-                        incomplete_count: uint64_t,
-                        write_osd_set: osd_num_t[],
-                    },
-                }, */
-            },
-            history: {
-                /* <pool_id>: {
-                    <pg_id>: {
-                        osd_sets: osd_num_t[][],
-                        all_peers: osd_num_t[],
-                        epoch: uint32_t,
-                    },
-                }, */
-            },
-        },
-        stats: {
-            /* op_stats: {
-                <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
-            },
-            subop_stats: {
-                <string>: { count: uint64_t, usec: uint64_t },
-            },
-            recovery_stats: {
-                degraded: { count: uint64_t, bytes: uint64_t },
-                misplaced: { count: uint64_t, bytes: uint64_t },
-            },
-            object_counts: {
-                object: uint64_t,
-                clean: uint64_t,
-                misplaced: uint64_t,
-                degraded: uint64_t,
-                incomplete: uint64_t,
-            }, */
-        },
-    }
-
    constructor(config)
    {
        // FIXME: Maybe prefer local etcd
@@ -250,7 +253,10 @@ class Mon
        const res = await this.etcd_call('/kv/txn', { success: [
            { requestRange: { key: b64(this.etcd_prefix+'/config/global') } }
        ] }, this.etcd_start_timeout, -1);
-        this.parse_kv(res.responses[0].response_range.kvs[0]);
+        if (res.responses[0].response_range.kvs)
+        {
+            this.parse_kv(res.responses[0].response_range.kvs[0]);
+        }
        this.check_config();
    }

@@ -331,6 +337,7 @@ class Mon
                range_end: b64(this.etcd_prefix+'0'),
                start_revision: ''+this.etcd_watch_revision,
                watch_id: 1,
+                progress_notify: true,
            },
        }));
        this.ws.on('message', (msg) =>
@@ -561,19 +568,15 @@ class Mon
                    { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
                ],
            }, this.config.etcd_mon_timeout, 0);
-            if (!res.succeeded)
-            {
-                return false;
-            }
-            this.state.config.pgs = new_cfg;
+            return false;
        }
        return !has_online;
    }

    save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
    {
-        const replicated = this.state.config.pools[pool_id].scheme === 'replicated';
-        const pg_minsize = this.state.config.pools[pool_id].pg_minsize;
+        const replicated = new_pgs.length && this.state.config.pools[pool_id].scheme === 'replicated';
+        const pg_minsize = new_pgs.length && this.state.config.pools[pool_id].pg_minsize;
        const pg_items = {};
        new_pgs.map((osd_set, i) =>
        {
@@ -628,13 +631,21 @@ class Mon
            }
        }
        this.state.config.pgs.items = this.state.config.pgs.items || {};
-        this.state.config.pgs.items[pool_id] = pg_items;
+        if (!new_pgs.length)
+        {
+            delete this.state.config.pgs.items[pool_id];
+        }
+        else
+        {
+            this.state.config.pgs.items[pool_id] = pg_items;
+        }
    }

    validate_pool_cfg(pool_id, pool_cfg, warn)
    {
        pool_cfg.pg_size = Math.floor(pool_cfg.pg_size);
        pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
+        pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
        pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
        pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
        pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
@@ -644,8 +655,14 @@ class Mon
                console.log('Pool ID '+pool_id+' is invalid');
            return false;
        }
-        if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 ||
-            pool_cfg.scheme === 'xor' && pool_cfg.pg_size < 3)
+        if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' && pool_cfg.scheme !== 'jerasure')
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated" and "jerasure" required)');
+            return false;
+        }
+        if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
+            (pool_cfg.scheme === 'xor' || pool_cfg.scheme == 'jerasure') && pool_cfg.pg_size < 3)
        {
            if (warn)
                console.log('Pool '+pool_id+' has invalid pg_size');
@@ -658,6 +675,18 @@ class Mon
                console.log('Pool '+pool_id+' has invalid pg_minsize');
            return false;
        }
+        if (pool_cfg.scheme === 'xor' && pool_cfg.parity_chunks && pool_cfg.parity_chunks != 1) // unset/0 parity_chunks is normalized to undefined earlier in this function and is accepted
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
+            return false;
+        }
+        if (pool_cfg.scheme === 'jerasure' && (!pool_cfg.parity_chunks || pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2)) // !parity_chunks rejects undefined/NaN, which would pass both range comparisons
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
+            return false;
+        }
        if (!pool_cfg.pg_count || pool_cfg.pg_count < 1)
        {
            if (warn)
@@ -670,12 +699,6 @@ class Mon
                console.log('Pool '+pool_id+' has empty name');
            return false;
        }
-        if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated')
-        {
-            if (warn)
-                console.log('Pool '+pool_id+' has invalid coding scheme (only "xor" and "replicated" are allowed)');
-            return false;
-        }
        if (pool_cfg.max_osd_combinations < 100)
        {
            if (warn)
@@ -739,6 +762,24 @@ class Mon
        {
            // Something has changed
            const etcd_request = { compare: [], success: [] };
+            for (const pool_id in (this.state.config.pgs||{}).items||{})
+            {
+                if (!this.state.config.pools[pool_id])
+                {
+                    // Pool deleted. Delete all PGs, but first stop them.
+                    if (!await this.stop_all_pgs(pool_id))
+                    {
+                        this.schedule_recheck();
+                        return;
+                    }
+                    const prev_pgs = [];
+                    for (const pg in this.state.config.pgs.items[pool_id]||{})
+                    {
+                        prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
+                    }
+                    this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, [], []);
+                }
+            }
            for (const pool_id in this.state.config.pools)
            {
                const pool_cfg = this.state.config.pools[pool_id];
@@ -1228,4 +1269,7 @@ function sha1hex(str)
    return hash.digest('hex');
 }

+Mon.etcd_allow = etcd_allow;
+Mon.etcd_tree = etcd_tree;
+
 module.exports = Mon;
--- a/mon/simple-offsets.js
+++ b/mon/simple-offsets.js
@@ -4,6 +4,7 @@
 // Simple tool to calculate journal and metadata offsets for a single device
 // Will be replaced by smarter tools in the future

+const fs = require('fs').promises;
 const child_process = require('child_process');

 async function run()
@@ -15,6 +16,7 @@ async function run()
        device_block_size: 4096,
        journal_offset: 0,
        device_size: 0,
+        format: 'text',
    };
    for (let i = 2; i < process.argv.length; i++)
    {
@@ -24,7 +26,22 @@ async function run()
            i++;
        }
    }
-    const device_size = Number(options.device_size || await system("blockdev --getsize64 "+options.device));
+    if (!options.device)
+    {
+        process.stderr.write('USAGE: nodejs '+process.argv[1]+' --device /dev/sdXXX\n');
+        process.exit(1);
+    }
+    options.device_size = Number(options.device_size);
+    let device_size = options.device_size;
+    if (!device_size)
+    {
+        const st = await fs.stat(options.device);
+        options.device_block_size = st.blksize;
+        if (st.isBlockDevice())
+            device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device))
+        else
+            device_size = st.size;
+    }
    if (!device_size)
    {
        process.stderr.write('Failed to get device size\n');
@@ -32,25 +49,45 @@ async function run()
    }
    options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
    const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
-    const entries_per_block = Math.floor(options.device_block_size / (24 + options.object_size/options.bitmap_granularity/8));
+    const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
    const object_count = Math.floor((device_size-meta_offset)/options.object_size);
    const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size;
    const data_offset = meta_offset + meta_size;
    const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB"
        : Math.round(meta_size/1024/1024*100)/100+" MB");
-    process.stdout.write(
-        `Metadata size: ${meta_size_fmt}\n`+
-        `Options for the OSD:\n`+
-        `    --journal_offset ${options.journal_offset}\n`+
-        `    --meta_offset ${meta_offset}\n`+
-        `    --data_offset ${data_offset}\n`+
-        (options.device_size ? `    --data_size ${device_size-data_offset}\n` : '')
-    );
+    if (options.format == 'text' || options.format == 'options')
+    {
+        if (options.format == 'text')
+        {
+            process.stderr.write(
+                `Metadata size: ${meta_size_fmt}\n`+
+                `Options for the OSD:\n`
+            );
+        }
+        process.stdout.write(
+            `    --data_device ${options.device}\n`+
+            `    --journal_offset ${options.journal_offset}\n`+
+            `    --meta_offset ${meta_offset}\n`+
+            `    --data_offset ${data_offset}\n`+
+            (options.device_size ? `    --data_size ${device_size-data_offset}\n` : '')
+        );
+    }
+    else if (options.format == 'env')
+    {
+        process.stdout.write(
+            `journal_offset=${options.journal_offset}\n`+
+            `meta_offset=${meta_offset}\n`+
+            `data_offset=${data_offset}\n`+
+            `data_size=${device_size-data_offset}\n`
+        );
+    }
+    else
+        throw new Error('Unknown format: '+options.format); // rejects run(); the run().catch() handler logs it to stderr and exits with status 1, keeping stdout clean
 }

 function system(cmd)
 {
-    return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err) : ok(stdout))));
+    return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err.message) : ok(stdout))));
 }

-run().catch(console.error);
+run().catch(err => { console.error(err); process.exit(1); });
--- a/msgr_receive.cpp
+++ b/msgr_receive.cpp
@@ -9,6 +9,10 @@ void osd_messenger_t::read_requests()
    {
        int peer_fd = read_ready_clients[i];
        osd_client_t *cl = clients[peer_fd];
+        if (cl->read_msg.msg_iovlen)
+        {
+            continue;
+        }
        if (cl->read_remaining < receive_buffer_size)
        {
            cl->read_iov.iov_base = cl->in_buf;
@@ -29,6 +33,7 @@ void osd_messenger_t::read_requests()
            io_uring_sqe* sqe = ringloop->get_sqe();
            if (!sqe)
            {
+                cl->read_msg.msg_iovlen = 0;
                read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
                return;
            }
@@ -52,6 +57,7 @@ void osd_messenger_t::read_requests()
 bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
 {
    bool ret = false;
+    cl->read_msg.msg_iovlen = 0;
    cl->refs--;
    if (cl->peer_state == PEER_STOPPED)
    {
@@ -160,8 +166,14 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
    {
        if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
            return handle_reply_hdr(cl);
-        else
+        else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
            handle_op_hdr(cl);
+        else
+        {
+            printf("Received garbage: magic=%lx id=%lu opcode=%lx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
+            stop_client(cl->peer_fd);
+            return false;
+        }
    }
    else if (cl->read_state == CL_READ_DATA)
    {
--- a/msgr_send.cpp
+++ b/msgr_send.cpp
@@ -46,7 +46,8 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
        to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
        cl->sent_ops[cur_op->req.hdr.id] = cur_op;
    }
-    // Pre-defined send_lists
+    to_outbox.push_back(NULL);
+    // Operation data
    if ((cur_op->op_type == OSD_OP_IN
        ? (cur_op->req.hdr.opcode == OSD_OP_READ ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
@@ -58,17 +59,17 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
        cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
        cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
    {
-        to_outbox.push_back(NULL);
        for (int i = 0; i < cur_op->iov.count; i++)
        {
            assert(cur_op->iov.buf[i].iov_base);
            to_send_list.push_back(cur_op->iov.buf[i]);
-            to_outbox.push_back(i == cur_op->iov.count-1 ? cur_op : NULL);
+            to_outbox.push_back(NULL);
        }
    }
-    else
+    if (cur_op->op_type == OSD_OP_IN)
    {
-        to_outbox.push_back(cur_op);
+        // To free it later
+        to_outbox[to_outbox.size()-1] = cur_op;
    }
    if (!ringloop)
    {
@@ -92,6 +93,10 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
 void osd_messenger_t::measure_exec(osd_op_t *cur_op)
 {
    // Measure execution latency
+    if (cur_op->req.hdr.opcode > OSD_OP_MAX)
+    {
+        return;
+    }
    timespec tv_end;
    clock_gettime(CLOCK_REALTIME, &tv_end);
    stats.op_stat_count[cur_op->req.hdr.opcode]++;
@@ -198,11 +203,8 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
            {
                if (cl->outbox[done])
                {
-                    // Operation fully sent
-                    if (cl->outbox[done]->op_type == OSD_OP_IN)
-                    {
-                        delete cl->outbox[done];
-                    }
+                    // Reply fully sent
+                    delete cl->outbox[done];
                }
                result -= iov.iov_len;
                done++;
--- a/nbd_proxy.cpp
+++ b/nbd_proxy.cpp
@@ -17,6 +17,10 @@
 #include "epoll_manager.h"
 #include "cluster_client.h"

+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0
+#endif
+
 const char *exe_name = NULL;

 class nbd_proxy
--- a/osd_cluster.cpp
+++ b/osd_cluster.cpp
@@ -4,6 +4,7 @@
 #include "osd.h"
 #include "base64.h"
 #include "etcd_state_client.h"
+#include "osd_rmw.h"

 // Startup sequence:
 //   Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
@@ -32,12 +33,26 @@ void osd_t::init_cluster()
            }
            pgs[{ 1, 1 }] = (pg_t){
                .state = PG_PEERING,
+                .scheme = POOL_SCHEME_XOR,
                .pg_cursize = 0,
+                .pg_size = 3,
+                .pg_minsize = 2,
+                .parity_chunks = 1,
                .pool_id = 1,
                .pg_num = 1,
                .target_set = { 1, 2, 3 },
                .cur_set = { 0, 0, 0 },
            };
+            st_cli.pool_config[1] = (pool_config_t){
+                .exists = true,
+                .id = 1,
+                .name = "testpool",
+                .scheme = POOL_SCHEME_XOR,
+                .pg_size = 3,
+                .pg_minsize = 2,
+                .pg_count = 1,
+                .real_pg_count = 1,
+            };
            report_pg_state(pgs[{ 1, 1 }]);
            pg_counts[1] = 1;
        }
@@ -369,6 +384,7 @@ void osd_t::create_osd_state()
        {
            st_cli.load_pgs();
        }
+        report_statistics();
    });
 }

@@ -479,7 +495,11 @@ void osd_t::apply_pg_count()
            }
            if (still_active > 0)
            {
-                printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
+                printf(
+                    "[OSD %lu] PG count change detected for pool %u (new is %lu, old is %u),"
+                    " but %u PG(s) are still active. This is not allowed. Exiting\n",
+                    this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
+                );
                force_stop(1);
                return;
            }
@@ -583,6 +603,7 @@ void osd_t::apply_pg_config()
                    .pg_cursize = 0,
                    .pg_size = pool_item.second.pg_size,
                    .pg_minsize = pool_item.second.pg_minsize,
+                    .parity_chunks = pool_item.second.parity_chunks,
                    .pool_id = pool_id,
                    .pg_num = pg_num,
                    .reported_epoch = pg_cfg.epoch,
@@ -590,6 +611,10 @@ void osd_t::apply_pg_config()
                    .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
                    .target_set = pg_cfg.target_set,
                };
+                if (pg.scheme == POOL_SCHEME_JERASURE)
+                {
+                    use_jerasure(pg.pg_size, pg.pg_size-pg.parity_chunks, true);
+                }
                this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
                pg.print_state();
                if (pg_cfg.cur_primary == this->osd_num)
@@ -778,6 +803,11 @@ void osd_t::report_pg_states()
                    {
                        // Remove offline PGs after reporting their state
+                        // Must run before erase(): erase() invalidates pg_it, which the use_jerasure() arguments read
+                        if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
+                        {
+                            use_jerasure(pg_it->second.pg_size, pg_it->second.pg_size-pg_it->second.parity_chunks, false);
+                        }
                        this->pgs.erase(pg_it);
                    }
                }
            }
--- a/osd_flush.cpp
+++ b/osd_flush.cpp
@@ -166,7 +166,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
    {
        // local
        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t({
+        op->bs_op = new blockstore_op_t((blockstore_op_t){
            .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
            .callback = [this, op, pool_id, pg_num, fb](blockstore_op_t *bs_op)
            {
@@ -188,7 +188,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
        op->op_type = OSD_OP_OUT;
        op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
        op->peer_fd = peer_fd;
-        op->req = {
+        op->req = (osd_any_op_t){
            .sec_stab = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -246,7 +246,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
 {
    op->osd_op = new osd_op_t();
    op->osd_op->op_type = OSD_OP_OUT;
-    op->osd_op->req = {
+    op->osd_op->req = (osd_any_op_t){
        .rw = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
--- a/osd_id.h
+++ b/osd_id.h
@@ -5,6 +5,7 @@

 #define POOL_SCHEME_REPLICATED 1
 #define POOL_SCHEME_XOR 2
+#define POOL_SCHEME_JERASURE 3
 #define POOL_ID_MAX 0x10000
 #define POOL_ID_BITS 16
 #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
--- a/osd_peering.cpp
+++ b/osd_peering.cpp
@@ -307,7 +307,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = cl->peer_fd;
-        op->req = {
+        op->req = (osd_any_op_t){
            .sec_sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -382,7 +382,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = c_cli.osd_peer_fds[role_osd];
-        op->req = {
+        op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
--- a/osd_peering_pg.h
+++ b/osd_peering_pg.h
@@ -75,7 +75,7 @@ struct pg_t
 {
    int state = 0;
    uint64_t scheme = 0;
-    uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0;
+    uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, parity_chunks = 0;
    pool_id_t pool_id = 0;
    pg_num_t pg_num = 0;
    uint64_t clean_count = 0, total_count = 0;
@@ -94,7 +94,7 @@ struct pg_t
    std::vector<osd_num_t> cur_set;
    // same thing in state_dict-like format
    pg_osd_set_t cur_loc_set;
-    // moved object map. by default, each object is considered to reside on the cur_set.
+    // moved object map. by default, each object is considered to reside on cur_set.
    // this map stores all objects that differ.
    // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
    // which is up to ~192 MB per 1 TB in the worst case scenario
--- a/osd_primary.cpp
+++ b/osd_primary.cpp
@@ -16,8 +16,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
 {
    // PG number is calculated from the offset
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
-    // K = pg_minsize in case of EC/XOR, or 1 for replicated pools
+    // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
    pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
+    // FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
    auto pool_cfg_it = st_cli.pool_config.find(pool_id);
    if (pool_cfg_it == st_cli.pool_config.end())
    {
@@ -26,7 +27,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
        return false;
    }
    auto & pool_cfg = pool_cfg_it->second;
-    uint64_t pg_block_size = bs_block_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize);
+    uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
+    uint64_t pg_block_size = bs_block_size * pg_data_size;
    object_id oid = {
        .inode = cur_op->req.rw.inode,
        // oid.stripe = starting offset of the parity stripe
@@ -37,6 +39,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
    {
        // This OSD is not primary for this PG or the PG is inactive
+        // FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
        finish_op(cur_op, -EPIPE);
        return false;
    }
@@ -54,9 +57,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    op_data->oid = oid;
    op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
    op_data->scheme = pool_cfg.scheme;
+    op_data->pg_data_size = pg_data_size;
    cur_op->op_data = op_data;
-    split_stripes((pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_minsize),
-        bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
    pg_it->second.inflight++;
    return true;
 }
@@ -101,7 +104,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
    else if (op_data->st == 2) goto resume_2;
    {
        auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
-        for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize); role++)
+        for (int role = 0; role < op_data->pg_data_size; role++)
        {
            op_data->stripes[role].read_start = op_data->stripes[role].req_start;
            op_data->stripes[role].read_end = op_data->stripes[role].req_end;
@@ -112,24 +115,23 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
-            cur_op->buf = alloc_read_buffer(op_data->stripes,
-                (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize), 0);
+            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver,
-                (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : pg.pg_minsize), pg.cur_set.data(), cur_op);
+                (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op);
            op_data->st = 1;
        }
        else
        {
            // PG may be degraded or have misplaced objects
            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-            if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
+            if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
            {
                finish_op(cur_op, -EIO);
                return;
            }
            // Submit reads
-            op_data->pg_minsize = pg.pg_minsize;
            op_data->pg_size = pg.pg_size;
+            op_data->scheme = pg.scheme;
            op_data->degraded = 1;
            cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
@@ -147,14 +149,17 @@ resume_2:
    if (op_data->degraded)
    {
        // Reconstruct missing stripes
-        // FIXME: Always EC(k+1) by now. Add different coding schemes
        osd_rmw_stripe_t *stripes = op_data->stripes;
-        for (int role = 0; role < op_data->pg_minsize; role++)
+        if (op_data->scheme == POOL_SCHEME_XOR)
+        {
+            reconstruct_stripes_xor(stripes, op_data->pg_size);
+        }
+        else if (op_data->scheme == POOL_SCHEME_JERASURE)
+        {
+            reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size);
+        }
+        for (int role = 0; role < op_data->pg_size; role++)
        {
-            if (stripes[role].read_end != 0 && stripes[role].missing)
-            {
-                reconstruct_stripe_xor(stripes, op_data->pg_size, role);
-            }
            if (stripes[role].req_end != 0)
            {
                // Send buffer in parts to avoid copying
@@ -245,7 +250,7 @@ resume_1:
    else
    {
        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
-            pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
+            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
        if (!cur_op->rmw_buf)
        {
            // Refuse partial overwrite of an incomplete object
@@ -285,7 +290,14 @@ resume_3:
    else
    {
        // Recover missing stripes, calculate parity
-        calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        if (pg.scheme == POOL_SCHEME_XOR)
+        {
+            calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        }
+        else if (pg.scheme == POOL_SCHEME_JERASURE)
+        {
+            calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        }
    }
    // Send writes
    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
@@ -477,7 +489,11 @@ resume_7:
        }
        // Remember PG as dirty to drop the connection when PG goes offline
        // (this is required because of the "lazy sync")
-        c_cli.clients[cur_op->peer_fd]->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
+        auto cl_it = c_cli.clients.find(cur_op->peer_fd);
+        if (cl_it != c_cli.clients.end())
+        {
+            cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
+        }
        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    }
    return true;
--- a/osd_primary.h
+++ b/osd_primary.h
@@ -25,7 +25,7 @@ struct osd_primary_op_data_t
    uint64_t fact_ver = 0;
    uint64_t scheme = 0;
    int n_subops = 0, done = 0, errors = 0, epipe = 0;
-    int degraded = 0, pg_size, pg_minsize;
+    int degraded = 0, pg_size, pg_data_size;
    osd_rmw_stripe_t *stripes;
    osd_op_t *subops = NULL;
    uint64_t *prev_set = NULL;
--- a/osd_primary_subops.cpp
+++ b/osd_primary_subops.cpp
@@ -11,7 +11,7 @@ void osd_t::autosync()
    {
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
-        autosync_op->req = {
+        autosync_op->req = (osd_any_op_t){
            .sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -295,7 +295,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
            uint64_t version = subop->reply.sec_rw.version;
 #ifdef OSD_DEBUG
            uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
-                ? c_cli.clients[subop->peer_fd].osd_num : osd_num;
+                ? c_cli.clients[subop->peer_fd]->osd_num : osd_num;
            printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
 #endif
            if (op_data->fact_ver != 0 && op_data->fact_ver != version)
@@ -510,7 +510,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
        {
            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
            subops[i].op_type = (uint64_t)cur_op;
-            subops[i].bs_op = new blockstore_op_t({
+            subops[i].bs_op = new blockstore_op_t((blockstore_op_t){
                .opcode = BS_OP_STABLE,
                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
                {
--- a/osd_rmw.cpp
+++ b/osd_rmw.cpp
@@ -1,12 +1,18 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.0 (see README.md for details)

+#include <stdexcept>
 #include <string.h>
 #include <assert.h>
+#include <jerasure/reed_sol.h>
+#include <jerasure.h>
+#include <map>
 #include "xor.h"
 #include "osd_rmw.h"
 #include "malloc_or_die.h"

+#define OSD_JERASURE_W 32
+
 static inline void extend_read(uint32_t start, uint32_t end, osd_rmw_stripe_t & stripe)
 {
    if (stripe.read_end == 0)
@@ -75,44 +81,189 @@ void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start,
    }
 }

-void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role)
+void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size)
 {
-    int prev = -2;
-    for (int other = 0; other < pg_size; other++)
+    for (int role = 0; role < pg_size; role++)
    {
-        if (other != role)
+        if (stripes[role].read_end != 0 && stripes[role].missing)
        {
-            if (prev == -2)
+            // Reconstruct missing stripe (XOR k+1)
+            int prev = -2;
+            for (int other = 0; other < pg_size; other++)
            {
-                prev = other;
-            }
-            else if (prev >= 0)
-            {
-                assert(stripes[role].read_start >= stripes[prev].read_start &&
-                    stripes[role].read_start >= stripes[other].read_start);
-                memxor(
-                    stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
-                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
-                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
-                );
-                prev = -1;
-            }
-            else
-            {
-                assert(stripes[role].read_start >= stripes[other].read_start);
-                memxor(
-                    stripes[role].read_buf,
-                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
-                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
-                );
+                if (other != role)
+                {
+                    if (prev == -2)
+                    {
+                        prev = other;
+                    }
+                    else if (prev >= 0)
+                    {
+                        assert(stripes[role].read_start >= stripes[prev].read_start &&
+                            stripes[role].read_start >= stripes[other].read_start);
+                        memxor(
+                            stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
+                            stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                            stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
+                        );
+                        prev = -1;
+                    }
+                    else
+                    {
+                        assert(stripes[role].read_start >= stripes[other].read_start);
+                        memxor(
+                            stripes[role].read_buf,
+                            stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                            stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
+                        );
+                    }
+                }
            }
        }
    }
 }

-int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size)
+struct reed_sol_erased_t
 {
-    for (int role = 0; role < minsize; role++)
+    int *data;
+    int size;
+};
+
+inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
+{
+    // Lexicographic order. Must return a real bool: "return -1" and "return 1" both convert to true
+    for (int i = 0; i < a.size && i < b.size; i++)
+    {
+        if (a.data[i] != b.data[i])
+            return a.data[i] < b.data[i];
+    }
+    // Common prefix is equal: the shorter array sorts first (keeps a strict weak ordering for std::map)
+    return a.size < b.size;
+}
+
+struct reed_sol_matrix_t
+{
+    int refs = 0; // reference count maintained by use_jerasure()
+    int *data; // coding matrix returned by reed_sol_vandermonde_coding_matrix()
+    std::map<reed_sol_erased_t, int*> decodings; // cache: set of erased chunks -> decoding data built by get_jerasure_decoding_matrix()
+};
+
+std::map<uint64_t, reed_sol_matrix_t> matrices; // keyed by pg_size | (pg_minsize << 32)
+
+void use_jerasure(int pg_size, int pg_minsize, bool use) // use=true: take a reference on the coding matrix; use=false: drop one
+{
+    uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32; // low 32 bits: pg_size, high 32 bits: pg_minsize
+    auto rs_it = matrices.find(key);
+    if (rs_it == matrices.end())
+    {
+        if (!use)
+        {
+            return; // no matrix registered for this geometry, nothing to release
+        }
+        int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W); // arguments are k, m, w
+        matrices[key] = (reed_sol_matrix_t){
+            .refs = 0,
+            .data = matrix,
+        };
+        rs_it = matrices.find(key);
+    }
+    rs_it->second.refs += (!use ? -1 : 1);
+    if (rs_it->second.refs <= 0) // last reference dropped: release the matrix and every cached decoding
+    {
+        free(rs_it->second.data);
+        for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
+        {
+            int *data = dec_it->second;
+            rs_it->second.decodings.erase(dec_it++); // post-increment keeps the iterator valid across erase()
+            free(data);
+        }
+        matrices.erase(rs_it);
+    }
+}
+
+reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize) // looks up the matrix registered by use_jerasure(); never creates one
+{
+    uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32; // same key layout as in use_jerasure()
+    auto rs_it = matrices.find(key);
+    if (rs_it == matrices.end())
+    {
+        throw std::runtime_error("jerasure matrix not initialized");
+    }
+    return &rs_it->second; // points into the global matrices map
+}
+
+// Returns the cached decoding data for the current set of erased chunks: pg_minsize source chunk ids
+// followed by a pg_minsize x pg_minsize decoding matrix, or NULL if no requested data chunk is missing.
+// Unlike jerasure_matrix_decode(), it doesn't reencode missing coding chunks, doesn't allocate
+// int *erased on every call and caches the decoding matrix. Throws if the matrix can't be built.
+int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
+{
+    int edd = 0; // number of requested data chunks that have to be reconstructed
+    int erased[pg_size] = { 0 };
+    for (int i = 0; i < pg_size; i++)
+        if (stripes[i].read_end == 0 || stripes[i].missing)
+            erased[i] = 1; // chunks that are not being read are treated as erased too
+    for (int i = 0; i < pg_minsize; i++)
+        if (stripes[i].read_end != 0 && stripes[i].missing)
+            edd++;
+    if (edd == 0)
+        return NULL;
+    reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
+    auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size });
+    if (dec_it == matrix->decodings.end())
+    {
+        int *dm_ids = (int*)malloc(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size)); // layout: source ids | decoding matrix | copy of erased[]
+        if (!dm_ids)
+            throw std::bad_alloc();
+        int *decoding_matrix = dm_ids + pg_minsize; // computed only after the NULL check
+        // we always use row_k_ones=1 and w=32 (OSD_JERASURE_W)
+        if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data, erased, decoding_matrix, dm_ids) < 0)
+        {
+            free(dm_ids);
+            throw std::runtime_error("jerasure_make_decoding_matrix() failed");
+        }
+        int *erased_copy = dm_ids + pg_minsize + pg_minsize*pg_minsize; // the map key must outlive the on-stack erased[]
+        memcpy(erased_copy, erased, pg_size*sizeof(int));
+        matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, dm_ids);
+        return dm_ids;
+    }
+    return dec_it->second;
+}
+
+void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize) // rebuilds the read ranges of missing data stripes into their read_buf
+{
+    int *dm_ids = get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
+    if (!dm_ids)
+    {
+        return; // no requested data stripe is missing
+    }
+    int *decoding_matrix = dm_ids + pg_minsize; // the matrix is stored right after the pg_minsize source ids
+    char *data_ptrs[pg_size] = { 0 };
+    for (int role = 0; role < pg_minsize; role++)
+    {
+        if (stripes[role].read_end != 0 && stripes[role].missing)
+        {
+            for (int other = 0; other < pg_size; other++)
+            {
+                if (stripes[other].read_end != 0 && !stripes[other].missing)
+                {
+                    assert(stripes[other].read_start <= stripes[role].read_start);
+                    assert(stripes[other].read_end >= stripes[role].read_end);
+                    data_ptrs[other] = (char*)(stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start)); // align to the missing stripe's read_start
+                }
+            }
+            data_ptrs[role] = (char*)stripes[role].read_buf; // destination of the reconstruction (dest_id = role)
+            jerasure_matrix_dotprod(
+                pg_minsize, OSD_JERASURE_W, decoding_matrix+(role*pg_minsize), dm_ids, role,
+                data_ptrs, data_ptrs+pg_minsize, stripes[role].read_end - stripes[role].read_start
+            );
+        }
+    }
+}
+
+int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size)
+{
+    for (int role = 0; role < pg_minsize; role++)
    {
        if (stripes[role].read_end != 0 && osd_set[role] == 0)
        {
@@ -121,21 +272,21 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int mi
            // We need at least pg_minsize stripes to recover the lost part.
            // FIXME: LRC EC and similar don't require to read all other stripes.
            int exist = 0;
-            for (int j = 0; j < size; j++)
+            for (int j = 0; j < pg_size; j++)
            {
                if (osd_set[j] != 0)
                {
                    extend_read(stripes[role].read_start, stripes[role].read_end, stripes[j]);
                    exist++;
-                    if (exist >= minsize)
+                    if (exist >= pg_minsize)
                    {
                        break;
                    }
                }
            }
-            if (exist < minsize)
+            if (exist < pg_minsize)
            {
-                // Less than minsize stripes are available for this object
+                // Less than pg_minsize stripes are available for this object
                return -1;
            }
        }
@@ -369,19 +520,9 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
    }
 }

-void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
+static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t &start, uint32_t &end)
 {
-    int pg_minsize = pg_size-1;
-    for (int role = 0; role < pg_size; role++)
-    {
-        if (stripes[role].read_end != 0 && stripes[role].missing)
-        {
-            // Reconstruct missing stripe (XOR k+1)
-            reconstruct_stripe_xor(stripes, pg_size, role);
-            break;
-        }
-    }
-    uint32_t start = 0, end = 0;
    if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
    {
        // Required for the next two if()s
@@ -421,6 +562,53 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
            }
        }
    }
+}
+
+static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end) // start/end: range filled in by calc_rmw_parity_copy_mod() in the callers
+{
+    if (write_osd_set != read_osd_set) // pointer comparison: skipped when both arguments are the same array
+    {
+        for (int role = pg_minsize; role < pg_size; role++) // roles from pg_minsize up are the parity chunks
+        {
+            if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size)) // parity target changed and only a part of the chunk was recalculated
+            {
+                // Copy new parity into the read buffer to write it back
+                memcpy(
+                    stripes[role].read_buf + start,
+                    stripes[role].write_buf,
+                    end - start
+                );
+                stripes[role].write_buf = stripes[role].read_buf; // the whole chunk is now written from the read buffer
+                stripes[role].write_start = 0;
+                stripes[role].write_end = chunk_size;
+            }
+        }
+    }
+#ifdef RMW_DEBUG
+    printf("calc_rmw_parity:\n");
+    for (int role = 0; role < pg_size; role++) // debug dump: OSD numbers, request/read/write ranges and buffer addresses of every stripe
+    {
+        auto & s = stripes[role];
+        printf(
+            "Tr=%lu Tw=%lu Q=%x-%x R=%x-%x W=%x-%x Rb=%lx Wb=%lx\n",
+            read_osd_set[role], write_osd_set[role],
+            s.req_start, s.req_end,
+            s.read_start, s.read_end,
+            s.write_start, s.write_end,
+            (uint64_t)s.read_buf,
+            (uint64_t)s.write_buf
+        );
+    }
+#endif
+}
+
+void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
+{
+    int pg_minsize = pg_size-1;
+    reconstruct_stripes_xor(stripes, pg_size);
+    uint32_t start = 0, end = 0;
+    calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
    if (write_osd_set[pg_minsize] != 0 && end != 0)
    {
        // Calculate new parity (XOR k+1)
@@ -449,38 +637,71 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
            }
        }
    }
-    if (write_osd_set != read_osd_set)
+    calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
+}
+
+void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
+{
+    reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
+    reconstruct_stripes_jerasure(stripes, pg_size, pg_minsize);
+    uint32_t start = 0, end = 0;
+    calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
+    if (end != 0)
    {
-        for (int role = pg_minsize; role < pg_size; role++)
+        int i;
+        for (i = pg_minsize; i < pg_size; i++)
        {
-            if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
+            if (write_osd_set[i] != 0)
+                break;
+        }
+        if (i < pg_size)
+        {
+            // Calculate new coding chunks
+            buf_len_t bufs[pg_size][3];
+            int nbuf[pg_size] = { 0 }, curbuf[pg_size] = { 0 };
+            uint32_t positions[pg_size];
+            void *data_ptrs[pg_size] = { 0 };
+            for (int i = 0; i < pg_minsize; i++)
            {
-                // Copy new parity into the read buffer to write it back
-                memcpy(
-                    stripes[role].read_buf + start,
-                    stripes[role].write_buf,
-                    end - start
+                get_old_new_buffers(stripes[i], start, end, bufs[i], nbuf[i]);
+                positions[i] = start;
+            }
+            for (int i = pg_minsize; i < pg_size; i++)
+            {
+                bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
+                positions[i] = start;
+            }
+            uint32_t pos = start;
+            while (pos < end)
+            {
+                uint32_t next_end = end;
+                for (int i = 0; i < pg_size; i++)
+                {
+                    assert(curbuf[i] < nbuf[i]);
+                    assert(bufs[i][curbuf[i]].buf);
+                    data_ptrs[i] = bufs[i][curbuf[i]].buf + pos-positions[i];
+                    uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
+                    if (next_end > this_end)
+                        next_end = this_end;
+                }
+                assert(next_end > pos);
+                for (int i = 0; i < pg_size; i++)
+                {
+                    uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
+                    if (next_end >= this_end)
+                    {
+                        positions[i] += bufs[i][curbuf[i]].len;
+                        curbuf[i]++;
+                    }
+                }
+                jerasure_matrix_encode(
+                    pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data,
+                    (char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
                );
-                stripes[role].write_buf = stripes[role].read_buf;
-                stripes[role].write_start = 0;
-                stripes[role].write_end = chunk_size;
+                pos = next_end;
            }
        }
    }
-#ifdef RMW_DEBUG
-    printf("calc_rmw_xor:\n");
-    for (int role = 0; role < pg_size; role++)
-    {
-        auto & s = stripes[role];
-        printf(
-            "Tr=%lu Tw=%lu Q=%x-%x R=%x-%x W=%x-%x Rb=%lx Wb=%lx\n",
-            read_osd_set[role], write_osd_set[role],
-            s.req_start, s.req_end,
-            s.read_start, s.read_end,
-            s.write_start, s.write_end,
-            (uint64_t)s.read_buf,
-            (uint64_t)s.write_buf
-        );
-    }
-#endif
+    calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
 }
--- a/osd_rmw.h
+++ b/osd_rmw.h
@@ -26,11 +26,13 @@ struct osd_rmw_stripe_t
    bool missing;
 };

+// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
+
 void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t len, osd_rmw_stripe_t *stripes);

-void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role);
+void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size);

-int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size);
+int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size);

 void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);

@@ -38,3 +40,10 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);

 void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
+
+void use_jerasure(int pg_size, int pg_minsize, bool use);
+
+void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize);
+
+void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
--- a/osd_rmw_test.cpp
+++ b/osd_rmw_test.cpp
@@ -18,107 +18,8 @@ void test9();
 void test10();
 void test11();
 void test12();
-
-/***
-
-Cases:
-
-1. split(offset=128K-4K, len=8K)
-   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
-
-2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
-
-3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
-   = { read: [ 0, 128K-4K ] }
-
-4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = {
-     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   + check write2 buffer
-
-5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-
-6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-
-7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity_xor(): {
-     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write1==read1,
-   }
-   + check write1 buffer
-   + check write2 buffer
-
-8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-   + check write2 buffer
-
-9. object recovery case:
-   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
-     input buffer: NULL,
-     rmw buffer: [ read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity_xor(): {
-     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
-     write0==read0,
-   }
-   + check write0 buffer
-
-10. full overwrite/recovery case:
-   calc_rmw(offset=0, len=256K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2 ],
-   }
-   then, after calc_rmw_parity_xor(): all the same
-   + check write2 buffer
-
-10. partial recovery case:
-   calc_rmw(offset=128K, len=128K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
-     write: [ [ 0, 0 ], [ 0, 128K ], [ 0, 128K ] ],
-     input buffer: [ write1 ],
-     rmw buffer: [ write2, read0 ],
-   }
-   then, after calc_rmw_parity_xor(): all the same
-   + check write2 buffer
-
-***/
+void test13();
+void test14();

 int main(int narg, char *args[])
 {
@@ -142,6 +43,10 @@ int main(int narg, char *args[])
    test11();
    // Test 12
    test12();
+    // Test 13
+    test13();
+    // Test 14
+    test14();
    // End
    printf("all ok\n");
    return 0;
@@ -169,6 +74,19 @@ void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
    printf("\n");
 }

+/***
+
+1. split(offset=128K-4K, len=8K)
+   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
+
+   read(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
+
+   cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
+   = { read: [ 0, 128K-4K ] }
+
+***/
+
 void test1()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -188,11 +106,24 @@ void test1()
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
    // Test 1.3
-    stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
+    stripes[0] = (osd_rmw_stripe_t){ .req_start = 128*1024-4096, .req_end = 128*1024 };
    cover_read(0, 128*1024, stripes[0]);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
 }

+/***
+
+4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = {
+     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   + check write2 buffer
+
+***/
+
 void test4()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -226,6 +157,19 @@ void test4()
    free(write_buf);
 }

+/***
+
+5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+
+***/
+
 void test5()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -254,6 +198,19 @@ void test5()
    free(write_buf);
 }

+/***
+
+6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+
+***/
+
 void test6()
 {
    osd_num_t osd_set[3] = { 1, 2, 3 };
@@ -278,6 +235,24 @@ void test6()
    free(write_buf);
 }

+/***
+
+7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity_xor(): {
+     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write1==read1,
+   }
+   + check write1 buffer
+   + check write2 buffer
+
+***/
+
 void test7()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -318,6 +293,19 @@ void test7()
    free(write_buf);
 }

+/***
+
+8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+   + check write2 buffer
+
+***/
+
 void test8()
 {
    osd_num_t osd_set[3] = { 0, 2, 3 };
@@ -355,6 +343,24 @@ void test8()
    free(write_buf);
 }

+/***
+
+9. object recovery case:
+   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     input buffer: NULL,
+     rmw buffer: [ read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity_xor(): {
+     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write0==read0,
+   }
+   + check write0 buffer
+
+***/
+
 void test9()
 {
    osd_num_t osd_set[3] = { 0, 2, 3 };
@@ -395,6 +401,21 @@ void test9()
    free(rmw_buf);
 }

+/***
+
+10. full overwrite/recovery case:
+   calc_rmw(offset=0, len=256K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer
+
+***/
+
 void test10()
 {
    osd_num_t osd_set[3] = { 1, 0, 0 };
@@ -436,6 +457,21 @@ void test10()
    free(write_buf);
 }

+/***
+
+11. partial recovery case:
+   calc_rmw(offset=128K, len=128K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 0, 0 ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write1 ],
+     rmw buffer: [ write2, read0 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer
+
+***/
+
 void test11()
 {
    osd_num_t osd_set[3] = { 1, 0, 0 };
@@ -477,17 +513,32 @@ void test11()
    free(write_buf);
 }

+/***
+
+12. parity recovery case:
+   calc_rmw(offset=0, len=0, read_osd_set=[1,2,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 128K ] ],
+     input buffer: [],
+     rmw buffer: [ write2, read0, read1 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer
+
+***/
+
 void test12()
 {
    osd_num_t osd_set[3] = { 1, 2, 0 };
    osd_num_t write_osd_set[3] = { 1, 2, 3 };
    osd_rmw_stripe_t stripes[3] = { 0 };
-    // Test 11.0
+    // Test 12.0
    split_stripes(2, 128*1024, 0, 0, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
-    // Test 11.1
+    // Test 12.1
    void *rmw_buf = calc_rmw(NULL, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
    assert(rmw_buf);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
@@ -502,7 +553,7 @@ void test12()
    assert(stripes[0].write_buf == NULL);
    assert(stripes[1].write_buf == NULL);
    assert(stripes[2].write_buf == rmw_buf);
-    // Test 11.2
+    // Test 12.2
    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
    set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
@@ -515,3 +566,217 @@ void test12()
    check_pattern(stripes[2].write_buf, 128*1024, PATTERN1^PATTERN2);
    free(rmw_buf);
 }
+
+/***
+
+13. basic jerasure 2+2 test
+   calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0,0], write_set=[1,2,3,4])
+   = {
+     read: [ [ 0, 128K-4K ], [ 4K, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, write3, read0, read1 ],
+   }
+   then, after calc_rmw_parity_jerasure(): all the same
+   then simulate read with read_osd_set=[0,0,3,4] and check read0,read1 buffers
+
+***/
+
+void test13()
+{
+    use_jerasure(4, 2, true);
+    osd_num_t osd_set[4] = { 1, 2, 0, 0 };
+    osd_num_t write_osd_set[4] = { 1, 2, 3, 4 };
+    osd_rmw_stripe_t stripes[4] = { 0 };
+    // Test 13.0 - split an 8K write at offset 128K-4K over the two data stripes
+    void *write_buf = malloc_or_die(8192);
+    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
+    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    // Test 13.1 - calc_rmw: check read/write ranges and the buffer layout
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 4, write_osd_set, 128*1024);
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
+    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+2*128*1024);
+    assert(stripes[1].read_buf == rmw_buf+3*128*1024-4096);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[3].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    assert(stripes[3].write_buf == rmw_buf+128*1024);
+    // Test 13.2 - encode: fill data patterns, run calc_rmw_parity_jerasure, ranges/buffers must not change
+    set_pattern(write_buf, 8192, PATTERN3);
+    set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
+    calc_rmw_parity_jerasure(stripes, 4, 2, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    assert(stripes[3].write_buf == rmw_buf+128*1024);
+    // Test 13.3 - full decode: rebuild stripes 0 and 1 from stripes 2 and 3 only, then verify
+    osd_num_t read_osd_set[4] = { 0, 0, 3, 4 };
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 256*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    for (int role = 0; role < 4; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
+    void *read_buf = alloc_read_buffer(stripes, 4, 0);
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == read_buf+128*1024);
+    assert(stripes[2].read_buf == read_buf+2*128*1024);
+    assert(stripes[3].read_buf == read_buf+3*128*1024);
+    memcpy(read_buf+2*128*1024, rmw_buf, 128*1024);
+    memcpy(read_buf+3*128*1024, rmw_buf+128*1024, 128*1024);
+    reconstruct_stripes_jerasure(stripes, 4, 2);
+    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
+    check_pattern(stripes[1].read_buf, 4096, PATTERN3);
+    check_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
+    free(read_buf);
+    // Test 13.4 - partial decode (only the 1st chunk is requested, stripe 1 gets no buffer) and verify
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 128*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    for (int role = 0; role < 4; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
+    read_buf = alloc_read_buffer(stripes, 4, 0);
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == NULL);
+    assert(stripes[2].read_buf == read_buf+128*1024);
+    assert(stripes[3].read_buf == read_buf+2*128*1024);
+    memcpy(read_buf+128*1024, rmw_buf, 128*1024);
+    memcpy(read_buf+2*128*1024, rmw_buf+128*1024, 128*1024);
+    reconstruct_stripes_jerasure(stripes, 4, 2);
+    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
+    free(read_buf);
+    // Done - free the buffers and call use_jerasure(4, 2, false), mirroring the call at the top
+    free(rmw_buf);
+    free(write_buf);
+    use_jerasure(4, 2, false);
+}
+
+/***
+
+14. basic jerasure 2+1 test
+   calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K-4K ], [ 4K, 128K ], [ 0, 0 ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1 ],
+   }
+   then, after calc_rmw_parity_jerasure(): all the same
+   then simulate read with read_osd_set=[0,2,3] and check read0 buffer
+
+***/
+
+void test14()
+{
+    use_jerasure(3, 2, true);
+    osd_num_t osd_set[3] = { 1, 2, 0 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 14.0 - split an 8K write at offset 128K-4K over the two data stripes
+    void *write_buf = malloc_or_die(8192);
+    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
+    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    // Test 14.1 - calc_rmw: check read/write ranges and the buffer layout
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
+    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+128*1024);
+    assert(stripes[1].read_buf == rmw_buf+2*128*1024-4096);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 14.2 - encode: fill data patterns, run calc_rmw_parity_jerasure, ranges/buffers must not change
+    set_pattern(write_buf, 8192, PATTERN3);
+    set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
+    calc_rmw_parity_jerasure(stripes, 3, 2, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 14.3 - decode: rebuild stripe 0 from stripes 1 and 2, then verify
+    osd_num_t read_osd_set[3] = { 0, 2, 3 };
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 128*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    for (int role = 0; role < 3; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, read_osd_set, 2, 3) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    void *read_buf = alloc_read_buffer(stripes, 3, 0);
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == read_buf+128*1024);
+    assert(stripes[2].read_buf == read_buf+2*128*1024);
+    set_pattern(stripes[1].read_buf, 4096, PATTERN3);
+    set_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
+    memcpy(stripes[2].read_buf, rmw_buf, 128*1024);
+    reconstruct_stripes_jerasure(stripes, 3, 2);
+    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
+    free(read_buf);
+    // Done - free the buffers and call use_jerasure(3, 2, false), mirroring the call at the top
+    free(rmw_buf);
+    free(write_buf);
+    use_jerasure(3, 2, false);
+}
--- a/qemu-3.1-vitastor.patch
+++ b/qemu-3.1-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu-3.1+dfsg/qapi/block-core.json
+===================================================================
+--- qemu-3.1+dfsg.orig/qapi/block-core.json
+++ qemu-3.1+dfsg/qapi/block-core.json
+@@ -2617,7 +2617,7 @@
+ ##
+ { 'enum': 'BlockdevDriver',
+   'data': [ 'blkdebug', 'blklogwrites', 'blkverify', 'bochs', 'cloop',
+-            'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster',
+            'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'vitastor',
+             'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks',
+             'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
+             'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'replication', 'sheepdog',
+@@ -3367,6 +3367,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -3713,6 +3731,7 @@
+       'rbd':        'BlockdevOptionsRbd',
+       'replication':'BlockdevOptionsReplication',
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4158,6 +4177,17 @@
+             '*block-state-zero':    'bool' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVpcSubformat:
+ #
+ # @dynamic: Growing image file
+@@ -4212,6 +4242,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu-3.1+dfsg/scripts/modules/module_block.py
+===================================================================
+--- qemu-3.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-3.1+dfsg/scripts/modules/module_block.py
+@@ -88,6 +88,7 @@ def print_bottom(fheader):
+ output_file = sys.argv[1]
+ with open(output_file, 'w') as fheader:
+     print_top(fheader)
+    add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+     for filename in sys.argv[2:]:
+         if os.path.isfile(filename):
--- a/qemu-4.2-vitastor.patch
+++ b/qemu-4.2-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu/qapi/block-core.json
+===================================================================
+--- qemu.orig/qapi/block-core.json	2020-11-07 22:57:38.932613674 +0000
+++ qemu.orig/qapi/block-core.json	2020-11-07 22:59:49.890722862 +0000
+@@ -2907,7 +2907,7 @@
+             'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
+             'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
+-            'sheepdog',
+            'sheepdog', 'vitastor',
+             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+ 
+ ##
+@@ -3725,6 +3725,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -4084,6 +4102,7 @@
+       'replication': { 'type': 'BlockdevOptionsReplication',
+                        'if': 'defined(CONFIG_REPLICATION)' },
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4461,6 +4480,17 @@
+             '*cluster-size' :   'size' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -4722,6 +4752,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu/scripts/modules/module_block.py
+===================================================================
+--- qemu.orig/scripts/modules/module_block.py	2020-11-07 22:57:38.936613739 +0000
+++ qemu/scripts/modules/module_block.py	2020-11-07 22:59:49.890722862 +0000
+@@ -86,6 +86,7 @@ def print_bottom(fheader):
+ output_file = sys.argv[1]
+ with open(output_file, 'w') as fheader:
+     print_top(fheader)
+    add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+     for filename in sys.argv[2:]:
+         if os.path.isfile(filename):
--- a/qemu-5.0-vitastor.patch
+++ b/qemu-5.0-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu/qapi/block-core.json
+===================================================================
+--- qemu.orig/qapi/block-core.json
+++ qemu/qapi/block-core.json
+@@ -2798,7 +2798,7 @@
+             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
+             'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
+-            'sheepdog',
+            'sheepdog', 'vitastor',
+             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+ 
+ ##
+@@ -3635,6 +3635,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -3995,6 +4013,7 @@
+       'replication': { 'type': 'BlockdevOptionsReplication',
+                        'if': 'defined(CONFIG_REPLICATION)' },
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4365,6 +4384,17 @@
+             '*cluster-size' :   'size' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -4626,6 +4656,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu/scripts/modules/module_block.py
+===================================================================
+--- qemu.orig/scripts/modules/module_block.py
+++ qemu/scripts/modules/module_block.py
+@@ -85,6 +85,7 @@ def print_bottom(fheader):
+ output_file = sys.argv[1]
+ with open(output_file, 'w') as fheader:
+     print_top(fheader)
+    add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+     for filename in sys.argv[2:]:
+         if os.path.isfile(filename):
--- a/qemu-5.1-vitastor.patch
+++ b/qemu-5.1-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu-5.1+dfsg/qapi/block-core.json
+===================================================================
+--- qemu-5.1+dfsg.orig/qapi/block-core.json
+++ qemu-5.1+dfsg/qapi/block-core.json
+@@ -2807,7 +2807,7 @@
+             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
+             'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
+-            'sheepdog',
+            'sheepdog', 'vitastor',
+             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 
+ ##
+@@ -3644,6 +3644,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -3988,6 +4006,7 @@
+       'replication': { 'type': 'BlockdevOptionsReplication',
+                        'if': 'defined(CONFIG_REPLICATION)' },
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4376,6 +4395,17 @@
+             '*cluster-size' :   'size' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -4637,6 +4667,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu-5.1+dfsg/scripts/modules/module_block.py
+===================================================================
+--- qemu-5.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-5.1+dfsg/scripts/modules/module_block.py
+@@ -86,6 +86,7 @@ if __name__ == '__main__':
+     output_file = sys.argv[1]
+     with open(output_file, 'w') as fheader:
+         print_top(fheader)
+        add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+         for filename in sys.argv[2:]:
+             if os.path.isfile(filename):
--- a/qemu_driver.c
+++ b/qemu_driver.c
@@ -3,11 +3,10 @@

 // QEMU block driver

+#define BUILD_DSO
 #define _GNU_SOURCE
 #include "qemu/osdep.h"
-#include "qemu/units.h"
 #include "block/block_int.h"
-#include "block/qdict.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qerror.h"
@@ -15,10 +14,28 @@
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 #include "qemu/option.h"
+
+#if QEMU_VERSION_MAJOR >= 3
+#include "qemu/units.h"
+#include "block/qdict.h"
 #include "qemu/cutils.h"
+#else
+#include "qapi/qmp/qint.h"
+#define qdict_put_int(options, name, num_val) qdict_put_obj(options, name, QOBJECT(qint_from_int(num_val)))
+#define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value)))
+#define qobject_unref QDECREF
+#endif

 #include "qemu_proxy.h"

+void qemu_module_dummy(void)
+{
+}
+
+void DSO_STAMP_FUN(void)
+{
+}
+
 typedef struct VitastorClient
 {
    void *proxy;
@@ -176,12 +193,14 @@ static void vitastor_close(BlockDriverState *bs)
        g_free(client->etcd_prefix);
 }

+#if QEMU_VERSION_MAJOR >= 3
 static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 {
    bsz->phys = 4096;
    bsz->log = 4096;
    return 0;
 }
+#endif

 static int coroutine_fn vitastor_co_create_opts(
 #if QEMU_VERSION_MAJOR >= 4
@@ -208,6 +227,7 @@ out:
    return ret;
 }

+#if QEMU_VERSION_MAJOR >= 3
 static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset,
 #if QEMU_VERSION_MAJOR >= 4
    bool exact,
@@ -231,6 +251,7 @@ static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offse

    return 0;
 }
+#endif

 static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
@@ -244,11 +265,22 @@ static int64_t vitastor_getlength(BlockDriverState *bs)
    return client->size;
 }

+#if QEMU_VERSION_MAJOR >= 3
 static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp)
+#else
+static int vitastor_refresh_limits(BlockDriverState *bs)
+#endif
 {
+#if QEMU_VERSION_MAJOR >= 4
    bs->bl.request_alignment = 4096;
    bs->bl.min_mem_alignment = 4096;
+#else
+    bs->request_alignment = 4096;
+#endif
    bs->bl.opt_mem_alignment = 4096;
+#if QEMU_VERSION_MAJOR < 3
+    return 0;
+#endif
 }

 static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs)
@@ -271,7 +303,12 @@ static void vitastor_co_generic_bh_cb(int retval, void *opaque)
    task->complete = 1;
    if (qemu_coroutine_self() != task->co)
    {
+#if QEMU_VERSION_MAJOR >= 3
        aio_co_wake(task->co);
+#else
+        qemu_coroutine_enter(task->co, NULL);
+        qemu_aio_release(task);
+#endif
    }
 }

@@ -313,6 +350,18 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs, uint64_t offse
    return task.ret;
 }

+#if QEMU_VERSION_MAJOR < 3
+static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
+{
+    return vitastor_co_preadv(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
+}
+
+static int coroutine_fn vitastor_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
+{
+    return vitastor_co_pwritev(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
+}
+#endif
+
 static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
 {
    VitastorClient *client = bs->opaque;
@@ -331,6 +380,7 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
    return task.ret;
 }

+#if QEMU_VERSION_MAJOR >= 3
 static QemuOptsList vitastor_create_opts = {
    .name = "vitastor-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head),
@@ -343,6 +393,16 @@ static QemuOptsList vitastor_create_opts = {
        { /* end of list */ }
    }
 };
+#else
+static QEMUOptionParameter vitastor_create_opts[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+#endif

 static const char *vitastor_strong_runtime_opts[] = {
    "inode",
@@ -363,7 +423,9 @@ static BlockDriver bdrv_vitastor = {
    .bdrv_has_zero_init             = bdrv_has_zero_init_1,
    .bdrv_get_info                  = vitastor_get_info,
    .bdrv_getlength                 = vitastor_getlength,
+#if QEMU_VERSION_MAJOR >= 3
    .bdrv_probe_blocksizes          = vitastor_probe_blocksizes,
+#endif
    .bdrv_refresh_limits            = vitastor_refresh_limits,

    // FIXME: Implement it along with per-inode statistics
@@ -373,12 +435,17 @@ static BlockDriver bdrv_vitastor = {
    .bdrv_close                     = vitastor_close,

    // Option list for the create operation
+#if QEMU_VERSION_MAJOR >= 3
    .create_opts                    = &vitastor_create_opts,
+#else
+    .create_options                 = vitastor_create_opts,
+#endif

    // For qmp_blockdev_create(), used by the qemu monitor / QAPI
    // Requires patching QAPI IDL, thus unimplemented
    //.bdrv_co_create                 = vitastor_co_create,

+#if QEMU_VERSION_MAJOR >= 3
    // For bdrv_create(), used by qemu-img
    .bdrv_co_create_opts            = vitastor_co_create_opts,

@@ -386,6 +453,11 @@ static BlockDriver bdrv_vitastor = {

    .bdrv_co_preadv                 = vitastor_co_preadv,
    .bdrv_co_pwritev                = vitastor_co_pwritev,
+#else
+    .bdrv_co_readv                  = vitastor_co_readv,
+    .bdrv_co_writev                 = vitastor_co_writev,
+#endif
+
    .bdrv_co_flush_to_disk          = vitastor_co_flush,

 #if QEMU_VERSION_MAJOR >= 4
--- a/ringloop.cpp
+++ b/ringloop.cpp
@@ -77,7 +77,10 @@ void ring_loop_t::loop()
            dl.callback(&dl);
        }
        else
+        {
+            printf("Warning: empty callback in SQE\n");
            free_ring_data[free_ring_data_ptr++] = d - ring_datas;
+        }
        io_uring_cqe_seen(&ring, cqe);
    }
    while (get_sqe_queue.size() > 0)
--- a/ringloop.h
+++ b/ringloop.h
@@ -142,7 +142,10 @@ public:
            return NULL;
        struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
        if (sqe)
+        {
+            *sqe = { 0 };
            io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
+        }
        return sqe;
    }
    inline int wait_sqe(std::function<void()> cb)
--- a/rm_inode.cpp
+++ b/rm_inode.cpp
@@ -6,26 +6,38 @@
 * May be included into a bigger "command-line management interface" in the future
 */

+#include <vector>
 #include <algorithm>

 #include "epoll_manager.h"
 #include "cluster_client.h"
 #include "pg_states.h"

-#define RM_NO_LIST 1
-#define RM_LIST_SENT 2
-#define RM_REMOVING 3
-#define RM_END 4
+#define RM_LISTING 1
+#define RM_REMOVING 2
+#define RM_END 3

 const char *exe_name = NULL;

+struct rm_pg_t;
+
 struct rm_pg_osd_t
 {
-    pg_num_t pg_num;
+    rm_pg_t *pg = NULL;
    osd_num_t osd_num;
+    bool sent = false;
+};
+
+struct rm_pg_t
+{
+    pg_num_t pg_num;
+    osd_num_t rm_osd_num;
+    std::vector<rm_pg_osd_t> list_osds;
    int state = 0;
-    obj_ver_id *obj_list = NULL;
-    uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0;
+    int to_list;
+    std::set<object_id> objects;
+    std::set<object_id>::iterator obj_pos;
+    uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
    int in_flight = 0;
 };

@@ -41,11 +53,12 @@ protected:
    cluster_client_t *cli = NULL;
    ring_consumer_t consumer;

-    std::vector<rm_pg_osd_t*> lists;
+    std::vector<rm_pg_t*> lists;
    uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
    uint64_t pgs_to_list = 0;
    bool started = false;
    bool progress = true;
+    bool list_first = false;
    int log_level = 0;

 public:
@@ -62,7 +75,7 @@ public:
            else if (args[i][0] == '-' && args[i][1] == '-')
            {
                const char *opt = args[i]+2;
-                cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
+                cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg-1 ? "1" : args[++i];
            }
        }
        return cfg;
@@ -74,7 +87,7 @@ public:
            "Vitastor inode removal tool\n"
            "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
            "USAGE:\n"
-            "  %s --etcd_address <etcd_address> --pool <pool> --inode <inode>\n",
+            "  %s --etcd_address <etcd_address> --pool <pool> --inode <inode> [--wait-list]\n",
            exe_name
        );
        exit(0);
@@ -105,6 +118,7 @@ public:
            parallel_osds = 4;
        log_level = cfg["log_level"].int64_value();
        progress = cfg["progress"].uint64_value() ? true : false;
+        list_first = cfg["wait-list"].uint64_value() ? true : false;
        // Create client
        ringloop = new ring_loop_t(512);
        epmgr = new epoll_manager_t(ringloop);
@@ -137,21 +151,57 @@ public:
        for (auto & pg_item: pool_cfg.pg_config)
        {
            auto & pg = pg_item.second;
-            if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE)
+            if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
            {
-                // FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command
-                fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first);
+                fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
                continue;
            }
-            rm_pg_osd_t *r = new rm_pg_osd_t();
+            rm_pg_t *r = new rm_pg_t();
            r->pg_num = pg_item.first;
-            r->osd_num = pg.cur_primary;
-            r->state = RM_NO_LIST;
+            r->rm_osd_num = pg.cur_primary;
+            r->state = RM_LISTING;
+            if (pg.cur_state != PG_ACTIVE)
+            {
+                std::set<osd_num_t> all_peers;
+                for (osd_num_t pg_osd: pg.target_set)
+                {
+                    if (pg_osd != 0)
+                    {
+                        all_peers.insert(pg_osd);
+                    }
+                }
+                for (osd_num_t pg_osd: pg.all_peers)
+                {
+                    if (pg_osd != 0)
+                    {
+                        all_peers.insert(pg_osd);
+                    }
+                }
+                for (auto & hist_item: pg.target_history)
+                {
+                    for (auto pg_osd: hist_item)
+                    {
+                        if (pg_osd != 0)
+                        {
+                            all_peers.insert(pg_osd);
+                        }
+                    }
+                }
+                for (osd_num_t peer_osd: all_peers)
+                {
+                    r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false });
+                }
+            }
+            else
+            {
+                r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false });
+            }
+            r->to_list = r->list_osds.size();
            lists.push_back(r);
        }
-        std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b)
+        std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b)
        {
-            return a->osd_num < b->osd_num ? true : false;
+            return a->rm_osd_num < b->rm_osd_num ? true : false;
        });
        pgs_to_list = lists.size();
        started = true;
@@ -160,6 +210,10 @@ public:

    void send_list(rm_pg_osd_t *cur_list)
    {
+        if (cur_list->sent)
+        {
+            return;
+        }
        if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
            cli->msgr.osd_peer_fds.end())
        {
@@ -170,14 +224,14 @@ public:
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
-        op->req = {
+        op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = cli->msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_LIST,
                },
-                .list_pg = cur_list->pg_num,
+                .list_pg = cur_list->pg->pg_num,
                .pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
                .pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
                .min_inode = inode,
@@ -186,62 +240,76 @@ public:
        };
        op->callback = [this, cur_list](osd_op_t *op)
        {
-            pgs_to_list--;
+            cur_list->pg->to_list--;
            if (op->reply.hdr.retval < 0)
            {
-                fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n",
-                    cur_list->osd_num, op->reply.hdr.retval);
-                cli->msgr.stop_client(cur_list->osd_num);
-                delete op;
-                cur_list->state = RM_END;
-                continue_delete();
-                return;
+                fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
+                    pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
            }
-            if (log_level > 0)
+            else
            {
-                printf(
-                    "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
-                    pool_id, cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval
-                );
+                if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
+                {
+                    // Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
+                    printf(
+                        "[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n",
+                        pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count
+                    );
+                }
+                if (log_level > 0)
+                {
+                    printf(
+                        "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
+                        pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
+                    );
+                }
+                for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
+                {
+                    object_id oid = ((obj_ver_id*)op->buf)[i].oid;
+                    oid.stripe = oid.stripe & ~STRIPE_MASK;
+                    cur_list->pg->objects.insert(oid);
+                }
            }
-            cur_list->obj_list = (obj_ver_id*)op->buf;
-            cur_list->obj_count = (uint64_t)op->reply.hdr.retval;
-            cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
-            total_count += cur_list->obj_count;
-            total_prev_pct = 0;
-            // set op->buf to NULL so it doesn't get freed
-            op->buf = NULL;
            delete op;
-            cur_list->state = RM_REMOVING;
+            if (cur_list->pg->to_list <= 0)
+            {
+                cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0;
+                cur_list->pg->obj_pos = cur_list->pg->objects.begin();
+                cur_list->pg->obj_count = cur_list->pg->objects.size();
+                total_count += cur_list->pg->obj_count;
+                total_prev_pct = 0;
+                cur_list->pg->state = RM_REMOVING;
+                pgs_to_list--;
+            }
            continue_delete();
        };
-        cur_list->state = RM_LIST_SENT;
        cli->msgr.outbox_push(op);
+        cur_list->sent = true;
    }

-    void send_ops(rm_pg_osd_t *cur_list)
+    void send_ops(rm_pg_t *cur_list)
    {
-        if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
+        if (cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
            cli->msgr.osd_peer_fds.end())
        {
            // Initiate connection
-            cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
+            cli->msgr.connect_peer(cur_list->rm_osd_num, cli->st_cli.peer_states[cur_list->rm_osd_num]);
            return;
        }
-        while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count)
+        while (cur_list->in_flight < iodepth && cur_list->obj_pos != cur_list->objects.end())
        {
            osd_op_t *op = new osd_op_t();
            op->op_type = OSD_OP_OUT;
-            op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
-            op->req = {
+            op->peer_fd = cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
+            op->req = (osd_any_op_t){
                .rw = {
                    .header = {
                        .magic = SECONDARY_OSD_OP_MAGIC,
                        .id = cli->msgr.next_subop_id++,
                        .opcode = OSD_OP_DELETE,
                    },
-                    .inode = cur_list->obj_list[cur_list->obj_pos].oid.inode,
-                    .offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK),
+                    .inode = cur_list->obj_pos->inode,
+                    .offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK),
                    .len = 0,
                },
            };
@@ -251,7 +319,7 @@ public:
                if (op->reply.hdr.retval < 0)
                {
                    fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
-                        cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval);
+                        cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
                }
                delete op;
                cur_list->obj_done++;
@@ -262,12 +330,10 @@ public:
            cur_list->obj_pos++;
            cur_list->in_flight++;
        }
-        if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count)
+        if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end())
        {
-            free(cur_list->obj_list);
-            cur_list->obj_list = NULL;
            cur_list->obj_count = 0;
-            cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
+            cur_list->obj_done = cur_list->obj_prev_done = 0;
            cur_list->state = RM_END;
        }
    }
@@ -276,6 +342,22 @@ public:
    {
        int par_osd = 0;
        osd_num_t max_seen_osd = 0;
+        bool no_del = false;
+        if (list_first)
+        {
+            int i, n = 0;
+            for (i = 0; i < lists.size(); i++)
+            {
+                if (lists[i]->state == RM_LISTING)
+                {
+                    n++;
+                }
+            }
+            if (n > 0)
+            {
+                no_del = true;
+            }
+        }
        for (int i = 0; i < lists.size(); i++)
        {
            if (lists[i]->state == RM_END)
@@ -284,18 +366,25 @@ public:
                lists.erase(lists.begin()+i, lists.begin()+i+1);
                i--;
            }
-            else if (lists[i]->osd_num > max_seen_osd)
+            else if (lists[i]->rm_osd_num > max_seen_osd)
            {
-                if (lists[i]->state == RM_NO_LIST)
+                if (lists[i]->state == RM_LISTING)
                {
-                    send_list(lists[i]);
+                    for (int j = 0; j < lists[i]->list_osds.size(); j++)
+                    {
+                        send_list(&lists[i]->list_osds[j]);
+                    }
                }
                else if (lists[i]->state == RM_REMOVING)
                {
+                    if (no_del)
+                    {
+                        continue;
+                    }
                    send_ops(lists[i]);
                }
                par_osd++;
-                max_seen_osd = lists[i]->osd_num;
+                max_seen_osd = lists[i]->rm_osd_num;
                if (par_osd >= parallel_osds)
                {
                    break;
--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Vitastor depends on QEMU and FIO headers, but QEMU and FIO don't have -devel packages
+# So we have to copy their headers into the source tarball
+
+set -e
+
+VITASTOR=$(dirname $0)
+VITASTOR=$(realpath "$VITASTOR/..")
+
+if [ -d /opt/rh/gcc-toolset-9 ]; then
+    # CentOS 8
+    EL=8
+    . /opt/rh/gcc-toolset-9/enable
+else
+    # CentOS 7
+    EL=7
+    . /opt/rh/devtoolset-9/enable
+fi
+cd ~/rpmbuild/SPECS
+rpmbuild -bp fio.spec
+perl -i -pe 's/^make V=1/exit 0; make V=1/' qemu*.spec
+rpmbuild -bc qemu*.spec
+perl -i -pe 's/^exit 0; make V=1/make V=1/' qemu*.spec
+cd ~/rpmbuild/BUILD/qemu*/
+rm -rf $VITASTOR/qemu $VITASTOR/fio
+mkdir -p $VITASTOR/qemu/b/qemu
+make -j8 config-host.h
+cp config-host.h $VITASTOR/qemu/b/qemu
+cp -r include $VITASTOR/qemu
+if [ -f qapi-schema.json ]; then
+    # QEMU 2.0
+    make qapi-types.h
+    cp qapi-types.h $VITASTOR/qemu/b/qemu
+else
+    # QEMU 3.0+
+    make qapi
+    cp -r qapi $VITASTOR/qemu/b/qemu
+fi
+cd $VITASTOR
+sh copy-qemu-includes.sh
+rm -rf qemu
+mv qemu-copy qemu
+ln -s ~/rpmbuild/BUILD/fio*/ fio
+sh copy-fio-includes.sh
+rm fio
+mv fio-copy fio
+FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
+QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
+perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
+perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
+tar --transform 's#^#vitastor-0.5.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.4$(rpm --eval '%dist').tar.gz *
--- a/rpm/qemu-el8.Dockerfile
+++ b/rpm/qemu-el8.Dockerfile
@@ -0,0 +1,31 @@
+# Build qemu-kvm packages with the Vitastor patch applied for CentOS 8 inside a container
+# cd ..; podman build -t qemu-el8 -v `pwd`/build:/root/build -f rpm/qemu-el8.Dockerfile .
+
+FROM centos:8
+
+WORKDIR /root
+
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core rpm-build
+RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='centos-advanced-virtualization-source' --source qemu-kvm
+RUN rpm --nomd5 -i qemu*.src.rpm
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=PowerTools --spec qemu-kvm.spec
+
+ADD qemu-*-vitastor.patch /root/vitastor/
+
+RUN set -e; \
+    mkdir -p /root/build/qemu-el8; \
+    rm -rf /root/build/qemu-el8/*; \
+    rpm --nomd5 -i /root/qemu*.src.rpm; \
+    cd ~/rpmbuild/SPECS; \
+    PN=$(grep ^Patch qemu-kvm.spec | tail -n1 | perl -pe 's/Patch(\d+).*/$1/'); \
+    csplit qemu-kvm.spec "/^Patch$PN/"; \
+    cat xx00 > qemu-kvm.spec; \
+    head -n 1 xx01 >> qemu-kvm.spec; \
+    echo "Patch$((PN+1)): qemu-4.2-vitastor.patch" >> qemu-kvm.spec; \
+    tail -n +2 xx01 >> qemu-kvm.spec; \
+    perl -i -pe 's/(^Release:\s*\d+)/$1.vitastor/' qemu-kvm.spec; \
+    cp /root/vitastor/qemu-4.2-vitastor.patch ~/rpmbuild/SOURCES; \
+    rpmbuild --nocheck -ba qemu-kvm.spec; \
+    cp ~/rpmbuild/RPMS/*/*qemu* /root/build/qemu-el8/; \
+    cp ~/rpmbuild/SRPMS/*qemu* /root/build/qemu-el8/
--- a/rpm/qemu-kvm-el7.spec.patch
+++ b/rpm/qemu-kvm-el7.spec.patch
@@ -0,0 +1,257 @@
+--- qemu-kvm.spec.orig	2020-11-09 23:41:03.000000000 +0000
+++ qemu-kvm.spec	2020-12-06 10:44:24.207640963 +0000
+@@ -2,7 +2,7 @@
+ %global SLOF_gittagcommit 899d9883
+ 
+ %global have_usbredir 1
+-%global have_spice    1
+%global have_spice    0
+ %global have_opengl   1
+ %global have_fdt      0
+ %global have_gluster  1
+@@ -56,7 +56,7 @@ Requires: %{name}-block-curl = %{epoch}:
+ Requires: %{name}-block-gluster = %{epoch}:%{version}-%{release} \
+ %endif                                                           \
+ Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release}   \
+-Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release}     \
+#Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release}     \
+ Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
+ 
+ # Macro to properly setup RHEL/RHEV conflict handling
+@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
+ Summary: QEMU is a machine emulator and virtualizer
+ Name: qemu-kvm
+ Version: 4.2.0
+-Release: 29.vitastor%{?dist}.6
+Release: 30.vitastor%{?dist}.6
+ # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
+ Epoch: 15
+ License: GPLv2 and GPLv2+ and CC-BY
+@@ -99,8 +99,8 @@ Source30: kvm-s390x.conf
+ Source31: kvm-x86.conf
+ Source32: qemu-pr-helper.service
+ Source33: qemu-pr-helper.socket
+-Source34: 81-kvm-rhel.rules
+-Source35: udev-kvm-check.c
+#Source34: 81-kvm-rhel.rules
+#Source35: udev-kvm-check.c
+ Source36: README.tests
+ 
+ 
+@@ -825,7 +825,9 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
+ Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
+ # For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
+ Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
+-Patch335: qemu-4.2-vitastor.patch
+Patch335: qemu-use-sphinx-1.2.patch
+Patch336: qemu-config-tcmalloc-warning.patch
+Patch337: qemu-4.2-vitastor.patch
+ 
+ BuildRequires: wget
+ BuildRequires: rpm-build
+@@ -842,7 +844,8 @@ BuildRequires: pciutils-devel
+ BuildRequires: libiscsi-devel
+ BuildRequires: ncurses-devel
+ BuildRequires: libattr-devel
+-BuildRequires: libusbx-devel >= 1.0.22
+BuildRequires: gperftools-devel
+BuildRequires: libusbx-devel >= 1.0.21
+ %if %{have_usbredir}
+ BuildRequires: usbredir-devel >= 0.7.1
+ %endif
+@@ -856,12 +859,12 @@ BuildRequires: virglrenderer-devel
+ # For smartcard NSS support
+ BuildRequires: nss-devel
+ %endif
+-BuildRequires: libseccomp-devel >= 2.4.0
+#Requires: libseccomp >= 2.4.0
+ # For network block driver
+ BuildRequires: libcurl-devel
+ BuildRequires: libssh-devel
+-BuildRequires: librados-devel
+-BuildRequires: librbd-devel
+#BuildRequires: librados-devel
+#BuildRequires: librbd-devel
+ %if %{have_gluster}
+ # For gluster block driver
+ BuildRequires: glusterfs-api-devel
+@@ -955,25 +958,25 @@ hardware for a full system such as a PC
+ 
+ %package -n qemu-kvm-core
+ Summary: qemu-kvm core components
+Requires: gperftools-libs
+ Requires: qemu-img = %{epoch}:%{version}-%{release}
+ %ifarch %{ix86} x86_64
+ Requires: seabios-bin >= 1.10.2-1
+ Requires: sgabios-bin
+-Requires: edk2-ovmf
+ %endif
+ %ifarch aarch64
+ Requires: edk2-aarch64
+ %endif
+ 
+ %ifnarch aarch64 s390x
+-Requires: seavgabios-bin >= 1.12.0-3
+-Requires: ipxe-roms-qemu >= 20170123-1
+Requires: seavgabios-bin >= 1.11.0-1
+Requires: ipxe-roms-qemu >= 20181214-1
+Requires: /usr/share/ipxe.efi
+ %endif
+ %ifarch %{power64}
+ Requires: SLOF >= %{SLOF_gittagdate}-1.git%{SLOF_gittagcommit}
+ %endif
+ Requires: %{name}-common = %{epoch}:%{version}-%{release}
+-Requires: libseccomp >= 2.4.0
+ # For compressed guest memory dumps
+ Requires: lzo snappy
+ %if %{have_kvm_setup}
+@@ -1085,15 +1088,15 @@ This package provides the additional iSC
+ Install this package if you want to access iSCSI volumes.
+ 
+ 
+-%package  block-rbd
+-Summary: QEMU Ceph/RBD block driver
+-Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+-
+-%description block-rbd
+-This package provides the additional Ceph/RBD block driver for QEMU.
+-
+-Install this package if you want to access remote Ceph volumes
+-using the rbd protocol.
+#%package  block-rbd
+#Summary: QEMU Ceph/RBD block driver
+#Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+#
+#%description block-rbd
+#This package provides the additional Ceph/RBD block driver for QEMU.
+#
+#Install this package if you want to access remote Ceph volumes
+#using the rbd protocol.
+ 
+ 
+ %package  block-ssh
+@@ -1117,12 +1120,14 @@ the Secure Shell (SSH) protocol.
+ # --build-id option is used for giving info to the debug packages.
+ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+ 
+-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+#%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+ 
+ %if 0%{have_gluster}
+     %global block_drivers_list %{block_drivers_list},gluster
+ %endif
+ 
+[ -e /usr/bin/sphinx-build ] || ln -s sphinx-build-3 /usr/bin/sphinx-build
+ ./configure  \
+  --prefix="%{_prefix}" \
+  --libdir="%{_libdir}" \
+@@ -1152,15 +1157,15 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+ %else
+   --disable-numa \
+ %endif
+-  --enable-rbd \
+  --disable-rbd \
+ %if 0%{have_librdma}
+   --enable-rdma \
+ %else
+   --disable-rdma \
+ %endif
+   --disable-pvrdma \
+-  --enable-seccomp \
+-%if 0%{have_spice}
+  --disable-seccomp \
+%if %{have_spice}
+   --enable-spice \
+   --enable-smartcard \
+   --enable-virglrenderer \
+@@ -1179,7 +1184,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+ %else
+   --disable-usb-redir \
+ %endif
+-  --disable-tcmalloc \
+  --enable-tcmalloc \
+ %ifarch x86_64
+   --enable-libpmem \
+ %else
+@@ -1193,9 +1198,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+ %endif
+   --python=%{__python3} \
+   --target-list="%{buildarch}" \
+-  --block-drv-rw-whitelist=%{block_drivers_list} \
+   --audio-drv-list= \
+-  --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
+   --with-coroutine=ucontext \
+   --tls-priority=NORMAL \
+   --disable-bluez \
+@@ -1262,7 +1265,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+   --disable-sanitizers \
+   --disable-hvf \
+   --disable-whpx \
+-  --enable-malloc-trim \
+  --disable-malloc-trim \
+   --disable-membarrier \
+   --disable-vhost-crypto \
+   --disable-libxml2 \
+@@ -1308,7 +1311,7 @@ make V=1 %{?_smp_mflags} $buildldflags
+ cp -a %{kvm_target}-softmmu/qemu-system-%{kvm_target} qemu-kvm
+ 
+ gcc %{SOURCE6} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o ksmctl
+-gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check
+#gcc %{SOURCE35} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o udev-kvm-check
+ 
+ %install
+ %define _udevdir %(pkg-config --variable=udevdir udev)
+@@ -1343,8 +1346,8 @@ mkdir -p $RPM_BUILD_ROOT%{testsdir}/test
+ mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests
+ mkdir -p $RPM_BUILD_ROOT%{testsdir}/scripts/qmp
+ 
+-install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir}
+-install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir}
+#install -p -m 0755 udev-kvm-check $RPM_BUILD_ROOT%{_udevdir}
+#install -p -m 0644 %{SOURCE34} $RPM_BUILD_ROOT%{_udevrulesdir}
+ 
+ install -m 0644 scripts/dump-guest-memory.py \
+                 $RPM_BUILD_ROOT%{_datadir}/%{name}
+@@ -1562,6 +1565,8 @@ rm -rf $RPM_BUILD_ROOT%{qemudocdir}/inte
+ # Remove spec
+ rm -rf $RPM_BUILD_ROOT%{qemudocdir}/specs
+ 
+%global __os_install_post %(echo '%{__os_install_post}' | sed -e 's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g')
+
+ %check
+ export DIFF=diff; make check V=1
+ 
+@@ -1645,8 +1650,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
+ %config(noreplace) %{_sysconfdir}/sysconfig/ksm
+ %{_unitdir}/ksmtuned.service
+ %{_sbindir}/ksmtuned
+-%{_udevdir}/udev-kvm-check
+-%{_udevrulesdir}/81-kvm-rhel.rules
+#%{_udevdir}/udev-kvm-check
+#%{_udevrulesdir}/81-kvm-rhel.rules
+ %ghost %{_sysconfdir}/kvm
+ %config(noreplace) %{_sysconfdir}/ksmtuned.conf
+ %dir %{_sysconfdir}/%{name}
+@@ -1711,8 +1716,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
+ %{_libexecdir}/vhost-user-gpu
+ %{_datadir}/%{name}/vhost-user/50-qemu-gpu.json
+ %endif
+-%{_libexecdir}/virtiofsd
+-%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json
+#%{_libexecdir}/virtiofsd
+#%{_datadir}/%{name}/vhost-user/50-qemu-virtiofsd.json
+ 
+ %files -n qemu-img
+ %defattr(-,root,root)
+@@ -1748,8 +1753,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
+ %files block-iscsi
+ %{_libdir}/qemu-kvm/block-iscsi.so
+ 
+-%files block-rbd
+-%{_libdir}/qemu-kvm/block-rbd.so
+#%files block-rbd
+#%{_libdir}/qemu-kvm/block-rbd.so
+ 
+ %files block-ssh
+ %{_libdir}/qemu-kvm/block-ssh.so
--- a/rpm/qemu-kvm.spec.patch
+++ b/rpm/qemu-kvm.spec.patch
@@ -0,0 +1,29 @@
+--- qemu-kvm.spec	2020-12-05 13:13:54.388623517 +0000
+++ qemu-kvm.spec	2020-12-05 13:13:58.728696598 +0000
+@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
+ Summary: QEMU is a machine emulator and virtualizer
+ Name: qemu-kvm
+ Version: 4.2.0
+-Release: 29%{?dist}.6
+Release: 29.vitastor%{?dist}.6
+ # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
+ Epoch: 15
+ License: GPLv2 and GPLv2+ and CC-BY
+@@ -825,6 +825,7 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
+ Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
+ # For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
+ Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
+Patch335: qemu-4.2-vitastor.patch
+ 
+ BuildRequires: wget
+ BuildRequires: rpm-build
+@@ -1192,9 +1193,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+ %endif
+   --python=%{__python3} \
+   --target-list="%{buildarch}" \
+-  --block-drv-rw-whitelist=%{block_drivers_list} \
+   --audio-drv-list= \
+-  --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
+   --with-coroutine=ucontext \
+   --tls-priority=NORMAL \
+   --disable-bluez \
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -0,0 +1,47 @@
+# Build packages for CentOS 7 inside a container
+# cd ..; podman build -t vitastor-el7 -v `pwd`/build:/root/build -f rpm/vitastor-el7.Dockerfile .
+# localedef -i ru_RU -f UTF-8 ru_RU.UTF-8
+
+FROM centos:7
+
+WORKDIR /root
+
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
+RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
+RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel qemu-kvm fio rh-nodejs12 jerasure-devel gf-complete-devel
+RUN yumdownloader --disablerepo=centos-sclo-rh --source qemu-kvm
+RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
+RUN rpm --nomd5 -i qemu*.src.rpm
+RUN rpm --nomd5 -i fio*.src.rpm
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing qemu-kvm.spec
+RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing fio.spec
+
+ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
+
+RUN set -e; \
+    rpm -i liburing*.src.rpm; \
+    cd ~/rpmbuild/SPECS/; \
+    . /opt/rh/devtoolset-9/enable; \
+    rpmbuild -ba liburing.spec; \
+    mkdir -p /root/build/liburing-el7; \
+    rm -rf /root/build/liburing-el7/*; \
+    cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el7/; \
+    cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el7/
+
+RUN rpm -i `ls /root/build/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
+
+ADD . /root/vitastor
+
+RUN set -e; \
+    cd /root/vitastor/rpm; \
+    sh build-tarball.sh; \
+    cp /root/vitastor-0.5.4.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
+    cd ~/rpmbuild/SPECS/; \
+    rpmbuild -ba vitastor.spec; \
+    mkdir -p /root/build/vitastor-el7; \
+    rm -rf /root/build/vitastor-el7/*; \
+    cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el7/; \
+    cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el7/
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -0,0 +1,63 @@
+Name:           vitastor
+Version:        0.5.4
+Release:        2%{?dist}
+Summary:        Vitastor, a fast software-defined clustered block storage
+
+License:        Vitastor Network Public License 1.0
+URL:            https://vitastor.io/
+Source0:        vitastor-0.5.4.el7.tar.gz
+
+BuildRequires:  liburing-devel >= 0.6
+BuildRequires:  gperftools-devel
+BuildRequires:  devtoolset-9-gcc-c++
+BuildRequires:  rh-nodejs12
+BuildRequires:  rh-nodejs12-npm
+BuildRequires:  jerasure-devel
+BuildRequires:  gf-complete-devel
+Requires:       fio = 3.7-1.el7
+Requires:       qemu-kvm = 2.0.0-1.el7.6
+Requires:       rh-nodejs12
+Requires:       rh-nodejs12-npm
+Requires:       liburing >= 0.6
+Requires:       libJerasure2
+Requires:       lpsolve
+
+%description
+Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
+architecturally similar to Ceph which means strong consistency, primary-replication,
+symmetric clustering and automatic data distribution over any number of drives of any
+size with configurable redundancy (replication or erasure codes/XOR).
+
+
+%prep
+%setup -q
+
+
+%build
+. /opt/rh/devtoolset-9/enable
+make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+
+
+%install
+rm -rf $RPM_BUILD_ROOT
+%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+. /opt/rh/rh-nodejs12/enable
+cd mon
+npm install
+cd ..
+mkdir -p %buildroot/usr/lib/vitastor
+cp -r mon %buildroot/usr/lib/vitastor/mon
+
+
+%files
+%doc
+%_bindir/vitastor-dump-journal
+%_bindir/vitastor-nbd
+%_bindir/vitastor-osd
+%_bindir/vitastor-rm
+%_libdir/qemu-kvm/block-vitastor.so
+%_libdir/vitastor
+/usr/lib/vitastor
+
+
+%changelog
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -0,0 +1,45 @@
+# Build packages for CentOS 8 inside a container
+# cd ..; podman build -t vitastor-el8 -v `pwd`/build:/root/build -f rpm/vitastor-el8.Dockerfile .
+
+FROM centos:8
+
+WORKDIR /root
+
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core
+RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
+RUN dnf --enablerepo='centos-advanced-virtualization' -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel qemu-kvm fio nodejs rpm-build jerasure-devel gf-complete-devel
+RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='vitastor' --source qemu-kvm
+RUN dnf download --source fio
+RUN rpm --nomd5 -i qemu*.src.rpm
+RUN rpm --nomd5 -i fio*.src.rpm
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=PowerTools --spec qemu-kvm.spec
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=PowerTools --spec fio.spec
+
+ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
+
+RUN set -e; \
+    rpm -i liburing*.src.rpm; \
+    cd ~/rpmbuild/SPECS/; \
+    . /opt/rh/gcc-toolset-9/enable; \
+    rpmbuild -ba liburing.spec; \
+    mkdir -p /root/build/liburing-el8; \
+    rm -rf /root/build/liburing-el8/*; \
+    cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el8/; \
+    cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el8/
+# Install the non-debug liburing RPMs produced by the step above (written to liburing-el8, not liburing-el7)
+RUN rpm -i `ls /root/build/liburing-el8/liburing-*.x86_64.rpm | grep -v debug`
+
+ADD . /root/vitastor
+
+RUN set -e; \
+    cd /root/vitastor/rpm; \
+    sh build-tarball.sh; \
+    cp /root/vitastor-0.5.4.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
+    cd ~/rpmbuild/SPECS/; \
+    rpmbuild -ba vitastor.spec; \
+    mkdir -p /root/build/vitastor-el8; \
+    rm -rf /root/build/vitastor-el8/*; \
+    cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el8/; \
+    cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el8/
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -0,0 +1,60 @@
+Name:           vitastor
+Version:        0.5.4
+Release:        2%{?dist}
+Summary:        Vitastor, a fast software-defined clustered block storage
+
+License:        Vitastor Network Public License 1.0
+URL:            https://vitastor.io/
+Source0:        vitastor-0.5.4.el8.tar.gz
+
+BuildRequires:  liburing-devel >= 0.6
+BuildRequires:  gperftools-devel
+BuildRequires:  gcc-toolset-9-gcc-c++
+BuildRequires:  nodejs >= 10
+BuildRequires:  jerasure-devel
+BuildRequires:  gf-complete-devel
+Requires:       fio = 3.7-3.el8
+Requires:       qemu-kvm = 4.2.0-29.el8.6
+Requires:       nodejs >= 10
+Requires:       liburing >= 0.6
+Requires:       libJerasure2
+Requires:       lpsolve
+
+%description
+Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
+architecturally similar to Ceph which means strong consistency, primary-replication,
+symmetric clustering and automatic data distribution over any number of drives of any
+size with configurable redundancy (replication or erasure codes/XOR).
+
+
+%prep
+%setup -q
+
+
+%build
+. /opt/rh/gcc-toolset-9/enable
+make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+
+
+%install
+rm -rf $RPM_BUILD_ROOT
+%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+cd mon
+npm install
+cd ..
+mkdir -p %buildroot/usr/lib/vitastor
+cp -r mon %buildroot/usr/lib/vitastor
+
+
+%files
+%doc
+%_bindir/vitastor-dump-journal
+%_bindir/vitastor-nbd
+%_bindir/vitastor-osd
+%_bindir/vitastor-rm
+%_libdir/qemu-kvm/block-vitastor.so
+%_libdir/vitastor
+/usr/lib/vitastor
+
+
+%changelog
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+if [ ! "$BASH_VERSION" ] ; then
+    echo "Use bash to run this script ($0)" 1>&2
+    exit 1
+fi
+
+# Print the message in bold red, dump the etcd keys under the /vitastor prefix
+# into ./testdata/etcd-dump.txt for later inspection, then exit with status 1.
+# $1 - message text; printed verbatim (no word splitting or glob expansion).
+format_error()
+{
+    printf '\033[1;31m%s\033[m\n' "$1"
+    $ETCDCTL get --prefix /vitastor > ./testdata/etcd-dump.txt
+    exit 1
+}
+# Print the message in bold green.
+# $1 - message text; printed verbatim (no word splitting or glob expansion).
+format_green()
+{
+    printf '\033[1;32m%s\033[m\n' "$1"
+}
+
+set -e -x
+
+trap 'kill -9 $(jobs -p)' EXIT
+
+ETCD=${ETCD:-etcd}
+ETCD_PORT=${ETCD_PORT:-12379}
+
+# Recreate ./testdata from scratch and make one 1 GiB image file per test OSD:
+# dd seeks to the last 1 KiB block of each file and writes only that block.
+rm -rf ./testdata
+mkdir -p ./testdata
+for osd_num in 1 2 3; do
+    dd if=/dev/zero of=./testdata/test_osd${osd_num}.bin bs=1024 count=1 seek=$((1024*1024-1))
+done
+
+$ETCD -name etcd_test --data-dir ./testdata/etcd \
+    --advertise-client-urls http://127.0.0.1:$ETCD_PORT --listen-client-urls http://127.0.0.1:$ETCD_PORT \
+    --initial-advertise-peer-urls http://127.0.0.1:$((ETCD_PORT+1)) --listen-peer-urls http://127.0.0.1:$((ETCD_PORT+1)) \
+    --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision &>./testdata/etcd.log &
+ETCD_PID=$!
+ETCD_URL=127.0.0.1:$ETCD_PORT/v3
+ETCDCTL="${ETCD}ctl --endpoints=http://$ETCD_URL"
+
+./osd --osd_num 1 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
+OSD1_PID=$!
+./osd --osd_num 2 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) &>./testdata/osd2.log &
+OSD2_PID=$!
+./osd --osd_num 3 --bind_address 127.0.0.1 --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) &>./testdata/osd3.log &
+OSD3_PID=$!
+
+cd mon
+npm install
+cd ..
+node mon/mon-main.js --etcd_url http://$ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log &
+MON_PID=$!
+
+$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1,"pg_count":1,"failure_domain":"osd"}}'
+
+sleep 2
+
+if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and (.[0].items["1"]["1"].osd_set | sort) == ["1","2","3"]'); then
+    format_error "FAILED: 1 PG NOT CONFIGURED"
+fi
+
+if ! ($ETCDCTL get /vitastor/pg/state/1/1 --print-value-only | jq -s -e '(. | length) != 0 and .[0].state == ["active"]'); then
+    format_error "FAILED: 1 PG NOT UP"
+fi
+
+# Append one "leak:<name>" line per entry to the suppression file that the
+# LSAN_OPTIONS setting of the fio run below points at.
+printf 'leak:%s\n' fio tcmalloc ceph librbd _M_mutate _M_assign >> testdata/lsan-suppress.txt
+#LSAN_OPTIONS=suppressions=`pwd`/testdata/lsan-suppress.txt LD_PRELOAD=libasan.so.5 \
+#    fio -thread -name=test -ioengine=./libfio_sec_osd.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M
+
+LSAN_OPTIONS=suppressions=`pwd`/testdata/lsan-suppress.txt LD_PRELOAD=libasan.so.5 \
+    fio -thread -name=test -ioengine=./libfio_cluster.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=1G -cluster_log_level=10
+
+format_green OK
--- a/test-build-el7.sh
+++ b/test-build-el7.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Cheatsheet for CentOS 7 packaging (not a build script)
+
+set -e
+rm -f /etc/yum.repos.d/CentOS-Media.repo
+yum -y --enablerepo=extras install centos-release-scl epel-release
+yum -y --enablerepo='*' install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel
+yumdownloader --source qemu
+yumdownloader --source fio
+yum-builddep -y --enablerepo='*' qemu
+yum -y install rpm-build
+. /opt/rh/devtoolset-9/enable
+rpm --nomd5 -i qemu*.src.rpm
+rpm --nomd5 -i fio*.src.rpm
+cd ~/rpmbuild/SPECS
+rpmbuild -bp fio.spec
+perl -i -pe 's/^make V=1/exit 1; make V=1/' qemu.spec
+rpmbuild -bc qemu.spec
+perl -i -pe 's/^exit 1; make V=1/make V=1/' qemu.spec
+cd ~/rpmbuild/BUILD/qemu*/
+make qapi-types.h
+mkdir -p ~/vitastor/qemu/b/qemu
+cp config-host.h ~/vitastor/qemu/b/qemu 
+cp qapi-types.h ~/vitastor/qemu/b/qemu
+cp -r include ~/vitastor/qemu
+cd ~/vitastor
+sh copy-qemu-includes.sh
+mv qemu qemu-old
+mv qemu-copy qemu
+ln -s ~/rpmbuild/BUILD/fio*/ fio
+sh copy-fio-includes.sh
+rm fio
+mv fio-copy fio
--- a/test_pattern.h
+++ b/test_pattern.h
@@ -12,4 +12,4 @@
 #define PATTERN3 0x426bd7854eb08509

 #define set_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { *(uint64_t*)((void*)buf + i) = pattern; }
-#define check_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { assert(*(uint64_t*)(buf + i) == pattern); }
+// Compare the uint64_t at (buf) + i with pattern for i = 0, 8, 16, ... < len; print the first mismatching offset (hex) and fail the assertion if any differs.
+#define check_pattern(buf, len, pattern) { uint64_t bad = UINT64_MAX; for (uint64_t i = 0; i < (len); i += 8) { if ((*(uint64_t*)((buf) + i)) != (pattern)) { bad = i; break; } } if (bad != UINT64_MAX) { printf("mismatch at %llx\n", (unsigned long long)bad); } assert(bad == UINT64_MAX); }
--- a/test_shit.cpp
+++ b/test_shit.cpp
@@ -30,7 +30,7 @@

 #include "blockstore.h"
 #include "blockstore_impl.h"
-#include "osd_peering_pg.h"
+#include "osd_peering_pg.cpp"
 //#include "cpp-btree/btree_map.h"

 static int setup_context(unsigned entries, struct io_uring *ring)
@@ -168,7 +168,7 @@ int main0(int argc, char *argv[])
            },
            .version = 1,
        }] = (dirty_entry){
-            .state = ST_D_SYNCED,
+            .state = BS_ST_SYNCED | BS_ST_BIG_WRITE,
            .flags = 0,
            .location = (uint64_t)i << 17,
            .offset = 0,
Author	SHA1	Message	Date
Vitaliy Filippov	b9e7d31aa1	Release v0.5.4 - Fix a rare hang, more or less reproducible with very slow drives - Fix a hang with the no_same_sector_overwrites mode	2021-02-24 01:40:30 +03:00
Vitaliy Filippov	2d9f09dcb6	Attempt forced trim when stopping an overrun flusher Fixes a rare hang happening in the event of journal space running out without new work to do for flushers except the current sector. The hang could be reproduced more or less consistently with very slow drives.	2021-02-24 01:33:01 +03:00
Vitaliy Filippov	7cc59260c5	Fix no_same_sector_overwrites related bug	2021-02-23 18:50:51 +03:00
Vitaliy Filippov	ca0a11ec85	Release 0.5.3	2021-02-03 00:38:57 +03:00
Vitaliy Filippov	51c0b5afee	Whitelist more leaks	2021-02-02 02:05:41 +03:00
Vitaliy Filippov	e1e01d042e	Rename sector_info.usage_count to flush_count	2021-02-02 01:32:23 +03:00
Vitaliy Filippov	534a4a657e	Rename space_check.sectors_required to sectors_to_write	2021-02-02 01:30:23 +03:00
Vitaliy Filippov	9b5d8b9ad4	Fix multiple-sector journal writes, add assertions to not miss any SQEs	2021-02-02 01:29:11 +03:00
Vitaliy Filippov	e66ed47515	Clear SQEs before returning them to the caller to prevent erroneous double submissions	2021-02-02 01:26:54 +03:00
Vitaliy Filippov	036c6d4c42	Add a simple test case	2021-02-01 19:43:10 +03:00
Vitaliy Filippov	4cb79a3bf8	Allow to calculate simple-offsets for files	2021-02-01 19:43:10 +03:00
Vitaliy Filippov	3bf53754c2	Fix several I/O bugs	2021-02-01 19:43:10 +03:00
Vitaliy Filippov	6023cac361	Do not stop clients before they are connected	2021-02-01 19:31:10 +03:00
Vitaliy Filippov	915d04c446	Allow empty global configuration, report OSD statistics faster	2021-02-01 19:31:10 +03:00
Vitaliy Filippov	21e06ea40d	Fix memory leaks in fio engines	2021-02-01 19:31:10 +03:00
Vitaliy Filippov	9ef7f865b0	Fix incorrect calls to prepare_journal_sector_write() when flushing multiple sectors	2021-02-01 19:31:10 +03:00
Vitaliy Filippov	9dd20a31aa	Do not use pg_minsize in the client code!	2021-02-01 19:31:10 +03:00
Vitaliy Filippov	28be049909	Dump only actual part of the journal by default	2021-01-01 23:04:30 +03:00
Vitaliy Filippov	78fbaacf1f	External jerasure's w into defines In fact, w=8 looks better than w=32, so it may be changed in the future	2020-12-31 19:15:22 +03:00
Vitaliy Filippov	1526c5a213	Add lp_solve into dependencies	2020-12-31 01:32:31 +03:00
Vitaliy Filippov	c7cc414c90	Skip removed descriptors in epoll (this is possible in real clusters)	2020-12-30 17:04:18 +03:00
Vitaliy Filippov	f4ea313707	Fix cl->read_op being freed without calling the completion callback	2020-12-30 16:55:54 +03:00
Vitaliy Filippov	b88b76f316	Parallel usage of multiple network interfaces was a sick fantasy	2020-12-30 00:05:17 +03:00
Vitaliy Filippov	4a17a61d1f	Make rm_inode work with incomplete and degraded objects, allow to wait before deleting objects	2020-12-28 16:38:08 +03:00
Vitaliy Filippov	ccabbbfbcb	For reference: include a spec patch for building QEMU 4.2 or CentOS 7	2020-12-06 15:43:38 +03:00
Vitaliy Filippov	26dac57083	State that jerasure is now supported	2020-12-06 15:25:48 +03:00
Vitaliy Filippov	44a53d8352	Huh. Fix rpath for packages	2020-12-05 20:16:39 +03:00
Vitaliy Filippov	9d80bd2d98	Build with jerasure, split some build scripts	2020-12-05 19:02:23 +03:00
Vitaliy Filippov	322a38a144	Fix non-preserved real_pg_count leading to inability to change pools online	2020-12-04 23:46:48 +03:00
Vitaliy Filippov	1018764c91	Fix write->delete->write bugs, add & fix some debugging output	2020-12-04 23:21:58 +03:00
Vitaliy Filippov	a45e0e5e67	Use custom decoding instead of just jerasure_matrix_decode() - Cache the decoding matrix - Don't do unnecessary erasures->erased conversion during decoding - Avoid extra memory allocations during decoding - Don't always reconstruct coding chunks - Reconstruct chunks one-by-one, without overlapping ranges	2020-12-04 17:43:48 +03:00
Vitaliy Filippov	44656fbf67	Allow writes with low version numbers after a delete	2020-12-04 11:54:41 +03:00
Vitaliy Filippov	089f138e0c	Allow situations where the journal contains a big_write(v1) after delete(v2) and v1 < v2 Fixes a crash in the following scenario: - client issues a delete request (object version is at least 2) - OSD has time to flush it to the metadata, but doesn't have time to move the journal start pointer on disk - client overwrites the same object and it gets the version number 1 again - OSD is restarted and sees delete(v=2), big_write(v=1) in the journal - dirty_db sequence gets broken and OSD crashes with assert("Writes and deletes shouldn't happen at the same time")	2020-12-04 11:47:27 +03:00
Vitaliy Filippov	bcc8e697f9	Delete PGs when deleting pools (All OSD crash with "Online PG count change not allowed" if you try to delete an active pool though)	2020-12-04 11:47:27 +03:00
Vitaliy Filippov	a4c46ba745	Add jerasure EC support (reed_sol_van, others are slower) (not tested yet)	2020-12-04 11:47:27 +03:00
Vitaliy Filippov	5596ad8997	Use custom QEMU build for CentOS 7	2020-12-04 11:47:05 +03:00
Vitaliy Filippov	59c29b0cee	Fix RPATH for CentOS builds, add additional repos into the CentOS installation instructions	2020-12-04 11:47:04 +03:00
Vitaliy Filippov	959089b919	Enable progress_notify=true for etcd watches	2020-11-17 16:29:42 +03:00
Vitaliy Filippov	d3e7749616	Final fixes for packaging	2020-11-10 23:33:07 +03:00
Vitaliy Filippov	b56f8820ec	Container packaging for Debian 11 Bullseye, CentOS 7 and CentOS 8	2020-11-10 00:02:53 +03:00
Vitaliy Filippov	4bd2bd48eb	Build Vitastor packages, too	2020-11-09 14:41:39 +03:00
Vitaliy Filippov	a3fc9f8d7d	Add a Dockerfile to build patched QEMU for Debian (Buster)	2020-11-09 02:30:41 +03:00
Vitaliy Filippov	530975aed7	Make it also build with GCC 8 and on Debian Buster	2020-11-09 00:07:07 +03:00
Vitaliy Filippov	1446aad107	Simple patch for qemu-kvm .spec	2020-11-08 02:14:53 +03:00
Vitaliy Filippov	46479e2456	Add RPM build scripts for CentOS 8	2020-11-08 01:55:17 +03:00
Vitaliy Filippov	e41bee72a5	Lower node.js requirement to 10.x	2020-11-08 01:54:12 +03:00
Vitaliy Filippov	2e0f223ddb	Add RPM build scripts for CentOS 7	2020-11-07 01:52:10 +03:00
Vitaliy Filippov	3be7bc29d8	Make it build with QEMU 2.0, too Also begin to work on rpms	2020-11-06 20:05:00 +03:00
Vitaliy Filippov	0c43ff9daf	Add scripts to copy fio and qemu includes to the source package	2020-11-06 18:40:42 +03:00
Vitaliy Filippov	64d471cf53	Add simple Debian packaging	2020-11-06 18:40:42 +03:00
Vitaliy Filippov	809b2ad8cd	Add install target	2020-11-06 01:12:22 +03:00
Vitaliy Filippov	550d4af151	Rename test.cpp to test_shit.cpp (random shit)	2020-11-06 01:12:22 +03:00
Vitaliy Filippov	cf0f23ab8e	Add patches for QEMU QAPI IDL	2020-11-04 23:30:51 +03:00
Vitaliy Filippov	a516fefa8c	Add qemu_module_dummy and qemu_stamp_xxx to qemu_driver.c	2020-11-04 23:10:29 +03:00