Huh. Fix rpath for packages

Build with jerasure, split some build scripts
Fix non-preserved real_pg_count leading to inability to change pools online
2020-12-05 20:16:39 +03:00 · 2020-12-05 19:02:23 +03:00 · 2020-12-04 23:46:48 +03:00 · 2020-12-04 23:21:58 +03:00 · 2020-12-04 17:43:48 +03:00 · 2020-12-04 11:54:41 +03:00
79 changed files with 3799 additions and 1015 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,17 @@
+.git
+build
+mon/node_modules
+*.o
+*.so
+osd
+stub_osd
+stub_uring_osd
+stub_bench
+osd_test
+dump_journal
+nbd_proxy
+rm_inode
+fio
+qemu
+rpm/*.Dockerfile
+debian/*.Dockerfile
--- a/54
+++ b/54
@@ -1,10 +1,28 @@
+BINDIR ?= /usr/bin
+LIBDIR ?= /usr/lib/x86_64-linux-gnu
+QEMU_PLUGINDIR ?= /usr/lib/x86_64-linux-gnu/qemu
+
 BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o \
 	blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
 # -fsanitize=address
-CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
-all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy
+CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always -I/usr/include/jerasure
+all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
 clean:
-	rm -f *.o
+	rm -f *.o libblockstore.so libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
+
+# Install fio plugins, CLI tools and the QEMU block driver; every path is prefixed with $(DESTDIR)
+.PHONY: install
+install: all
+	mkdir -p $(DESTDIR)$(LIBDIR)/vitastor $(DESTDIR)$(BINDIR) $(DESTDIR)$(QEMU_PLUGINDIR)
+	install -m 0755 libfio_sec_osd.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 libfio_cluster.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 libfio_blockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 libblockstore.so $(DESTDIR)$(LIBDIR)/vitastor/
+	install -m 0755 osd $(DESTDIR)$(BINDIR)/vitastor-osd
+	install -m 0755 dump_journal $(DESTDIR)$(BINDIR)/vitastor-dump-journal
+	install -m 0755 nbd_proxy $(DESTDIR)$(BINDIR)/vitastor-nbd
+	install -m 0755 rm_inode $(DESTDIR)$(BINDIR)/vitastor-rm
+	install -m 0755 qemu_driver.so $(DESTDIR)$(QEMU_PLUGINDIR)/block-vitastor.so

 dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
 	g++ $(CXXFLAGS) -o $@ $< crc32c.o
@@ -12,17 +30,20 @@ dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h
 libblockstore.so: $(BLOCKSTORE_OBJS)
 	g++ $(CXXFLAGS) -o $@ -shared $(BLOCKSTORE_OBJS) -ltcmalloc_minimal -luring
 libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o
-	g++ $(CXXFLAGS) -shared -o $@ fio_engine.o json11.o ./libblockstore.so -ltcmalloc_minimal -luring
+	g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor',-rpath,'$$ORIGIN' -shared -o $@ fio_engine.o json11.o libblockstore.so -ltcmalloc_minimal -luring

 OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
 	osd_primary.o osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o osd_ops.o pg_states.o \
 	osd_rmw.o json11.o base64.o timerfd_manager.o epoll_manager.o
 osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS)
-	g++ $(CXXFLAGS) -o $@ osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
+	g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor',-rpath,'$$ORIGIN' -o $@ osd_main.cpp $(OSD_OBJS) libblockstore.so -ltcmalloc_minimal -luring -lJerasure

 stub_osd: stub_osd.o rw_blocking.o
 	g++ $(CXXFLAGS) -o $@ stub_osd.o rw_blocking.o -ltcmalloc_minimal

+osd_rmw_test: osd_rmw_test.o
+	g++ $(CXXFLAGS) -o $@ osd_rmw_test.o -lJerasure -fsanitize=address
+
 STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
 stub_uring_osd: $(STUB_URING_OSD_OBJS)
 	g++ $(CXXFLAGS) -o $@ -ltcmalloc_minimal $(STUB_URING_OSD_OBJS) -luring
@@ -44,17 +65,20 @@ libfio_cluster.so: fio_cluster.o $(FIO_CLUSTER_OBJS)
 nbd_proxy: nbd_proxy.o $(FIO_CLUSTER_OBJS)
 	g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring

+rm_inode: rm_inode.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring
+
 qemu_driver.o: qemu_driver.c qemu_proxy.h
 	gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
 		-I qemu/include $(CXXFLAGS) -c -o $@ $<

 qemu_driver.so: qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS)
-	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $< $(FIO_CLUSTER_OBJS) qemu_driver.o qemu_proxy.o -luring
+	g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o $@ $(FIO_CLUSTER_OBJS) qemu_driver.o qemu_proxy.o -luring

 test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o
-	g++ $(CXXFLAGS) -o test_blockstore test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
-test: test.cpp osd_peering_pg.o
-	g++ $(CXXFLAGS) -o test test.cpp osd_peering_pg.o -luring -lm
+	g++ $(CXXFLAGS) -Wl,-rpath,'$(LIBDIR)/vitastor',-rpath,'$$ORIGIN' -o test_blockstore test_blockstore.cpp timerfd_interval.o libblockstore.so -ltcmalloc_minimal -luring
+test_shit: test_shit.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o test_shit test_shit.cpp -luring -lm
 test_allocator: test_allocator.cpp allocator.o
 	g++ $(CXXFLAGS) -o test_allocator test_allocator.cpp allocator.o

@@ -99,11 +123,11 @@ epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/fio.h fio/optgroup.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-fio_engine.o: fio_engine.cpp blockstore.h fio/fio.h fio/optgroup.h json11/json11.hpp object_id.h ringloop.h
+fio_engine.o: fio_engine.cpp blockstore.h fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h json11/json11.hpp object_id.h ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-fio_sec_osd.o: fio_sec_osd.cpp fio/fio.h fio/optgroup.h object_id.h osd_id.h osd_ops.h rw_blocking.h
+fio_sec_osd.o: fio_sec_osd.cpp fio/arch/arch.h fio/fio.h fio/optgroup.h fio_headers.h object_id.h osd_id.h osd_ops.h rw_blocking.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
 	g++ $(CXXFLAGS) -c -o $@ $<
@@ -149,6 +173,8 @@ qemu_proxy.o: qemu_proxy.cpp cluster_client.h etcd_state_client.h http_client.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 ringloop.o: ringloop.cpp ringloop.h
 	g++ $(CXXFLAGS) -c -o $@ $<
+rm_inode.o: rm_inode.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
 rw_blocking.o: rw_blocking.cpp rw_blocking.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
@@ -157,12 +183,12 @@ stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
 	g++ $(CXXFLAGS) -c -o $@ $<
-test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
-	g++ $(CXXFLAGS) -c -o $@ $<
 test_allocator.o: test_allocator.cpp allocator.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
 	g++ $(CXXFLAGS) -c -o $@ $<
+test_shit.o: test_shit.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
 timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
 	g++ $(CXXFLAGS) -c -o $@ $<
 timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ breaking changes in the future. However, the following is implemented:
 - Automatic data distribution over OSDs, with support for:
  - Mathematical optimization for better uniformity and less data movement
  - Multiple pools
-  - Placement tree
+  - Placement tree, OSD selection by tags (device classes) and placement root
  - Configurable failure domains
 - Recovery of degraded blocks
 - Rebalancing (data movement between OSDs)
@@ -31,12 +31,12 @@ breaking changes in the future. However, the following is implemented:
 - QEMU driver (built out-of-tree)
 - Loadable fio engine for benchmarks (also built out-of-tree)
 - NBD proxy for kernel mounts
+- Inode removal tool (vitastor-rm)
+- Packaging for Debian and CentOS

 ## Roadmap

-- Packaging for Debian and, probably, CentOS too
 - OSD creation tool (OSDs currently have to be created by hand)
-- Inode deletion tool (currently you can't delete anything :))
 - Other administrative tools
 - Per-inode I/O and space usage statistics
 - jerasure EC support with any number of data and parity drives in a group
@@ -46,6 +46,7 @@ breaking changes in the future. However, the following is implemented:
 - Inode metadata storage in etcd
 - Snapshots and copy-on-write image clones
 - Operation timeouts and better failure detection
+- Scrubbing without checksums (verification of replicas)
 - Checksums
 - SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
 - RDMA and NVDIMM support
@@ -80,7 +81,7 @@ Architectural differences from Ceph:
  per drive you should run multiple OSDs each on a different partition of the drive.
  Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases.
 - Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
-  and data store block size which is 128 KB by default. With 128 KB blocks, metadata should occupy
+  and data store block size which is 128 KB by default. With 128 KB blocks metadata should occupy
  around 512 MB per 1 TB (which is still less than Ceph wants). Journal doesn't have to be big,
  the example test below was conducted with only 16 MB journal. A big journal is probably even
  harmful as dirty write metadata also take some memory.
@@ -152,9 +153,9 @@ I use the following 6 commands with small variations to benchmark any storage:
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=write -runtime=60 -filename=/dev/sdX`
 - Linear read:
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=read -runtime=60 -filename=/dev/sdX`
-- Random write latency (this hurts storages the most):
+- Random write latency (T1Q1, this hurts storages the most):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/sdX`
-- Random read latency:
+- Random read latency (T1Q1):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -rw=randread -runtime=60 -filename=/dev/sdX`
 - Parallel write iops (use numjobs if a single CPU core is insufficient to saturate the load):
  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randwrite -runtime=60 -filename=/dev/sdX`
@@ -205,7 +206,7 @@ Hardware configuration: 4 nodes, each with:

 CPU powersaving was disabled. Both Vitastor and Ceph were configured with 2 OSDs per 1 SSD.

-All of the results below apply to 4 KB blocks.
+All of the results below apply to 4 KB blocks and random access (unless indicated otherwise).

 Raw drive performance:
 - T1Q1 write ~27000 iops (~0.037ms latency)
@@ -232,6 +233,8 @@ Vitastor:
 - T1Q1 read: 6838 iops (0.145ms latency)
 - T2Q64 write: 162000 iops, total CPU usage by OSDs about 3 virtual cores on each node
 - T8Q64 read: 895000 iops, total CPU usage by OSDs about 4 virtual cores on each node
+- Linear write (4M T1Q32): 2800 MB/s
+- Linear read (4M T1Q32): 1500 MB/s

 T8Q64 read test was conducted over 1 larger inode (3.2T) from all hosts (every host was running 2 instances of fio).
 Vitastor has no performance penalties related to running multiple clients over a single inode.
@@ -245,6 +248,24 @@ Vitastor was configured with: `--disable_data_fsync true --immediate_commit all
  --journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024
  --journal_size 16777216`.

+### EC/XOR 2+1
+
+Vitastor:
+- T1Q1 write: 2808 iops (~0.355ms latency)
+- T1Q1 read: 6190 iops (~0.16ms latency)
+- T2Q64 write: 85500 iops, total CPU usage by OSDs about 3.4 virtual cores on each node
+- T8Q64 read: 812000 iops, total CPU usage by OSDs about 4.7 virtual cores on each node
+- Linear write (4M T1Q32): 3200 MB/s
+- Linear read (4M T1Q32): 1800 MB/s
+
+Ceph:
+- T1Q1 write: 730 iops (~1.37ms latency)
+- T1Q1 read: 1500 iops with cold cache (~0.66ms latency), 2300 iops after 2 minute metadata cache warmup (~0.435ms latency)
+- T4Q128 write (4 RBD images): 45300 iops, total CPU usage by OSDs about 30 virtual cores on each node
+- T8Q64 read (4 RBD images): 278600 iops, total CPU usage by OSDs about 40 virtual cores on each node
+- Linear write (4M T1Q32): 1950 MB/s before preallocation, 2500 MB/s after preallocation
+- Linear read (4M T1Q32): 2400 MB/s
+
 ### NBD

 NBD is currently required to mount Vitastor via kernel, but it imposes additional overhead
@@ -256,19 +277,50 @@ Vitastor with single-thread NBD on the same hardware:
 - T1Q1 read: 5518 iops (0.18ms latency)
 - T1Q128 write: 94400 iops
 - T1Q128 read: 103000 iops
-- Linear write (4M T1Q128): 1266 MB/s (compared to 2600 MB/s via fio)
-- Linear read (4M T1Q128): 975 MB/s (compared to 1400 MB/s via fio)
+- Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio)
+- Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio)

-## Building
+## Installation

-- Install Linux kernel 5.4 or newer for io_uring support.
+### Debian
+
+- Trust Vitastor package signing key:
+  `wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -`
+- Add Vitastor package repository to your /etc/apt/sources.list:
+  - Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main`
+  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
+- For Debian 10 (Buster) also enable backports repository:
+  `deb http://deb.debian.org/debian buster-backports main`
+- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64`
+
+### CentOS
+
+- Add Vitastor package repository:
+  - CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm`
+  - CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm`
+- Enable EPEL: `yum/dnf install epel-release`
+- Enable additional CentOS repositories:
+  - CentOS 7: `yum install centos-release-scl`
+  - CentOS 8: `dnf install centos-release-advanced-virtualization`
+- Enable elrepo-kernel:
+  - CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
+  - CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
+- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
+
+### Building from Source
+
+- Install Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly recommended because
+  there is at least one known io_uring hang with 5.4 and an HP SmartArray controller.
 - Install liburing 0.4 or newer and its headers.
 - Install lp_solve.
-- Install etcd.
-- Install node.js 12 or newer.
-- Install gcc and g++ 9.x.
+- Install etcd. Attention: you need a fixed version from here: https://github.com/vitalif/etcd/,
+  branch release-3.4, because there is a bug in upstream etcd which makes Vitastor OSDs fail to
+  move PGs out of "starting" state if you have roughly 500 PGs or more. The custom build
+  will be unnecessary when etcd merges the fix: https://github.com/etcd-io/etcd/pull/12402.
+- Install node.js 10 or newer.
+- Install gcc and g++ 8.x or newer.
 - Clone https://yourcmc.ru/git/vitalif/vitastor/ with submodules.
-- Install QEMU 4.x or 5.x, get its source, begin to build it, stop the build and copy headers:
+- Install QEMU 3.0+, get its source, begin to build it, stop the build and copy headers:
   - `<qemu>/include` &rarr; `<vitastor>/qemu/include`
   - Debian:
      * Use qemu packages from the main repository
@@ -278,13 +330,15 @@ Vitastor with single-thread NBD on the same hardware:
      * Use qemu packages from the Advanced-Virtualization repository. To enable it, run
        `yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
      * `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
-      * `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
+      * For QEMU 3.0+: `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
+      * For QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
   - `config-host.h` and `qapi` are required because they contain generated headers
-- Install fio 3.16, get its source and symlink it into `<vitastor>/fio`. It doesn't currently
-  build with fio 3.20 or newer due to the conflicts between g++ and gcc's atomics. This will
-  be fixed in the future.
+- You can also rebuild QEMU with a patch that makes LD_PRELOAD unnecessary for loading the Vitastor driver.
+  See `qemu-*.*-vitastor.patch`.
+- Install fio 3.7 or later, get its source and symlink it into `<vitastor>/fio`.
 - Build Vitastor with `make -j8`.
-- Copy binaries somewhere.
+- Run `make install` (optionally with `LIBDIR=/usr/lib64 QEMU_PLUGINDIR=/usr/lib64/qemu-kvm`
+  if you're using an RPM-based distro).

 ## Running

@@ -295,12 +349,12 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
  with lazy fsync, but prepare for inferior single-thread latency.
 - Get a fast network (at least 10 Gbit/s).
 - Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
-- Install etcd with `--max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision` options.
-- Create global configuration in etcd: `etcdctl put /vitastor/config/global '{"immediate_commit":"all"}'`
+- Start etcd with `--max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision` options.
+- Create global configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
  (if all your drives have capacitors).
-- Create pool configuration in etcd: `etcdctl put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
-- Calculate offsets for your drives with `node ./mon/simple-offsets.js /dev/sdX`.
-- Make systemd units for your OSDs. Look at `./mon/make-units.sh` for example.
+- Create pool configuration in etcd: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
+- Calculate offsets for your drives with `node /usr/lib/vitastor/mon/simple-offsets.js --device /dev/sdX`.
+- Make systemd units for your OSDs. Look at `/usr/lib/vitastor/mon/make-units.sh` for example.
  Notable configuration variables from the example:
  - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
  - `immediate_commit all` - use this if all your drives are server-grade.
@@ -312,39 +366,42 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
  - `disk_alignment`, `journal_block_size`, `meta_block_size` should be set to the internal
    block size of your SSDs which is 4096 on most drives.
  - `journal_no_same_sector_overwrites true` prevents multiple overwrites of the same journal sector.
-    Some SSDs (like Intel D3-4510) don't like such overwrites so they benefit from this setting.
-    When this setting is set, it is also required to raise `journal_sector_buffer_count` setting,
-    which is the number of dirty journal sectors that may be written to at the same time.
+    Most (99%) SSDs don't need this option. But Intel D3-4510 does because it doesn't like when you
+    overwrite the same sector twice in a short period of time. The setting forces Vitastor to never
+    overwrite the same journal sector twice in a row which makes D3-4510 almost happy. Not totally
+    happy, because overwrites of the same block can still happen in the metadata area... When this
+    setting is set, it is also required to raise `journal_sector_buffer_count` setting, which is the
+    number of dirty journal sectors that may be written to at the same time.
 - `systemctl start vitastor.target` everywhere.
-- Start any number of monitors: `cd mon; node mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
+- Start any number of monitors: `node /usr/lib/vitastor/mon/mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
 - At this point, one of the monitors will configure PGs and OSDs will start them.
-- You can check PG states with `etcdctl get --prefix /vitastor/pg/state`. All PGs should become 'active'.
-- Run tests with (for example): `fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
+- You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
+- Run tests with (for example): `fio -thread -ioengine=/usr/lib/x86_64-linux-gnu/vitastor/libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
 - Upload VM disk image with qemu-img (for example):
  ```
-  LD_PRELOAD=./qemu_driver.so qemu-img convert -f qcow2 debian10.qcow2 -p
+  LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img convert -f qcow2 debian10.qcow2 -p
    -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
  ```
 - Run QEMU with (for example):
  ```
-  LD_PRELOAD=./qemu_driver.so qemu-system-x86_64 -enable-kvm -m 1024
+  LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-system-x86_64 -enable-kvm -m 1024
    -drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
    -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
    -vnc 0.0.0.0:0
  ```
+- Remove inode with (for example):
+  ```
+  vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
+  ```

 ## Known Problems

-- OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET"
-  if you try to load them with very long iodepths because io_uring queue (ring) is limited
-  and OSDs don't check if it fills up.
-- Object deletion requests may currently lead to unfound objects on crashes because
-  proper handling of deletions in a cluster requires a "three-phase cleanup process"
-  and it's currently not implemented. In fact, even though deletion requests are
-  implemented, there's no user tool to delete anything from the cluster yet :).
-  Of course I'll create such tool, but its first implementation will be vulnerable to this issue.
-  It's not a big deal though, because you'll be able to just repeat the deletion request
-  in this case.
+- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
+  deletion because proper handling of object cleanup in a cluster should be "three-phase"
+  and it's currently not implemented. Inode removal tool currently can't handle unclean
+  objects, so incomplete objects become undeletable. This will be fixed in the near future
+  by allowing the inode removal tool to delete unclean objects. Once this is fixed,
+  you'll be able to simply repeat the removal.

 ## Implementation Principles

--- a/blockstore.h
+++ b/blockstore.h
@@ -9,6 +9,7 @@

 #include <stdint.h>

+#include <string>
 #include <map>
 #include <unordered_map>
 #include <functional>
--- a/blockstore_flush.cpp
+++ b/blockstore_flush.cpp
@@ -8,11 +8,14 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
    this->bs = bs;
    this->flusher_count = flusher_count;
    dequeuing = false;
+    trimming = false;
    active_flushers = 0;
    syncing_flushers = 0;
+    // FIXME: allow to configure flusher_start_threshold and journal_trim_interval
    flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
-    journal_trim_interval = flusher_start_threshold;
+    journal_trim_interval = 512;
    journal_trim_counter = 0;
+    trim_wanted = 0;
    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
    co = new journal_flusher_co[flusher_count];
    for (int i = 0; i < flusher_count; i++)
@@ -73,6 +76,9 @@ void journal_flusher_t::loop()

 void journal_flusher_t::enqueue_flush(obj_ver_id ov)
 {
+#ifdef BLOCKSTORE_DEBUG
+    printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+#endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
    {
@@ -84,15 +90,18 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_back(ov.oid);
    }
-    if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
+    if (!dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
    {
        dequeuing = true;
        bs->ringloop->wakeup();
    }
 }

-void journal_flusher_t::unshift_flush(obj_ver_id ov)
+void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
 {
+#ifdef BLOCKSTORE_DEBUG
+    printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+#endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
    {
@@ -102,15 +111,38 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
    else
    {
        flush_versions[ov.oid] = ov.version;
-        flush_queue.push_front(ov.oid);
+        if (!force)
+            flush_queue.push_front(ov.oid);
    }
-    if (!dequeuing && flush_queue.size() >= flusher_start_threshold)
+    if (force)
+        flush_queue.push_front(ov.oid);
+    if (force || !dequeuing && (flush_queue.size() >= flusher_start_threshold || trim_wanted > 0))
    {
        dequeuing = true;
        bs->ringloop->wakeup();
    }
 }

+// Forget a pending flush of <oid>: erase its flush_versions entry and its first flush_queue entry
+void journal_flusher_t::remove_flush(object_id oid)
+{
+#ifdef BLOCKSTORE_DEBUG
+    printf("remove_flush %lx:%lx\n", oid.inode, oid.stripe);
+#endif
+    auto v_it = flush_versions.find(oid);
+    if (v_it == flush_versions.end())
+        return;
+    flush_versions.erase(v_it);
+    for (auto q_it = flush_queue.begin(); q_it != flush_queue.end(); q_it++)
+    {
+        if (*q_it == oid)
+        {
+            flush_queue.erase(q_it);
+            break;
+        }
+    }
+}
+
 void journal_flusher_t::request_trim()
 {
    dequeuing = true;
@@ -118,6 +150,16 @@ void journal_flusher_t::request_trim()
    bs->ringloop->wakeup();
 }

+void journal_flusher_t::mark_trim_possible()
+{
+    // Only act while trim_wanted is positive; otherwise leave the flusher state untouched
+    if (trim_wanted <= 0)
+        return;
+    dequeuing = true;
+    journal_trim_counter++;
+    bs->ringloop->wakeup();
+}
+
 void journal_flusher_t::release_trim()
 {
    trim_wanted--;
@@ -172,9 +214,21 @@ bool journal_flusher_co::loop()
        goto resume_17;
    else if (wait_state == 18)
        goto resume_18;
+    else if (wait_state == 19)
+        goto resume_19;
+    else if (wait_state == 20)
+        goto resume_20;
+    else if (wait_state == 21)
+        goto resume_21;
 resume_0:
    if (!flusher->flush_queue.size() || !flusher->dequeuing)
    {
+        if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
+        {
+            // Attempt forced trim
+            flusher->active_flushers++;
+            goto trim_journal;
+        }
        flusher->dequeuing = false;
        wait_state = 0;
        return true;
@@ -294,12 +348,12 @@ resume_1:
            return false;
        }
        // Writes and deletes shouldn't happen at the same time
-        assert(!(copy_count > 0 || has_writes) || !has_delete);
-        if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
+        assert(!has_writes || !has_delete);
+        if (!has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
        {
            // Nothing to flush
            bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
-            goto trim_journal;
+            goto release_oid;
        }
        if (clean_loc == UINT64_MAX)
        {
@@ -418,7 +472,12 @@ resume_1:
        else
        {
            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
-            assert(new_entry->oid.inode == 0 || new_entry->oid == cur.oid);
+            if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
+            {
+                printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx) with %lx:%lx\n",
+                    clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
+                exit(1);
+            }
            new_entry->oid = cur.oid;
            new_entry->version = cur.version;
            if (!bs->inmemory_meta)
@@ -474,14 +533,35 @@ resume_1:
        }
        // Update clean_db and dirty_db, free old data locations
        update_clean_db();
+#ifdef BLOCKSTORE_DEBUG
+        printf("Flushed %lx:%lx v%lu (%d copies, wr:%d, del:%d), %ld left\n", cur.oid.inode, cur.oid.stripe, cur.version,
+            copy_count, has_writes, has_delete, flusher->flush_queue.size());
+#endif
+    release_oid:
+        repeat_it = flusher->sync_to_repeat.find(cur.oid);
+        if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
+        {
+            // Requeue version
+            flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second }, false);
+        }
+        flusher->sync_to_repeat.erase(repeat_it);
    trim_journal:
        // Clear unused part of the journal every <journal_trim_interval> flushes
        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
        {
            flusher->journal_trim_counter = 0;
-            if (bs->journal.trim())
+            new_trim_pos = bs->journal.get_trim_pos();
+            if (new_trim_pos != bs->journal.used_start)
            {
-                // Update journal "superblock"
+            resume_19:
+                // Wait for other coroutines trimming the journal, if any
+                if (flusher->trimming)
+                {
+                    wait_state = 19;
+                    return false;
+                }
+                flusher->trimming = true;
+                // First update journal "superblock" and only then update <used_start> in memory
                await_sqe(12);
                *((journal_entry_start*)flusher->journal_superblock) = {
                    .crc32 = 0,
@@ -489,7 +569,7 @@ resume_1:
                    .type = JE_START,
                    .size = sizeof(journal_entry_start),
                    .reserved = 0,
-                    .journal_start = bs->journal.used_start,
+                    .journal_start = new_trim_pos,
                };
                ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
                data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
@@ -502,21 +582,28 @@ resume_1:
                    wait_state = 13;
                    return false;
                }
+                if (!bs->disable_journal_fsync)
+                {
+                    await_sqe(20);
+                    my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
+                    data->iov = { 0 };
+                    data->callback = simple_callback_w;
+                resume_21:
+                    if (wait_count > 0)
+                    {
+                        wait_state = 21;
+                        return false;
+                    }
+                }
+                bs->journal.used_start = new_trim_pos;
+#ifdef BLOCKSTORE_DEBUG
+                printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
+#endif
+                flusher->trimming = false;
            }
        }
        // All done
-#ifdef BLOCKSTORE_DEBUG
-        printf("Flushed %lx:%lx v%lu (%d copies, wr:%d, del:%d), %ld left\n", cur.oid.inode, cur.oid.stripe, cur.version,
-            copy_count, has_writes, has_delete, flusher->flush_queue.size());
-#endif
        flusher->active_flushers--;
-        repeat_it = flusher->sync_to_repeat.find(cur.oid);
-        if (repeat_it != flusher->sync_to_repeat.end() && repeat_it->second > cur.version)
-        {
-            // Requeue version
-            flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
-        }
-        flusher->sync_to_repeat.erase(repeat_it);
        wait_state = 0;
        goto resume_0;
    }
@@ -544,7 +631,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        {
            char err[1024];
            snprintf(
-                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu state during flush: %d",
+                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: %d",
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
            );
            throw std::runtime_error(err);
--- a/blockstore_flush.h
+++ b/blockstore_flush.h
@@ -59,6 +59,8 @@ class journal_flusher_co
    uint64_t clean_bitmap_offset, clean_bitmap_len;
    void *new_clean_bitmap;

+    uint64_t new_trim_pos;
+
    // local: scan_dirty()
    uint64_t offset, end_offset, submit_offset, submit_len;

@@ -85,6 +87,7 @@ class journal_flusher_t
    friend class journal_flusher_co;

    int journal_trim_counter, journal_trim_interval;
+    bool trimming;
    void* journal_superblock;

    int active_flushers;
@@ -100,8 +103,10 @@ public:
    ~journal_flusher_t();
    void loop();
    bool is_active();
+    void mark_trim_possible();
    void request_trim();
    void release_trim();
    void enqueue_flush(obj_ver_id oid);
-    void unshift_flush(obj_ver_id oid);
+    void unshift_flush(obj_ver_id oid, bool force);
+    void remove_flush(object_id oid);
 };
--- a/blockstore_impl.cpp
+++ b/blockstore_impl.cpp
@@ -151,8 +151,8 @@ void blockstore_impl_t::loop()
            {
                if (has_writes == 2)
                {
-                    // Some writes could not be submitted
-                    break;
+                    // An earlier write could not be submitted, so skip this one for now
+                    continue;
                }
                dequeue_op = dequeue_write(op);
                has_writes = dequeue_op ? 1 : 2;
@@ -161,8 +161,8 @@ void blockstore_impl_t::loop()
            {
                if (has_writes == 2)
                {
-                    // Some writes could not be submitted
-                    break;
+                    // An earlier write could not be submitted, so skip this delete for now
+                    continue;
                }
                dequeue_op = dequeue_del(op);
                has_writes = dequeue_op ? 1 : 2;
@@ -182,33 +182,19 @@ void blockstore_impl_t::loop()
            }
            else if (op->opcode == BS_OP_STABLE)
            {
-                if (has_writes == 2)
-                {
-                    // Don't submit additional flushes before completing previous LISTs
-                    break;
-                }
                dequeue_op = dequeue_stable(op);
            }
            else if (op->opcode == BS_OP_ROLLBACK)
            {
-                if (has_writes == 2)
-                {
-                    // Don't submit additional flushes before completing previous LISTs
-                    break;
-                }
                dequeue_op = dequeue_rollback(op);
            }
            else if (op->opcode == BS_OP_LIST)
            {
-                // Block LIST operation by previous modifications,
-                // so it always returns a consistent state snapshot
-                if (has_writes == 2 || inflight_writes > 0)
-                    has_writes = 2;
-                else
-                {
-                    process_list(op);
-                    dequeue_op = true;
-                }
+                // LIST doesn't need to be blocked by previous modifications;
+                // it only needs to include all in-progress writes, as they're guaranteed
+                // to be readable and stabilizable/rollbackable by subsequent operations
+                process_list(op);
+                dequeue_op = true;
            }
            if (dequeue_op)
            {
@@ -531,7 +517,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
                    if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
                    {
                        // Then try to replace the last dirty stable version in the second part of the list
-                        if (stable[stable_count-1].oid == dirty_it->first.oid)
+                        if (stable_count > 0 && stable[stable_count-1].oid == dirty_it->first.oid)
                        {
                            stable[stable_count-1].version = dirty_it->first.version;
                        }
--- a/blockstore_impl.h
+++ b/blockstore_impl.h
@@ -30,12 +30,13 @@
 #define BS_ST_BIG_WRITE 0x02
 #define BS_ST_DELETE 0x03

-#define BS_ST_WAIT_BIG 0x10
-#define BS_ST_IN_FLIGHT 0x20
-#define BS_ST_SUBMITTED 0x30
-#define BS_ST_WRITTEN 0x40
-#define BS_ST_SYNCED 0x50
-#define BS_ST_STABLE 0x60
+#define BS_ST_WAIT_DEL 0x10
+#define BS_ST_WAIT_BIG 0x20
+#define BS_ST_IN_FLIGHT 0x30
+#define BS_ST_SUBMITTED 0x40
+#define BS_ST_WRITTEN 0x50
+#define BS_ST_SYNCED 0x60
+#define BS_ST_STABLE 0x70

 #define BS_ST_INSTANT 0x100

@@ -153,6 +154,8 @@ struct blockstore_op_private_t

    // Write
    struct iovec iov_zerofill[3];
+    // Warning: must not have a default value here, because it is written to in blockstore_write.cpp before the constructor is called
+    uint64_t real_version;

    // Sync
    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
@@ -226,7 +229,6 @@ class blockstore_impl_t

    bool live = false, queue_stall = false;
    ring_loop_t *ringloop;
-    int inflight_writes = 0;

    bool stop_sync_submitted;

--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@@ -111,7 +111,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
                {
                    // free the previous block
 #ifdef BLOCKSTORE_DEBUG
-                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i >> block_order);
+                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i);
 #endif
                    bs->data_alloc->set(clean_it->second.location >> block_order, false);
                }
@@ -399,8 +399,7 @@ resume_1:
            }
        }
    }
-    // Trim journal on start so we don't stall when all entries are older
-    bs->journal.trim();
+    bs->flusher->mark_trim_possible();
    bs->journal.dirty_start = bs->journal.next_free;
    printf(
        "Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
@@ -560,9 +559,54 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                printf(
                    "je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
                    je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
-                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
+                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order
                );
 #endif
+                auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
+                    .oid = je->big_write.oid,
+                    .version = UINT64_MAX,
+                });
+                if (dirty_it != bs->dirty_db.begin() && bs->dirty_db.size() > 0)
+                {
+                    dirty_it--;
+                    if (dirty_it->first.oid == je->big_write.oid &&
+                        dirty_it->first.version >= je->big_write.version &&
+                        (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE)
+                    {
+                        // It is allowed to overwrite a deleted object with a
+                        // version number smaller than deletion version number,
+                        // because the presence of a BIG_WRITE entry means that
+                        // its data and metadata are already flushed.
+                        // We don't know if newer versions are flushed, but
+                        // the previous delete definitely is.
+                        // So we erase the previous dirty entries, but retain the clean one.
+                        // This feature is required for writes happening shortly
+                        // after deletes.
+                        auto dirty_end = dirty_it;
+                        dirty_end++;
+                        while (1)
+                        {
+                            if (dirty_it == bs->dirty_db.begin())
+                            {
+                                break;
+                            }
+                            dirty_it--;
+                            if (dirty_it->first.oid != je->big_write.oid)
+                            {
+                                dirty_it++;
+                                break;
+                            }
+                        }
+                        auto clean_it = bs->clean_db.find(je->big_write.oid);
+                        bs->erase_dirty(
+                            dirty_it, dirty_end,
+                            clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
+                        );
+                        // Remove it from the flusher's queue, too
+                        // Otherwise it may end up referring to a small unstable write after reading the rest of the journal
+                        bs->flusher->remove_flush(je->big_write.oid);
+                    }
+                }
                auto clean_it = bs->clean_db.find(je->big_write.oid);
                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->big_write.version)
@@ -585,6 +629,12 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
 #endif
                    bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
                    bs->journal.used_sectors[proc_pos]++;
+#ifdef BLOCKSTORE_DEBUG
+                    printf(
+                        "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+                        proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
+                    );
+#endif
                    auto & unstab = bs->unstable_writes[ov.oid];
                    unstab = unstab < ov.version ? ov.version : unstab;
                    if (je->type == JE_BIG_WRITE_INSTANT)
--- a/blockstore_journal.cpp
+++ b/blockstore_journal.cpp
@@ -100,10 +100,11 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    {
        // No space in the journal. Wait until used_start changes.
        printf(
-            "Ran out of journal space (free space: %lu bytes)\n",
+            "Ran out of journal space (free space: %lu bytes, sectors to write: %d)\n",
            (bs->journal.next_free >= bs->journal.used_start
                ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
-                : bs->journal.used_start - bs->journal.next_free)
+                : bs->journal.used_start - bs->journal.next_free),
+            sectors_required
        );
        PRIV(op)->wait_for = WAIT_JOURNAL;
        bs->flusher->request_trim();
@@ -183,7 +184,7 @@ journal_t::~journal_t()
    buffer = NULL;
 }

-bool journal_t::trim()
+uint64_t journal_t::get_trim_pos()
 {
    auto journal_used_it = used_sectors.lower_bound(used_start);
 #ifdef BLOCKSTORE_DEBUG
@@ -201,26 +202,19 @@ bool journal_t::trim()
        if (journal_used_it == used_sectors.end())
        {
            // Journal is empty
-            used_start = next_free;
+            return next_free;
        }
        else
        {
-            used_start = journal_used_it->first;
-            // next_free does not need updating here
+            // next_free does not need updating during trim
+            return journal_used_it->first;
        }
    }
    else if (journal_used_it->first > used_start)
    {
        // Journal is cleared up to <journal_used_it>
-        used_start = journal_used_it->first;
+        return journal_used_it->first;
    }
-    else
-    {
-        // Can't trim journal
-        return false;
-    }
-#ifdef BLOCKSTORE_DEBUG
-    printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
-#endif
-    return true;
+    // Can't trim journal
+    return used_start;
 }
--- a/blockstore_journal.h
+++ b/blockstore_journal.h
@@ -10,6 +10,8 @@
 #define JOURNAL_BUFFER_SIZE 4*1024*1024

 // We reserve some extra space for future stabilize requests during writes
+// FIXME: This value should be dynamic, i.e. the blockstore ideally shouldn't allow
+// writing more than can be stabilized afterwards.
 #define JOURNAL_STABILIZE_RESERVATION 65536

 // Journal entries
@@ -167,6 +169,7 @@ struct journal_t

    ~journal_t();
    bool trim();
+    uint64_t get_trim_pos();
 };

 struct blockstore_journal_check_t
--- a/blockstore_rollback.cpp
+++ b/blockstore_rollback.cpp
@@ -9,10 +9,14 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
    {
        return continue_rollback(op);
    }
-    obj_ver_id* v;
+    obj_ver_id *v, *nv;
    int i, todo = op->len;
-    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++)
    {
+        if (nv != v)
+        {
+            *nv = *v;
+        }
        // Check that there are some versions greater than v->version (which may be zero),
        // check that they're unstable, synced, and not currently written to
        auto dirty_it = dirty_db.lower_bound((obj_ver_id){
@@ -21,26 +25,27 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        });
        if (dirty_it == dirty_db.begin())
        {
-            if (v->version == 0)
-            {
-                // Already rolled back
-                // FIXME Skip this object version
-            }
-        bad_op:
-            op->retval = -ENOENT;
-            FINISH_OP(op);
-            return 1;
+skip_ov:
+            // Already rolled back, skip this object version
+            todo--;
+            nv--;
+            continue;
        }
        else
        {
            dirty_it--;
            if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
            {
-                goto bad_op;
+                goto skip_ov;
            }
            while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
            {
-                if (!IS_SYNCED(dirty_it->second.state) ||
+                if (IS_IN_FLIGHT(dirty_it->second.state))
+                {
+                    // Object write is still in progress. Wait until the write request completes
+                    return 0;
+                }
+                else if (!IS_SYNCED(dirty_it->second.state) ||
                    IS_STABLE(dirty_it->second.state))
                {
                    op->retval = -EBUSY;
@@ -55,6 +60,14 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
            }
        }
    }
+    op->len = todo;
+    if (!todo)
+    {
+        // Already rolled back
+        op->retval = 0;
+        FINISH_OP(op);
+        return 1;
+    }
    // Check journal space
    blockstore_journal_check_t space_check(this);
    if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
@@ -89,16 +102,20 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
        journal.crc32_last = je->crc32;
        if (cur_sector != journal.cur_sector)
        {
-            if (cur_sector == -1)
+            // Write previous sector. We should write the sector only after filling it,
+            // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
+            if (cur_sector != -1)
+                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+            else
                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
-            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
+    if (cur_sector != -1)
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
    PRIV(op)->op_state = 1;
-    inflight_writes++;
    return 1;
 }

@@ -139,8 +156,7 @@ resume_5:
    {
        mark_rolled_back(*v);
    }
-    journal.trim();
-    inflight_writes--;
+    flusher->mark_trim_possible();
    // Acknowledge op
    op->retval = 0;
    FINISH_OP(op);
@@ -200,7 +216,6 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
    live = true;
    if (data->res != data->iov.iov_len)
    {
-        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
@@ -219,10 +234,38 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t

 void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
 {
-    auto dirty_it = dirty_end;
-    while (dirty_it != dirty_start)
+    if (dirty_end == dirty_start)
    {
+        return;
+    }
+    auto dirty_it = dirty_end;
+    dirty_it--;
+    if (IS_DELETE(dirty_it->second.state))
+    {
+        object_id oid = dirty_it->first.oid;
+#ifdef BLOCKSTORE_DEBUG
+        printf("Unblock writes-after-delete %lx:%lx v%lx\n", oid.inode, oid.stripe, dirty_it->first.version);
+#endif
+        dirty_it = dirty_end;
+        // Unblock operations blocked by delete flushing
+        uint32_t next_state = BS_ST_IN_FLIGHT;
+        while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
+        {
+            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
+            {
+                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
+                if (IS_BIG_WRITE(dirty_it->second.state))
+                {
+                    next_state = BS_ST_WAIT_BIG;
+                }
+            }
+            dirty_it++;
+        }
+        dirty_it = dirty_end;
        dirty_it--;
+    }
+    while (1)
+    {
        if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
        {
 #ifdef BLOCKSTORE_DEBUG
@@ -241,6 +284,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
        }
+        if (dirty_it == dirty_start)
+        {
+            break;
+        }
+        dirty_it--;
    }
    dirty_db.erase(dirty_start, dirty_end);
 }
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@@ -67,6 +67,11 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
                // Already stable
            }
        }
+        else if (IS_IN_FLIGHT(dirty_it->second.state))
+        {
+            // Object write is still in progress. Wait until the write request completes
+            return 0;
+        }
        else if (!IS_SYNCED(dirty_it->second.state))
        {
            // Object not synced yet. Caller must sync it first
@@ -121,16 +126,20 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        journal.crc32_last = je->crc32;
        if (cur_sector != journal.cur_sector)
        {
-            if (cur_sector == -1)
+            // Write previous sector. We should write the sector only after filling it,
+            // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
+            if (cur_sector != -1)
+                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+            else
                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
-            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
    }
+    if (cur_sector != -1)
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = s;
    PRIV(op)->op_state = 1;
-    inflight_writes++;
    return 1;
 }

@@ -173,7 +182,6 @@ resume_5:
        // Mark all dirty_db entries up to op->version as stable
        mark_stable(*v);
    }
-    inflight_writes--;
    // Acknowledge op
    op->retval = 0;
    FINISH_OP(op);
@@ -205,9 +213,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v)
                break;
            }
        }
-#ifdef BLOCKSTORE_DEBUG
-        printf("enqueue_flush %lx:%lx v%lu\n", v.oid.inode, v.oid.stripe, v.version);
-#endif
        flusher->enqueue_flush(v);
    }
    auto unstab_it = unstable_writes.find(v.oid);
@@ -223,7 +228,6 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
    live = true;
    if (data->res != data->iov.iov_len)
    {
-        inflight_writes--;
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
--- a/blockstore_sync.cpp
+++ b/blockstore_sync.cpp
@@ -107,7 +107,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
        // 2nd step: Data device is synced, prepare & write journal entries
        // Check space in the journal and journal memory buffers
        blockstore_journal_check_t space_check(this);
-        if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), sizeof(journal_entry_big_write), 0))
+        if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
        }
@@ -154,12 +154,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            it++;
            if (cur_sector != journal.cur_sector)
            {
-                if (cur_sector == -1)
+                // Write previous sector. We should write the sector only after filling it,
+                // because otherwise we'll write a lot more sectors in the "no_same_sector_overwrite" mode
+                if (cur_sector != -1)
+                    prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+                else
                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
                cur_sector = journal.cur_sector;
-                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
            }
        }
+        if (cur_sector != -1)
+            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops = s;
        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -7,7 +7,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 {
    // Check or assign version number
    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
-    bool is_inflight_big = false;
+    bool wait_big = false, wait_del = false;
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
@@ -21,7 +21,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
-            is_inflight_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
+            wait_del = ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL);
+            wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
                ? !IS_SYNCED(dirty_it->second.state)
                : ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
        }
@@ -38,23 +39,43 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            deleted = true;
        }
    }
-    if (op->version == 0)
-    {
-        op->version = version;
-    }
-    else if (op->version < version)
-    {
-        // Invalid version requested
-        op->retval = -EEXIST;
-        return false;
-    }
    if (deleted && is_del)
    {
        // Already deleted
        op->retval = 0;
        return false;
    }
-    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
+    PRIV(op)->real_version = 0;
+    if (op->version == 0)
+    {
+        op->version = version;
+    }
+    else if (op->version < version)
+    {
+        // Implicit operations must be added like this: DEL [FLUSH] BIG [SYNC] SMALL SMALL
+        if (deleted || wait_del)
+        {
+            // It's allowed to write versions with low numbers over deletes.
+            // However, we have to flush those deletes first, as we use the version number for ordering.
+#ifdef BLOCKSTORE_DEBUG
+            printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
+#endif
+            wait_del = true;
+            PRIV(op)->real_version = op->version;
+            op->version = version;
+            flusher->unshift_flush((obj_ver_id){
+                .oid = op->oid,
+                .version = version-1,
+            }, true);
+        }
+        else
+        {
+            // Invalid version requested
+            op->retval = -EEXIST;
+            return false;
+        }
+    }
+    if (wait_big && !is_del && !deleted && op->len < block_size &&
        immediate_commit != IMMEDIATE_ALL)
    {
        // Issue an additional sync so that the previous big write can reach the journal
@@ -69,22 +90,31 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
 #ifdef BLOCKSTORE_DEBUG
    if (is_del)
        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
-    else
+    else if (!wait_del)
        printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
-    // No strict need to add it into dirty_db here, it's just left
+    // FIXME No strict need to add it into dirty_db here, it's just left
    // from the previous implementation where reads waited for writes
+    uint32_t state;
+    if (is_del)
+        state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
+    else
+    {
+        state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
+        if (wait_del)
+            state |= BS_ST_WAIT_DEL;
+        else if (state == BS_ST_SMALL_WRITE && wait_big)
+            state |= BS_ST_WAIT_BIG;
+        else
+            state |= BS_ST_IN_FLIGHT;
+        if (op->opcode == BS_OP_WRITE_STABLE)
+            state |= BS_ST_INSTANT;
+    }
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    }, (dirty_entry){
-        .state = (uint32_t)(
-            is_del
-                ? (BS_ST_DELETE | BS_ST_IN_FLIGHT)
-                : (op->opcode == BS_OP_WRITE_STABLE ? BS_ST_INSTANT : 0) | (op->len == block_size || deleted
-                    ? (BS_ST_BIG_WRITE | BS_ST_IN_FLIGHT)
-                    : (is_inflight_big ? (BS_ST_SMALL_WRITE | BS_ST_WAIT_BIG) : (BS_ST_SMALL_WRITE | BS_ST_IN_FLIGHT)))
-        ),
+        .state = state,
        .flags = 0,
        .location = 0,
        .offset = is_del ? 0 : op->offset,
@@ -106,12 +136,38 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        .version = op->version,
    });
    assert(dirty_it != dirty_db.end());
-    if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
+    if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) < BS_ST_IN_FLIGHT)
    {
        // Don't dequeue
        return 0;
    }
-    else if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+    if (PRIV(op)->real_version != 0)
+    {
+        // Restore original low version number for unblocked operations
+#ifdef BLOCKSTORE_DEBUG
+        printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
+#endif
+        auto prev_it = dirty_it;
+        prev_it--;
+        if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
+        {
+            // Original version is still invalid
+            // FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
+            dirty_db.erase(dirty_it);
+            op->retval = -EEXIST;
+            FINISH_OP(op);
+            return 1;
+        }
+        op->version = PRIV(op)->real_version;
+        PRIV(op)->real_version = 0;
+        dirty_entry e = dirty_it->second;
+        dirty_db.erase(dirty_it);
+        dirty_it = dirty_db.emplace((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        }, e).first;
+    }
+    if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
@@ -129,6 +185,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                PRIV(op)->wait_for = WAIT_FREE;
                return 0;
            }
+            // FIXME Oops. Successive small writes will currently break in an unexpected way. Fix it
+            dirty_db.erase(dirty_it);
            op->retval = -ENOSPC;
            FINISH_OP(op);
            return 1;
@@ -289,7 +347,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            PRIV(op)->op_state = 3;
        }
    }
-    inflight_writes++;
    return 1;
 }

@@ -345,7 +402,7 @@ resume_2:
 resume_4:
    // Switch object state
 #ifdef BLOCKSTORE_DEBUG
-    printf("Ack write %lx:%lx v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+    printf("Ack write %lx:%lx v%lu = state %x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
 #endif
    bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
        ? (immediate_commit == IMMEDIATE_ALL)
@@ -374,7 +431,6 @@ resume_4:
            dirty_it++;
        }
    }
-    inflight_writes--;
    // Acknowledge write
    op->retval = op->len;
    FINISH_OP(op);
@@ -386,7 +442,6 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
    live = true;
    if (data->res != data->iov.iov_len)
    {
-        inflight_writes--;
        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
@@ -445,7 +500,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    });
    assert(dirty_it != dirty_db.end());
    blockstore_journal_check_t space_check(this);
-    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
+    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_STABILIZE_RESERVATION))
    {
        return 0;
    }
@@ -495,7 +550,10 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops++;
-        // Remember small write as unsynced
+    }
+    else
+    {
+        // Remember delete as unsynced
        unsynced_small_writes.push_back((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

+#include <stdexcept>
 #include "cluster_client.h"

 cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
@@ -8,9 +9,12 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
    this->ringloop = ringloop;
    this->tfd = tfd;

+    log_level = config["log_level"].int64_value();
+
    msgr.osd_num = 0;
    msgr.tfd = tfd;
    msgr.ringloop = ringloop;
+    msgr.log_level = log_level;
    msgr.repeer_pgs = [this](osd_num_t peer_osd)
    {
        if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
@@ -72,7 +76,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
    st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); };
    st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };

-    log_level = config["log_level"].int64_value();
    st_cli.parse_config(config);
    st_cli.load_global_config();

@@ -192,7 +195,6 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
    {
        msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
    }
-    st_cli.start_etcd_watcher();
    st_cli.load_pgs();
 }

@@ -202,6 +204,12 @@ void cluster_client_t::on_load_pgs_hook(bool success)
    {
        pg_counts[pool_item.first] = pool_item.second.real_pg_count;
    }
+    pgs_loaded = true;
+    for (auto fn: on_ready_hooks)
+    {
+        fn();
+    }
+    on_ready_hooks.clear();
    for (auto op: offline_ops)
    {
        execute(op);
@@ -253,6 +261,18 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
    }
 }

+void cluster_client_t::on_ready(std::function<void(void)> fn)
+{
+    if (!pgs_loaded)
+    {
+        on_ready_hooks.push_back(fn); // deferred: on_load_pgs_hook() runs the queued hooks and clears the queue
+    }
+    else
+    {
+        fn(); // PGs are already loaded, call back immediately
+    }
+}
+
 /**
 * How writes are synced when immediate_commit is false
 *
@@ -283,7 +303,7 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)

 void cluster_client_t::execute(cluster_op_t *op)
 {
-    if (!bs_disk_alignment)
+    if (!pgs_loaded)
    {
        // We're offline
        offline_ops.push_back(op);
@@ -468,7 +488,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
        uint64_t begin = (op->offset < stripe ? stripe : op->offset);
        uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
            ? (stripe + pg_block_size) : (op->offset + op->len);
-        op->parts[i] = {
+        op->parts[i] = (cluster_op_part_t){
            .parent = op,
            .offset = begin,
            .len = (uint32_t)(end - begin),
@@ -513,7 +533,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
            part->osd_num = primary_osd;
            part->sent = true;
            op->sent_count++;
-            part->op = {
+            part->op = (osd_op_t){
                .op_type = OSD_OP_OUT,
                .peer_fd = peer_fd,
                .req = { .rw = {
@@ -674,7 +694,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
    assert(peer_it != msgr.osd_peer_fds.end());
    part->sent = true;
    op->sent_count++;
-    part->op = {
+    part->op = (osd_op_t){
        .op_type = OSD_OP_OUT,
        .peer_fd = peer_it->second,
        .req = {
@@ -737,7 +757,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
            assert(op == cur_sync);
            finish_sync();
        }
-        else
+        else if (!op->up_wait)
        {
            continue_rw(op);
        }
--- a/cluster_client.h
+++ b/cluster_client.h
@@ -8,7 +8,6 @@

 #define MIN_BLOCK_SIZE 4*1024
 #define MAX_BLOCK_SIZE 128*1024*1024
-#define DEFAULT_BLOCK_SIZE 128*1024
 #define DEFAULT_DISK_ALIGNMENT 4096
 #define DEFAULT_BITMAP_GRANULARITY 4096
 #define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024
@@ -64,8 +63,6 @@ class cluster_client_t
    int up_wait_retry_interval = 500; // ms

    uint64_t op_id = 1;
-    etcd_state_client_t st_cli;
-    osd_messenger_t msgr;
    ring_consumer_t consumer;
    // operations currently in progress
    std::set<cluster_op_t*> cur_ops;
@@ -79,10 +76,17 @@ class cluster_client_t
    std::vector<cluster_op_t*> offline_ops;
    uint64_t queued_bytes = 0;

+    bool pgs_loaded = false;
+    std::vector<std::function<void(void)>> on_ready_hooks;
+
 public:
+    etcd_state_client_t st_cli;
+    osd_messenger_t msgr;
+
    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
    ~cluster_client_t();
    void execute(cluster_op_t *op);
+    void on_ready(std::function<void(void)> fn);
    void stop();

 protected:
--- a/copy-fio-includes.sh
+++ b/copy-fio-includes.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+gcc -E -o fio_headers.i fio_headers.h
+
+rm -rf fio-copy
+for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do
+    j=${i##fio/}
+    p=$(dirname $j)
+    mkdir -p fio-copy/$p
+    cp $i fio-copy/$j
+done
+
+rm fio_headers.i
--- a/copy-qemu-includes.sh
+++ b/copy-qemu-includes.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#cd qemu
+#debian/rules b/configure-stamp
+#cd b/qemu; make qapi
+
+gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
+    -I qemu/include -E -o qemu_driver.i qemu_driver.c
+
+rm -rf qemu-copy
+for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
+    j=${i##qemu/}
+    p=$(dirname $j)
+    mkdir -p qemu-copy/$p
+    cp $i qemu-copy/$j
+done
+
+rm qemu_driver.i
--- a/debian/changelog
+++ b/debian/changelog
@@ -0,0 +1,11 @@
+vitastor (0.5.1-1) unstable; urgency=medium
+
+  * Add jerasure support
+
+ -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sat, 05 Dec 2020 17:02:26 +0300
+
+vitastor (0.5-1) unstable; urgency=medium
+
+  * First packaging for Debian
+
+ -- Vitaliy Filippov <vitalif@yourcmc.ru>  Thu, 05 Nov 2020 02:20:59 +0300
--- a/debian/compat
+++ b/debian/compat
@@ -0,0 +1 @@
+13
--- a/debian/control
+++ b/debian/control
@@ -0,0 +1,17 @@
+Source: vitastor
+Section: admin
+Priority: optional
+Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev
+Standards-Version: 4.5.0
+Homepage: https://vitastor.io/
+Rules-Requires-Root: no
+
+Package: vitastor
+Architecture: amd64
+Depends: ${shlibs:Depends}, ${misc:Depends}, fio (= ${dep:fio}), qemu (= ${dep:qemu}), nodejs (>= 10), node-sprintf-js, node-ws (>= 7), libjerasure2
+Description: Vitastor, a fast software-defined clustered block storage
+ Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
+ architecturally similar to Ceph which means strong consistency, primary-replication,
+ symmetric clustering and automatic data distribution over any number of drives of any
+ size with configurable redundancy (replication or erasure codes/XOR).
--- a/debian/copyright
+++ b/debian/copyright
@@ -0,0 +1,20 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: vitastor
+Upstream-Contact: Vitaliy Filippov <vitalif@yourcmc.ru>
+Source: https://vitastor.io
+
+Files: *
+Copyright: 2019+ Vitaliy Filippov <vitalif@yourcmc.ru>
+License: Multiple licenses VNPL-1.0 and/or GPL-2.0+
+ All server-side code (OSD, Monitor and so on) is licensed under the terms of
+ Vitastor Network Public License 1.0 (VNPL 1.0), a copyleft license based on
+ GNU GPLv3.0 with the additional "Network Interaction" clause which requires
+ opensourcing all programs directly or indirectly interacting with Vitastor
+ through a computer network ("Proxy Programs"). Proxy Programs may be made public
+ not only under the terms of the same license, but also under the terms of any
+ GPL-Compatible Free Software License, as listed by the Free Software Foundation.
+ This is a stricter copyleft license than the Affero GPL.
+ .
+ Client libraries (cluster_client and so on) are dual-licensed under the same
+ VNPL 1.0 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
+ software like QEMU and fio.
--- a/debian/install
+++ b/debian/install
@@ -0,0 +1,3 @@
+VNPL-1.0.txt usr/share/doc/vitastor
+GPL-2.0.txt usr/share/doc/vitastor
+mon usr/lib/vitastor
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@@ -0,0 +1,49 @@
+# Build patched QEMU for Debian Buster or Bullseye/Sid inside a container
+# cd ..; podman build --build-arg REL=bullseye -v `pwd`/build:/root/build -f debian/patched-qemu.Dockerfile .
+
+ARG REL=bullseye
+
+FROM debian:$REL
+
+# An ARG declared before FROM is out of scope after it; declare REL again so the RUN steps below can use $REL
+ARG REL=bullseye
+
+WORKDIR /root
+
+RUN if [ "$REL" = "buster" ]; then \
+        echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
+        echo >> /etc/apt/preferences; \
+        echo 'Package: *' >> /etc/apt/preferences; \
+        echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
+        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
+    fi; \
+    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
+    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
+    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
+
+RUN apt-get update
+RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
+RUN apt-get -y build-dep qemu
+RUN apt-get -y build-dep fio
+RUN apt-get --download-only source qemu
+RUN apt-get --download-only source fio
+
+ADD qemu-5.0-vitastor.patch qemu-5.1-vitastor.patch /root/vitastor/
+RUN set -e; \
+    mkdir -p /root/build/qemu-$REL; \
+    rm -rf /root/build/qemu-$REL/*; \
+    cd /root/build/qemu-$REL; \
+    dpkg-source -x /root/qemu*.dsc; \
+    if [ -d /root/build/qemu-$REL/qemu-5.0 ]; then \
+        cp /root/vitastor/qemu-5.0-vitastor.patch /root/build/qemu-$REL/qemu-5.0/debian/patches; \
+        echo qemu-5.0-vitastor.patch >> /root/build/qemu-$REL/qemu-5.0/debian/patches/series; \
+    else \
+        cp /root/vitastor/qemu-5.1-vitastor.patch /root/build/qemu-$REL/qemu-*/debian/patches; \
+        P=`ls -d /root/build/qemu-$REL/qemu-*/debian/patches`; \
+        echo qemu-5.1-vitastor.patch >> $P/series; \
+    fi; \
+    cd /root/build/qemu-$REL/qemu-*/; \
+    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
+    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
+    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
+    rm -rf /root/build/qemu-$REL/qemu-*/
--- a/debian/rules
+++ b/debian/rules
@@ -0,0 +1,15 @@
+#!/usr/bin/make -f
+# Debian packaging rules for vitastor, driven by the debhelper "dh" sequencer.
+
+# Make debhelper print the commands it runs
+export DH_VERBOSE = 1
+
+# Catch-all: hand every target over to dh
+%:
+	dh $@
+
+# Append the contents of debian/substvars (the dep:fio / dep:qemu versions) to the
+# substvars file of the vitastor package, then run the regular dh_installdeb.
+override_dh_installdeb:
+	cat debian/substvars >> debian/vitastor.substvars
+	dh_installdeb
--- a/debian/source/format
+++ b/debian/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
--- a/debian/substvars
+++ b/debian/substvars
@@ -0,0 +1,2 @@
+dep:fio=3.16-1
+dep:qemu=1:5.1+dfsg-4+vitastor1
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -0,0 +1,72 @@
+# Build Vitastor packages for Debian Buster or Bullseye/Sid inside a container
+# cd ..; podman build --build-arg REL=bullseye -v `pwd`/build:/root/build -f debian/vitastor.Dockerfile .
+
+ARG REL=bullseye
+
+FROM debian:$REL
+
+# An ARG declared before FROM is out of scope after it; declare REL again so the RUN steps below can use $REL
+ARG REL=bullseye
+
+WORKDIR /root
+
+RUN if [ "$REL" = "buster" ]; then \
+        echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
+        echo >> /etc/apt/preferences; \
+        echo 'Package: *' >> /etc/apt/preferences; \
+        echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
+        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
+    fi; \
+    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
+    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
+    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
+
+RUN apt-get update
+RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
+RUN apt-get -y build-dep qemu
+RUN apt-get -y build-dep fio
+RUN apt-get --download-only source qemu
+RUN apt-get --download-only source fio
+RUN apt-get -y install libjerasure-dev
+
+ADD . /root/vitastor
+RUN set -e -x; \
+    mkdir -p /root/fio-build/; \
+    cd /root/fio-build/; \
+    rm -rf /root/fio-build/*; \
+    dpkg-source -x /root/fio*.dsc; \
+    cd /root/build/qemu-$REL/; \
+    rm -rf qemu*/; \
+    dpkg-source -x qemu*.dsc; \
+    cd /root/build/qemu-$REL/qemu*/; \
+    debian/rules b/configure-stamp; \
+    cd b/qemu; \
+    make -j8 qapi; \
+    mkdir -p /root/build/vitastor-$REL; \
+    rm -rf /root/build/vitastor-$REL/*; \
+    cd /root/build/vitastor-$REL; \
+    cp -r /root/vitastor vitastor-0.5.1; \
+    ln -s /root/build/qemu-$REL/qemu-*/ vitastor-0.5.1/qemu; \
+    ln -s /root/fio-build/fio-*/ vitastor-0.5.1/fio; \
+    cd vitastor-0.5.1; \
+    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
+    QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
+    sh copy-qemu-includes.sh; \
+    sh copy-fio-includes.sh; \
+    rm qemu fio; \
+    mkdir -p a b debian/patches; \
+    mv qemu-copy b/qemu; \
+    mv fio-copy b/fio; \
+    diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
+    echo qemu-fio-headers.patch >> debian/patches/series; \
+    rm -rf a b; \
+    rm -rf /root/build/qemu-$REL/qemu*/; \
+    echo "dep:fio=$FIO" > debian/substvars; \
+    echo "dep:qemu=$QEMU" >> debian/substvars; \
+    cd /root/build/vitastor-$REL; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.5.1.orig.tar.xz vitastor-0.5.1; \
+    cd vitastor-0.5.1; \
+    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
+    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
+    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
+    rm -rf /root/build/vitastor-$REL/vitastor-*/
--- a/epoll_manager.cpp
+++ b/epoll_manager.cpp
@@ -4,6 +4,7 @@
 #include <sys/epoll.h>
 #include <sys/poll.h>
 #include <unistd.h>
+#include <stdexcept>

 #include "epoll_manager.h"

--- a/etcd_state_client.cpp
+++ b/etcd_state_client.cpp
@@ -136,7 +136,7 @@ void etcd_state_client_t::start_etcd_watcher()
                }
                for (auto & kv: changes)
                {
-                    if (this->log_level > 0)
+                    if (this->log_level > 3)
                    {
                        printf("Incoming event: %s -> %s\n", kv.first.c_str(), kv.second.dump().c_str());
                    }
@@ -173,6 +173,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/config0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_CONFIG_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -181,6 +182,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_OSD_STATE_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -189,6 +191,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_PG_STATE_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
    etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
@@ -197,6 +200,7 @@ void etcd_state_client_t::start_etcd_watcher()
            { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
            { "start_revision", etcd_watch_revision+1 },
            { "watch_id", ETCD_PG_HISTORY_WATCH_ID },
+            { "progress_notify", true },
        } }
    }).dump());
 }
@@ -216,10 +220,6 @@ void etcd_state_client_t::load_global_config()
            });
            return;
        }
-        if (!etcd_watch_revision)
-        {
-            etcd_watch_revision = data["header"]["revision"].uint64_value();
-        }
        json11::Json::object global_config;
        if (data["kvs"].array_items().size() > 0)
        {
@@ -229,6 +229,11 @@ void etcd_state_client_t::load_global_config()
                global_config = kv.value.object_items();
            }
        }
+        bs_block_size = global_config["block_size"].uint64_value();
+        if (!bs_block_size)
+        {
+            bs_block_size = DEFAULT_BLOCK_SIZE;
+        }
        on_load_config_hook(global_config);
    });
 }
@@ -287,6 +292,10 @@ void etcd_state_client_t::load_pgs()
            on_load_pgs_hook(false);
            return;
        }
+        if (!etcd_watch_revision)
+        {
+            etcd_watch_revision = data["header"]["revision"].uint64_value();
+        }
        for (auto & res: data["responses"].array_items())
        {
            for (auto & kv_json: res["response_range"]["kvs"].array_items())
@@ -296,6 +305,7 @@ void etcd_state_client_t::load_pgs()
            }
        }
        on_load_pgs_hook(true);
+        start_etcd_watcher();
    });
 }

@@ -309,65 +319,99 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
        }
        for (auto & pool_item: value.object_items())
        {
+            pool_config_t pc;
+            // ID
            pool_id_t pool_id = stoull_full(pool_item.first);
            if (!pool_id || pool_id >= POOL_ID_MAX)
            {
                printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
                continue;
            }
-            if (pool_item.second["pg_size"].uint64_value() < 1 ||
-                pool_item.second["scheme"] == "xor" && pool_item.second["pg_size"].uint64_value() < 3)
-            {
-                printf("Pool %u has invalid pg_size, skipping pool\n", pool_id);
-                continue;
-            }
-            if (pool_item.second["pg_minsize"].uint64_value() < 1 ||
-                pool_item.second["pg_minsize"].uint64_value() > pool_item.second["pg_size"].uint64_value() ||
-                pool_item.second["pg_minsize"].uint64_value() < (pool_item.second["pg_size"].uint64_value() - 1))
-            {
-                printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
-                continue;
-            }
-            if (pool_item.second["pg_count"].uint64_value() < 1)
-            {
-                printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
-                continue;
-            }
-            if (pool_item.second["name"].string_value() == "")
+            pc.id = pool_id;
+            // Pool Name
+            pc.name = pool_item.second["name"].string_value();
+            if (pc.name == "")
            {
                printf("Pool %u has empty name, skipping pool\n", pool_id);
                continue;
            }
-            if (pool_item.second["scheme"] != "replicated" && pool_item.second["scheme"] != "xor")
+            // Failure Domain
+            pc.failure_domain = pool_item.second["failure_domain"].string_value();
+            // Coding Scheme
+            if (pool_item.second["scheme"] == "replicated")
+                pc.scheme = POOL_SCHEME_REPLICATED;
+            else if (pool_item.second["scheme"] == "xor")
+                pc.scheme = POOL_SCHEME_XOR;
+            else if (pool_item.second["scheme"] == "jerasure")
+                pc.scheme = POOL_SCHEME_JERASURE;
+            else
            {
-                printf("Pool %u has invalid coding scheme (only \"xor\" and \"replicated\" are allowed), skipping pool\n", pool_id);
+                printf("Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id);
                continue;
            }
-            if (pool_item.second["max_osd_combinations"].uint64_value() > 0 &&
-                pool_item.second["max_osd_combinations"].uint64_value() < 100)
+            // PG Size
+            pc.pg_size = pool_item.second["pg_size"].uint64_value();
+            if (pc.pg_size < 1 ||
+                pool_item.second["pg_size"].uint64_value() < 3 &&
+                (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) ||
+                pool_item.second["pg_size"].uint64_value() > 256)
+            {
+                printf("Pool %u has invalid pg_size, skipping pool\n", pool_id);
+                continue;
+            }
+            // Parity Chunks
+            pc.parity_chunks = pool_item.second["parity_chunks"].uint64_value();
+            if (pc.scheme == POOL_SCHEME_XOR)
+            {
+                if (pc.parity_chunks > 1)
+                {
+                    printf("Pool %u has invalid parity_chunks (must be 1), skipping pool\n", pool_id);
+                    continue;
+                }
+                pc.parity_chunks = 1;
+            }
+            if (pc.scheme == POOL_SCHEME_JERASURE &&
+                (pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
+            {
+                printf("Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
+                continue;
+            }
+            // PG MinSize
+            pc.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
+            if (pc.pg_minsize < 1 || pc.pg_minsize > pc.pg_size ||
+                (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) &&
+                pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
+            {
+                printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
+                continue;
+            }
+            // PG Count
+            pc.pg_count = pool_item.second["pg_count"].uint64_value();
+            if (pc.pg_count < 1)
+            {
+                printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
+                continue;
+            }
+            // Max OSD Combinations
+            pc.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
+            if (!pc.max_osd_combinations)
+                pc.max_osd_combinations = 10000;
+            if (pc.max_osd_combinations > 0 && pc.max_osd_combinations < 100)
            {
                printf("Pool %u has invalid max_osd_combinations (must be at least 100), skipping pool\n", pool_id);
                continue;
            }
+            // PG Stripe Size
+            pc.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
+            uint64_t min_stripe_size = bs_block_size * (pc.scheme == POOL_SCHEME_REPLICATED ? 1 : (pc.pg_size-pc.parity_chunks));
+            if (pc.pg_stripe_size < min_stripe_size)
+                pc.pg_stripe_size = min_stripe_size;
+            // Save
+            pc.real_pg_count = this->pool_config[pool_id].real_pg_count;
+            std::swap(pc.pg_config, this->pool_config[pool_id].pg_config);
+            std::swap(this->pool_config[pool_id], pc);
            auto & parsed_cfg = this->pool_config[pool_id];
            parsed_cfg.exists = true;
-            parsed_cfg.id = pool_id;
-            parsed_cfg.name = pool_item.second["name"].string_value();
-            parsed_cfg.scheme = pool_item.second["scheme"] == "replicated" ? POOL_SCHEME_REPLICATED : POOL_SCHEME_XOR;
-            parsed_cfg.pg_size = pool_item.second["pg_size"].uint64_value();
-            parsed_cfg.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
-            parsed_cfg.pg_count = pool_item.second["pg_count"].uint64_value();
-            parsed_cfg.failure_domain = pool_item.second["failure_domain"].string_value();
-            parsed_cfg.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
-            if (!parsed_cfg.pg_stripe_size)
-            {
-                parsed_cfg.pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
-            }
-            parsed_cfg.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
-            if (!parsed_cfg.max_osd_combinations)
-            {
-                parsed_cfg.max_osd_combinations = 10000;
-            }
            for (auto & pg_item: parsed_cfg.pg_config)
            {
                if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
--- a/etcd_state_client.h
+++ b/etcd_state_client.h
@@ -16,7 +16,7 @@
 #define ETCD_SLOW_TIMEOUT 5000
 #define ETCD_QUICK_TIMEOUT 1000

-#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024
+#define DEFAULT_BLOCK_SIZE (128*1024) // fallback when the etcd global config has no block_size; parenthesized so it is safe inside larger expressions

 struct json_kv_t
 {
@@ -43,7 +43,7 @@ struct pool_config_t
    pool_id_t id;
    std::string name;
    uint64_t scheme;
-    uint64_t pg_size, pg_minsize;
+    uint64_t pg_size, pg_minsize, parity_chunks;
    uint64_t pg_count;
    uint64_t real_pg_count;
    std::string failure_domain;
@@ -62,6 +62,7 @@ struct etcd_state_client_t
    int etcd_watches_initialised = 0;
    uint64_t etcd_watch_revision = 0;
    websocket_t *etcd_watch_ws = NULL;
+    uint64_t bs_block_size = 0;
    std::map<pool_id_t, pool_config_t> pool_config;
    std::map<osd_num_t, json11::Json> peer_states;

--- a/fio_cluster.cpp
+++ b/fio_cluster.cpp
@@ -28,12 +28,7 @@

 #include "epoll_manager.h"
 #include "cluster_client.h"
-extern "C" {
-#define CONFIG_HAVE_GETTID
-#define CONFIG_PWRITEV2
-#include "fio/fio.h"
-#include "fio/optgroup.h"
-}
+#include "fio_headers.h"

 struct sec_data
 {
--- a/fio_engine.cpp
+++ b/fio_engine.cpp
@@ -25,12 +25,7 @@
 //     -bs_config='{"data_device":"./test_data.bin"}' -size=1000M

 #include "blockstore.h"
-extern "C" {
-#define CONFIG_HAVE_GETTID
-#define CONFIG_PWRITEV2
-#include "fio/fio.h"
-#include "fio/optgroup.h"
-}
+#include "fio_headers.h"

 #include "json11/json11.hpp"

--- a/fio_headers.h
+++ b/fio_headers.h
@@ -0,0 +1,16 @@
+extern "C" {
+// Kill atomics in fio headers
+#define _STDATOMIC_H
+#include "fio/arch/arch.h"
+
+#undef atomic_load_acquire
+#undef atomic_store_release
+#define atomic_load_acquire(p) *(p)
+#define atomic_store_release(p, v) (*(p)) = (v)
+
+#define CONFIG_HAVE_GETTID
+#define CONFIG_SYNC_FILE_RANGE
+#define CONFIG_PWRITEV2
+#include "fio/fio.h"
+#include "fio/optgroup.h"
+}
--- a/fio_sec_osd.cpp
+++ b/fio_sec_osd.cpp
@@ -30,12 +30,7 @@

 #include "rw_blocking.h"
 #include "osd_ops.h"
-extern "C" {
-#define CONFIG_HAVE_GETTID
-#define CONFIG_PWRITEV2
-#include "fio/fio.h"
-#include "fio/optgroup.h"
-}
+#include "fio_headers.h"

 struct sec_data
 {
--- a/http_client.cpp
+++ b/http_client.cpp
@@ -13,6 +13,8 @@
 #include <fcntl.h>
 #include <string.h>

+#include <stdexcept>
+
 #include "json11/json11.hpp"
 #include "http_client.h"
 #include "timerfd_manager.h"
--- a/malloc_or_die.h
+++ b/malloc_or_die.h
@@ -4,6 +4,7 @@
 #pragma once

 #include <malloc.h>
+#include <stdlib.h>

 inline void* memalign_or_die(size_t alignment, size_t size)
 {
--- a/messenger.cpp
+++ b/messenger.cpp
@@ -6,6 +6,7 @@
 #include <sys/socket.h>
 #include <sys/epoll.h>
 #include <netinet/tcp.h>
+#include <stdexcept>

 #include "messenger.h"

@@ -80,6 +81,7 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)

 void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
 {
+    assert(peer_osd != this->osd_num);
    struct sockaddr_in addr;
    int r;
    if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
@@ -96,17 +98,6 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
        return;
    }
    fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-    int timeout_id = -1;
-    if (peer_connect_timeout > 0)
-    {
-        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
-        {
-            osd_num_t peer_osd = clients[peer_fd].osd_num;
-            stop_client(peer_fd);
-            on_connect_peer(peer_osd, -EIO);
-            return;
-        });
-    }
    r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
    if (r < 0 && errno != EINPROGRESS)
    {
@@ -114,8 +105,18 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
        on_connect_peer(peer_osd, -errno);
        return;
    }
-    assert(peer_osd != this->osd_num);
-    clients[peer_fd] = (osd_client_t){
+    int timeout_id = -1;
+    if (peer_connect_timeout > 0)
+    {
+        timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, peer_fd](int timer_id)
+        {
+            osd_num_t peer_osd = clients.at(peer_fd)->osd_num;
+            stop_client(peer_fd);
+            on_connect_peer(peer_osd, -EIO);
+            return;
+        });
+    }
+    clients[peer_fd] = new osd_client_t((osd_client_t){
        .peer_addr = addr,
        .peer_port = peer_port,
        .peer_fd = peer_fd,
@@ -123,7 +124,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
        .connect_timeout_id = timeout_id,
        .osd_num = peer_osd,
        .in_buf = malloc_or_die(receive_buffer_size),
-    };
+    });
    tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
    {
        // Either OUT (connected) or HUP
@@ -133,13 +134,13 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer

 void osd_messenger_t::handle_connect_epoll(int peer_fd)
 {
-    auto & cl = clients[peer_fd];
-    if (cl.connect_timeout_id >= 0)
+    auto cl = clients[peer_fd];
+    if (cl->connect_timeout_id >= 0)
    {
-        tfd->clear_timer(cl.connect_timeout_id);
-        cl.connect_timeout_id = -1;
+        tfd->clear_timer(cl->connect_timeout_id);
+        cl->connect_timeout_id = -1;
    }
-    osd_num_t peer_osd = cl.osd_num;
+    osd_num_t peer_osd = cl->osd_num;
    int result = 0;
    socklen_t result_len = sizeof(result);
    if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
@@ -154,7 +155,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
    }
    int one = 1;
    setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-    cl.peer_state = PEER_CONNECTED;
+    cl->peer_state = PEER_CONNECTED;
    tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
    {
        handle_peer_epoll(peer_fd, epoll_events);
@@ -175,11 +176,11 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
    else if (epoll_events & EPOLLIN)
    {
        // Mark client as ready (i.e. some data is available)
-        auto & cl = clients[peer_fd];
-        cl.read_ready++;
-        if (cl.read_ready == 1)
+        auto cl = clients[peer_fd];
+        cl->read_ready++;
+        if (cl->read_ready == 1)
        {
-            read_ready_clients.push_back(cl.peer_fd);
+            read_ready_clients.push_back(cl->peer_fd);
            if (ringloop)
                ringloop->wakeup();
            else
@@ -219,17 +220,20 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
        }
        return;
    }
-    printf("Connected with peer OSD %lu (fd %d)\n", peer_osd, peer_fd);
+    if (log_level > 0)
+    {
+        printf("[OSD %lu] Connected with peer OSD %lu (client %d)\n", osd_num, peer_osd, peer_fd);
+    }
    wanted_peers.erase(peer_osd);
    repeer_pgs(peer_osd);
 }

-void osd_messenger_t::check_peer_config(osd_client_t & cl)
+void osd_messenger_t::check_peer_config(osd_client_t *cl)
 {
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
-    op->peer_fd = cl.peer_fd;
-    op->req = {
+    op->peer_fd = cl->peer_fd;
+    op->req = (osd_any_op_t){
        .show_conf = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
@@ -238,16 +242,15 @@ void osd_messenger_t::check_peer_config(osd_client_t & cl)
            },
        },
    };
-    op->callback = [this](osd_op_t *op)
+    op->callback = [this, cl](osd_op_t *op)
    {
-        osd_client_t & cl = clients[op->peer_fd];
        std::string json_err;
        json11::Json config;
        bool err = false;
        if (op->reply.hdr.retval < 0)
        {
            err = true;
-            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl.osd_num, op->reply.hdr.retval);
+            printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl->osd_num, op->reply.hdr.retval);
        }
        else
        {
@@ -255,46 +258,37 @@ void osd_messenger_t::check_peer_config(osd_client_t & cl)
            if (json_err != "")
            {
                err = true;
-                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl.osd_num, json_err.c_str());
+                printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl->osd_num, json_err.c_str());
            }
-            else if (config["osd_num"].uint64_value() != cl.osd_num)
+            else if (config["osd_num"].uint64_value() != cl->osd_num)
            {
                err = true;
-                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl.osd_num);
+                printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl->osd_num);
            }
        }
        if (err)
        {
-            osd_num_t osd_num = cl.osd_num;
+            osd_num_t osd_num = cl->osd_num;
            stop_client(op->peer_fd);
            on_connect_peer(osd_num, -1);
            delete op;
            return;
        }
-        osd_peer_fds[cl.osd_num] = cl.peer_fd;
-        on_connect_peer(cl.osd_num, cl.peer_fd);
+        osd_peer_fds[cl->osd_num] = cl->peer_fd;
+        on_connect_peer(cl->osd_num, cl->peer_fd);
        delete op;
    };
    outbox_push(op);
 }

-void osd_messenger_t::cancel_osd_ops(osd_client_t & cl)
+void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
 {
-    for (auto p: cl.sent_ops)
+    for (auto p: cl->sent_ops)
    {
        cancel_op(p.second);
    }
-    cl.sent_ops.clear();
-    for (auto op: cl.outbox)
-    {
-        cancel_op(op);
-    }
-    cl.outbox.clear();
-    if (cl.write_op)
-    {
-        cancel_op(cl.write_op);
-        cl.write_op = NULL;
-    }
+    cl->sent_ops.clear();
+    cl->outbox.clear();
 }

 void osd_messenger_t::cancel_op(osd_op_t *op)
@@ -324,32 +318,38 @@ void osd_messenger_t::stop_client(int peer_fd)
        return;
    }
    uint64_t repeer_osd = 0;
-    osd_client_t cl = it->second;
-    if (cl.peer_state == PEER_CONNECTED)
+    osd_client_t *cl = it->second;
+    if (cl->peer_state == PEER_CONNECTED)
    {
-        if (cl.osd_num)
+        if (cl->osd_num)
        {
            // Reload configuration from etcd when the connection is dropped
-            printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
-            repeer_osd = cl.osd_num;
+            if (log_level > 0)
+                printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
+            repeer_osd = cl->osd_num;
        }
        else
        {
-            printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
+            if (log_level > 0)
+                printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
        }
    }
+    cl->peer_state = PEER_STOPPED;
    clients.erase(it);
    tfd->set_fd_handler(peer_fd, false, NULL);
-    if (cl.osd_num)
+    if (cl->connect_timeout_id >= 0)
    {
-        osd_peer_fds.erase(cl.osd_num);
-        // Cancel outbound operations
-        cancel_osd_ops(cl);
+        tfd->clear_timer(cl->connect_timeout_id);
+        cl->connect_timeout_id = -1;
    }
-    if (cl.read_op)
+    if (cl->osd_num)
    {
-        delete cl.read_op;
-        cl.read_op = NULL;
+        osd_peer_fds.erase(cl->osd_num);
+    }
+    if (cl->read_op)
+    {
+        delete cl->read_op;
+        cl->read_op = NULL;
    }
    for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
    {
@@ -367,12 +367,24 @@ void osd_messenger_t::stop_client(int peer_fd)
            break;
        }
    }
-    free(cl.in_buf);
+    free(cl->in_buf);
+    cl->in_buf = NULL;
    close(peer_fd);
    if (repeer_osd)
    {
+        // First repeer PGs as canceling OSD ops may push new operations
+        // and we need correct PG states when we do that
        repeer_pgs(repeer_osd);
    }
+    if (cl->osd_num)
+    {
+        // Cancel outbound operations
+        cancel_osd_ops(cl);
+    }
+    if (cl->refs <= 0)
+    {
+        delete cl;
+    }
 }

 void osd_messenger_t::accept_connections(int listen_fd)
@@ -390,13 +402,13 @@ void osd_messenger_t::accept_connections(int listen_fd)
        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
        int one = 1;
        setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        clients[peer_fd] = {
+        clients[peer_fd] = new osd_client_t((osd_client_t){
            .peer_addr = addr,
            .peer_port = ntohs(addr.sin_port),
            .peer_fd = peer_fd,
            .peer_state = PEER_CONNECTED,
            .in_buf = malloc_or_die(receive_buffer_size),
-        };
+        });
        // Add FD to epoll
        tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
        {
--- a/messenger.h
+++ b/messenger.h
@@ -30,6 +30,7 @@

 #define PEER_CONNECTING 1
 #define PEER_CONNECTED 2
+#define PEER_STOPPED 3

 #define DEFAULT_PEER_CONNECT_INTERVAL 5
 #define DEFAULT_PEER_CONNECT_TIMEOUT 5
@@ -190,6 +191,8 @@ struct osd_op_t

 struct osd_client_t
 {
+    int refs = 0;
+
    sockaddr_in peer_addr;
    int peer_port;
    int peer_fd;
@@ -202,8 +205,8 @@ struct osd_client_t
    // Read state
    int read_ready = 0;
    osd_op_t *read_op = NULL;
-    iovec read_iov;
-    msghdr read_msg;
+    iovec read_iov = { 0 };
+    msghdr read_msg = { 0 };
    int read_remaining = 0;
    int read_state = 0;
    osd_op_buf_list_t recv_list;
@@ -212,17 +215,16 @@ struct osd_client_t
    std::vector<osd_op_t*> received_ops;

    // Outbound operations
-    std::deque<osd_op_t*> outbox;
-    std::map<int, osd_op_t*> sent_ops;
+    std::map<uint64_t, osd_op_t*> sent_ops;

    // PGs dirtied by this client's primary-writes
    std::set<pool_pg_num_t> dirty_pgs;

    // Write state
-    osd_op_t *write_op = NULL;
-    msghdr write_msg;
+    msghdr write_msg = { 0 };
    int write_state = 0;
-    osd_op_buf_list_t send_list;
+    std::vector<iovec> send_list, next_send_list;
+    std::vector<osd_op_t*> outbox, next_outbox;
 };

 struct osd_wanted_peer_t
@@ -263,7 +265,7 @@ struct osd_messenger_t
    std::map<uint64_t, int> osd_peer_fds;
    uint64_t next_subop_id = 1;

-    std::map<int, osd_client_t> clients;
+    std::map<int, osd_client_t*> clients;
    std::vector<int> read_ready_clients;
    std::vector<int> write_ready_clients;
    std::vector<std::function<void()>> set_immediate;
@@ -288,15 +290,16 @@ protected:
    void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
    void handle_connect_epoll(int peer_fd);
    void on_connect_peer(osd_num_t peer_osd, int peer_fd);
-    void check_peer_config(osd_client_t & cl);
-    void cancel_osd_ops(osd_client_t & cl);
+    void check_peer_config(osd_client_t *cl);
+    void cancel_osd_ops(osd_client_t *cl);
    void cancel_op(osd_op_t *op);

-    bool try_send(osd_client_t & cl);
-    void handle_send(int result, int peer_fd);
+    bool try_send(osd_client_t *cl);
+    void measure_exec(osd_op_t *cur_op);
+    void handle_send(int result, osd_client_t *cl);

-    bool handle_read(int result, int peer_fd);
-    bool handle_finished_read(osd_client_t & cl);
+    bool handle_read(int result, osd_client_t *cl);
+    bool handle_finished_read(osd_client_t *cl);
    void handle_op_hdr(osd_client_t *cl);
    bool handle_reply_hdr(osd_client_t *cl);
    void handle_reply_ready(osd_op_t *op);
--- a/mon/make-units.sh
+++ b/mon/make-units.sh
@@ -8,6 +8,43 @@ IP=`ip -json a s | jq -r '.[].addr_info[] | select(.broadcast == "10.115.0.255")

 [ "$IP" != "" ] || exit 1

+BASE=${IP/*./}
+BASE=$((BASE-10))
+
+useradd etcd
+
+mkdir -p /var/lib/etcd$BASE.etcd
+cat >/etc/systemd/system/etcd.service <<EOF
+[Unit]
+Description=etcd for vitastor
+After=network-online.target local-fs.target time-sync.target
+Wants=network-online.target local-fs.target time-sync.target
+
+[Service]
+Restart=always
+ExecStart=/usr/local/bin/etcd -name etcd$BASE --data-dir /var/lib/etcd$BASE.etcd \\
+    --advertise-client-urls http://$IP:2379 --listen-client-urls http://$IP:2379 \\
+    --initial-advertise-peer-urls http://$IP:2380 --listen-peer-urls http://$IP:2380 \\
+    --initial-cluster-token vitastor-etcd-1 --initial-cluster etcd0=http://10.115.0.10:2380,etcd1=http://10.115.0.11:2380,etcd2=http://10.115.0.12:2380,etcd3=http://10.115.0.13:2380 \\
+    --initial-cluster-state new --max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision
+WorkingDirectory=/var/lib/etcd$BASE.etcd
+ExecStartPre=+chown -R etcd /var/lib/etcd$BASE.etcd
+User=etcd
+PrivateTmp=false
+TasksMax=infinity
+Restart=always
+StartLimitInterval=0
+StartLimitIntervalSec=0
+RestartSec=10
+
+[Install]
+WantedBy=local.target
+EOF
+
+systemctl daemon-reload
+systemctl enable etcd
+systemctl start etcd
+
 useradd vitastor
 chmod 755 /root

--- a/mon/mon-main.js
+++ b/mon/mon-main.js
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -9,210 +9,215 @@ const LPOptimizer = require('./lp-optimizer.js');
 const stableStringify = require('./stable-stringify.js');
 const PGUtil = require('./PGUtil.js');

+// FIXME document all etcd keys and config variables in the form of JSON schema or similar
+const etcd_allow = new RegExp('^'+[
+    'config/global',
+    'config/node_placement',
+    'config/pools',
+    'config/osd/[1-9]\\d*',
+    'config/pgs',
+    'osd/state/[1-9]\\d*',
+    'osd/stats/[1-9]\\d*',
+    'mon/master',
+    'pg/state/[1-9]\\d*/[1-9]\\d*',
+    'pg/stats/[1-9]\\d*/[1-9]\\d*',
+    'pg/history/[1-9]\\d*/[1-9]\\d*',
+    'stats',
+].join('$|^')+'$');
+
+const etcd_tree = {
+    config: {
+        /* global: {
+            // mon
+            etcd_mon_ttl: 30, // min: 10
+            etcd_mon_timeout: 1000, // ms. min: 0
+            etcd_mon_retries: 5, // min: 0
+            mon_change_timeout: 1000, // ms. min: 100
+            mon_stats_timeout: 1000, // ms. min: 100
+            osd_out_time: 1800, // seconds. min: 0
+            placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
+            // client and osd
+            use_sync_send_recv: false,
+            log_level: 0,
+            block_size: 131072,
+            disk_alignment: 4096,
+            bitmap_granularity: 4096,
+            immediate_commit: false, // 'all' or 'small'
+            client_dirty_limit: 33554432,
+            peer_connect_interval: 5, // seconds. min: 1
+            peer_connect_timeout: 5, // seconds. min: 1
+            up_wait_retry_interval: 500, // ms. min: 50
+            // osd
+            etcd_report_interval: 30, // min: 10
+            run_primary: true,
+            bind_address: "0.0.0.0",
+            bind_port: 0,
+            autosync_interval: 5,
+            client_queue_depth: 128, // unused
+            recovery_queue_depth: 4,
+            readonly: false,
+            print_stats_interval: 3,
+            // blockstore - fixed in superblock
+            block_size,
+            disk_alignment,
+            journal_block_size,
+            meta_block_size,
+            bitmap_granularity,
+            journal_device,
+            journal_offset,
+            journal_size,
+            disable_journal_fsync,
+            data_device,
+            data_offset,
+            data_size,
+            disable_data_fsync,
+            meta_device,
+            meta_offset,
+            disable_meta_fsync,
+            disable_device_lock,
+            // blockstore - configurable
+            flusher_count,
+            inmemory_metadata,
+            inmemory_journal,
+            journal_sector_buffer_count,
+            journal_no_same_sector_overwrites,
+        }, */
+        global: {},
+        /* node_placement: {
+            host1: { level: 'host', parent: 'rack1' },
+            ...
+        }, */
+        node_placement: {},
+        /* pools: {
+            <id>: {
+                name: 'testpool',
+                // jerasure uses Reed-Solomon-Vandermonde codes
+                scheme: 'replicated' | 'xor' | 'jerasure',
+                pg_size: 3,
+                pg_minsize: 2,
+                // number of parity chunks, required for jerasure
+                parity_chunks?: 1,
+                pg_count: 100,
+                failure_domain: 'host',
+                max_osd_combinations: 10000,
+                pg_stripe_size: 4194304,
+                root_node?: 'rack1',
+                // restrict pool to OSDs having all of these tags
+                osd_tags?: 'nvme' | [ 'nvme', ... ],
+            },
+            ...
+        }, */
+        pools: {},
+        osd: {
+            /* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
+        },
+        /* pgs: {
+            hash: string,
+            items: {
+                <pool_id>: {
+                    <pg_id>: {
+                        osd_set: [ 1, 2, 3 ],
+                        primary: 1,
+                        pause: false,
+                    }
+                }
+            }
+        }, */
+        pgs: {},
+    },
+    osd: {
+        state: {
+            /* <osd_num_t>: {
+                state: "up",
+                addresses: string[],
+                host: string,
+                port: uint16_t,
+                primary_enabled: boolean,
+                blockstore_enabled: boolean,
+            }, */
+        },
+        stats: {
+            /* <osd_num_t>: {
+                time: number, // unix time
+                blockstore_ready: boolean,
+                size: uint64_t, // bytes
+                free: uint64_t, // bytes
+                host: string,
+                op_stats: {
+                    <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+                },
+                subop_stats: {
+                    <string>: { count: uint64_t, usec: uint64_t },
+                },
+                recovery_stats: {
+                    degraded: { count: uint64_t, bytes: uint64_t },
+                    misplaced: { count: uint64_t, bytes: uint64_t },
+                },
+            }, */
+        },
+    },
+    mon: {
+        master: {
+            /* ip: [ string ], */
+        },
+    },
+    pg: {
+        state: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    primary: osd_num_t,
+                    state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
+                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
+                        "has_invalid"|"left_on_dead")[],
+                }
+            }, */
+        },
+        stats: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    object_count: uint64_t,
+                    clean_count: uint64_t,
+                    misplaced_count: uint64_t,
+                    degraded_count: uint64_t,
+                    incomplete_count: uint64_t,
+                    write_osd_set: osd_num_t[],
+                },
+            }, */
+        },
+        history: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    osd_sets: osd_num_t[][],
+                    all_peers: osd_num_t[],
+                    epoch: uint32_t,
+                },
+            }, */
+        },
+    },
+    stats: {
+        /* op_stats: {
+            <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+        },
+        subop_stats: {
+            <string>: { count: uint64_t, usec: uint64_t },
+        },
+        recovery_stats: {
+            degraded: { count: uint64_t, bytes: uint64_t },
+            misplaced: { count: uint64_t, bytes: uint64_t },
+        },
+        object_counts: {
+            object: uint64_t,
+            clean: uint64_t,
+            misplaced: uint64_t,
+            degraded: uint64_t,
+            incomplete: uint64_t,
+        }, */
+    },
+};
+
 // FIXME Split into several files
 class Mon
 {
-    // FIXME document all etcd keys and config variables in the form of JSON schema or similar
-    static etcd_allow = new RegExp('^'+[
-        'config/global',
-        'config/node_placement',
-        'config/pools',
-        'config/osd/[1-9]\\d*',
-        'config/pgs',
-        'osd/state/[1-9]\\d*',
-        'osd/stats/[1-9]\\d*',
-        'mon/master',
-        'pg/state/[1-9]\\d*/[1-9]\\d*',
-        'pg/stats/[1-9]\\d*/[1-9]\\d*',
-        'pg/history/[1-9]\\d*/[1-9]\\d*',
-        'stats',
-    ].join('$|^')+'$')
-
-    static etcd_tree = {
-        config: {
-            /* global: {
-                // mon
-                etcd_mon_ttl: 30, // min: 10
-                etcd_mon_timeout: 1000, // ms. min: 0
-                etcd_mon_retries: 5, // min: 0
-                mon_change_timeout: 1000, // ms. min: 100
-                mon_stats_timeout: 1000, // ms. min: 100
-                osd_out_time: 1800, // seconds. min: 0
-                placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
-                // client and osd
-                use_sync_send_recv: false,
-                log_level: 0,
-                block_size: 131072,
-                disk_alignment: 4096,
-                bitmap_granularity: 4096,
-                immediate_commit: false, // 'all' or 'small'
-                client_dirty_limit: 33554432,
-                peer_connect_interval: 5, // seconds. min: 1
-                peer_connect_timeout: 5, // seconds. min: 1
-                up_wait_retry_interval: 500, // ms. min: 50
-                // osd
-                etcd_report_interval: 30, // min: 10
-                run_primary: true,
-                bind_address: "0.0.0.0",
-                bind_port: 0,
-                autosync_interval: 5,
-                client_queue_depth: 128, // unused
-                recovery_queue_depth: 4,
-                readonly: false,
-                print_stats_interval: 3,
-                // blockstore - fixed in superblock
-                block_size,
-                disk_alignment,
-                journal_block_size,
-                meta_block_size,
-                bitmap_granularity,
-                journal_device,
-                journal_offset,
-                journal_size,
-                disable_journal_fsync,
-                data_device,
-                data_offset,
-                data_size,
-                disable_data_fsync,
-                meta_device,
-                meta_offset,
-                disable_meta_fsync,
-                disable_device_lock,
-                // blockstore - configurable
-                flusher_count,
-                inmemory_metadata,
-                inmemory_journal,
-                journal_sector_buffer_count,
-                journal_no_same_sector_overwrites,
-            }, */
-            global: {},
-            /* node_placement: {
-                host1: { level: 'host', parent: 'rack1' },
-                ...
-            }, */
-            node_placement: {},
-            /* pools: {
-                <id>: {
-                    name: 'testpool',
-                    scheme: 'xor',
-                    pg_size: 3,
-                    pg_minsize: 2,
-                    pg_count: 100,
-                    failure_domain: 'host',
-                    max_osd_combinations: 10000,
-                    pg_stripe_size: 4194304,
-                    // FIXME add device classes/tags
-                },
-                ...
-            }, */
-            pools: {},
-            osd: {
-                /* <id>: { reweight: 1 }, ... */
-            },
-            /* pgs: {
-                hash: string,
-                items: {
-                    <pool_id>: {
-                        <pg_id>: {
-                            osd_set: [ 1, 2, 3 ],
-                            primary: 1,
-                            pause: false,
-                        }
-                    }
-                }
-            }, */
-            pgs: {},
-        },
-        osd: {
-            state: {
-                /* <osd_num_t>: {
-                    state: "up",
-                    addresses: string[],
-                    host: string,
-                    port: uint16_t,
-                    primary_enabled: boolean,
-                    blockstore_enabled: boolean,
-                }, */
-            },
-            stats: {
-                /* <osd_num_t>: {
-                    time: number, // unix time
-                    blockstore_ready: boolean,
-                    size: uint64_t, // bytes
-                    free: uint64_t, // bytes
-                    host: string,
-                    op_stats: {
-                        <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
-                    },
-                    subop_stats: {
-                        <string>: { count: uint64_t, usec: uint64_t },
-                    },
-                    recovery_stats: {
-                        degraded: { count: uint64_t, bytes: uint64_t },
-                        misplaced: { count: uint64_t, bytes: uint64_t },
-                    },
-                }, */
-            },
-        },
-        mon: {
-            master: {
-                /* ip: [ string ], */
-            },
-        },
-        pg: {
-            state: {
-                /* <pool_id>: {
-                    <pg_id>: {
-                        primary: osd_num_t,
-                        state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
-                            "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
-                            "has_invalid"|"left_on_dead")[],
-                    }
-                }, */
-            },
-            stats: {
-                /* <pool_id>: {
-                    <pg_id>: {
-                        object_count: uint64_t,
-                        clean_count: uint64_t,
-                        misplaced_count: uint64_t,
-                        degraded_count: uint64_t,
-                        incomplete_count: uint64_t,
-                        write_osd_set: osd_num_t[],
-                    },
-                }, */
-            },
-            history: {
-                /* <pool_id>: {
-                    <pg_id>: {
-                        osd_sets: osd_num_t[][],
-                        all_peers: osd_num_t[],
-                        epoch: uint32_t,
-                    },
-                }, */
-            },
-        },
-        stats: {
-            /* op_stats: {
-                <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
-            },
-            subop_stats: {
-                <string>: { count: uint64_t, usec: uint64_t },
-            },
-            recovery_stats: {
-                degraded: { count: uint64_t, bytes: uint64_t },
-                misplaced: { count: uint64_t, bytes: uint64_t },
-            },
-            object_counts: {
-                object: uint64_t,
-                clean: uint64_t,
-                misplaced: uint64_t,
-                degraded: uint64_t,
-                incomplete: uint64_t,
-            }, */
-        },
-    }
-
    constructor(config)
    {
        // FIXME: Maybe prefer local etcd
@@ -329,6 +334,7 @@ class Mon
                range_end: b64(this.etcd_prefix+'0'),
                start_revision: ''+this.etcd_watch_revision,
                watch_id: 1,
+                progress_notify: true,
            },
        }));
        this.ws.on('message', (msg) =>
@@ -466,7 +472,8 @@ class Mon
            if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
            {
                // Numeric IDs are reserved for OSDs
-                let reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight);
+                const osd_cfg = this.state.config.osd[osd_num];
+                let reweight = osd_cfg && Number(osd_cfg.reweight);
                if (reweight < 0 || isNaN(reweight))
                    reweight = 1;
                if (this.state.osd.state[osd_num] && reweight > 0)
@@ -474,10 +481,26 @@ class Mon
                    // React to down OSDs immediately
                    up_osds[osd_num] = true;
                }
-                tree[osd_num] = tree[osd_num] || { id: osd_num, parent: stat.host };
+                tree[osd_num] = tree[osd_num] || {};
+                tree[osd_num].id = osd_num;
+                tree[osd_num].parent = tree[osd_num].parent || stat.host;
                tree[osd_num].level = 'osd';
                tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
+                if (osd_cfg && osd_cfg.tags)
+                {
+                    tree[osd_num].tags = (osd_cfg.tags instanceof Array ? [ ...osd_cfg.tags ] : [ osd_cfg.tags ])
+                        .reduce((a, c) => { a[c] = true; return a; }, {});
+                }
                delete tree[osd_num].children;
+                if (!tree[tree[osd_num].parent])
+                {
+                    tree[tree[osd_num].parent] = {
+                        id: tree[osd_num].parent,
+                        level: 'host',
+                        parent: null,
+                        children: [],
+                    };
+                }
            }
        }
        for (const node_id in tree)
@@ -492,11 +515,11 @@ class Mon
                && tree[node_cfg.parent].level;
            parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
            // Parent's level must be less than child's; OSDs must be leaves
-            const parent = parent_level && parent_level < node_level ? tree[node_cfg.parent] : '';
+            const parent = parent_level && parent_level < node_level ? node_cfg.parent : '';
            tree[parent].children.push(tree[node_id]);
            delete node_cfg.parent;
        }
-        return { up_osds, osd_tree: LPOptimizer.flatten_tree(tree[''].children, levels, this.config.failure_domain, 'osd') };
+        return { up_osds, levels, osd_tree: tree };
    }

    async stop_all_pgs(pool_id)
@@ -542,19 +565,15 @@ class Mon
                    { requestPut: { key: b64(this.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
                ],
            }, this.config.etcd_mon_timeout, 0);
-            if (!res.succeeded)
-            {
-                return false;
-            }
-            this.state.config.pgs = new_cfg;
+            return false;
        }
        return !has_online;
    }

    save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history)
    {
-        const replicated = this.state.config.pools[pool_id].scheme === 'replicated';
-        const pg_minsize = this.state.config.pools[pool_id].pg_minsize;
+        const replicated = new_pgs.length && this.state.config.pools[pool_id].scheme === 'replicated';
+        const pg_minsize = new_pgs.length && this.state.config.pools[pool_id].pg_minsize;
        const pg_items = {};
        new_pgs.map((osd_set, i) =>
        {
@@ -609,13 +628,21 @@ class Mon
            }
        }
        this.state.config.pgs.items = this.state.config.pgs.items || {};
-        this.state.config.pgs.items[pool_id] = pg_items;
+        if (!new_pgs.length)
+        {
+            delete this.state.config.pgs.items[pool_id];
+        }
+        else
+        {
+            this.state.config.pgs.items[pool_id] = pg_items;
+        }
    }

    validate_pool_cfg(pool_id, pool_cfg, warn)
    {
        pool_cfg.pg_size = Math.floor(pool_cfg.pg_size);
        pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
+        pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
        pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
        pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
        pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
@@ -625,8 +652,14 @@ class Mon
                console.log('Pool ID '+pool_id+' is invalid');
            return false;
        }
-        if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 ||
-            pool_cfg.scheme === 'xor' && pool_cfg.pg_size < 3)
+        if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' && pool_cfg.scheme !== 'jerasure')
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated" and "jerasure" required)');
+            return false;
+        }
+        if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
+            (pool_cfg.scheme === 'xor' || pool_cfg.scheme == 'jerasure') && pool_cfg.pg_size < 3)
        {
            if (warn)
                console.log('Pool '+pool_id+' has invalid pg_size');
@@ -639,6 +672,18 @@ class Mon
                console.log('Pool '+pool_id+' has invalid pg_minsize');
            return false;
        }
+        if (pool_cfg.scheme === 'xor' && pool_cfg.parity_chunks != 0 && pool_cfg.parity_chunks != 1)
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
+            return false;
+        }
+        if (pool_cfg.scheme === 'jerasure' && (pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
+            return false;
+        }
        if (!pool_cfg.pg_count || pool_cfg.pg_count < 1)
        {
            if (warn)
@@ -651,27 +696,60 @@ class Mon
                console.log('Pool '+pool_id+' has empty name');
            return false;
        }
-        if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated')
-        {
-            if (warn)
-                console.log('Pool '+pool_id+' has invalid coding scheme (only "xor" and "replicated" are allowed)');
-            return false;
-        }
        if (pool_cfg.max_osd_combinations < 100)
        {
            if (warn)
                console.log('Pool '+pool_id+' has invalid max_osd_combinations (must be at least 100)');
            return false;
        }
+        if (pool_cfg.root_node && typeof(pool_cfg.root_node) != 'string')
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid root_node (must be a string)');
+            return false;
+        }
+        if (pool_cfg.osd_tags && typeof(pool_cfg.osd_tags) != 'string' &&
+            (!(pool_cfg.osd_tags instanceof Array) || pool_cfg.osd_tags.filter(t => typeof t != 'string').length > 0))
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
+            return false;
+        }
        return true;
    }

+    filter_osds_by_tags(orig_tree, flat_tree, tags) // Prunes flat_tree in place so that only OSDs carrying every tag in `tags` (a string or an array of strings) remain
+    {
+        if (!tags) // No tag filter given: leave flat_tree untouched
+        {
+            return;
+        }
+        for (const tag of (tags instanceof Array ? tags : [ tags ])) // A single tag is handled as a one-element list
+        {
+            for (const host in flat_tree) // NOTE(review): keys are called "host" here, but presumably they are failure-domain nodes produced by LPOptimizer.flatten_tree - confirm
+            {
+                let found = 0; // Number of OSDs under this node that carry the current tag
+                for (const osd in flat_tree[host])
+                {
+                    if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag]) // Tags are read from orig_tree, looked up by the same OSD key used in flat_tree
+                        delete flat_tree[host][osd];
+                    else
+                        found++;
+                }
+                if (!found) // No OSD with this tag is left under the node, so drop the node itself
+                {
+                    delete flat_tree[host];
+                }
+            }
+        }
+    }
+
    async recheck_pgs()
    {
        // Take configuration and state, check it against the stored configuration hash
        // Recalculate PGs and save them to etcd if the configuration is changed
        // FIXME: Also do not change anything if the distribution is good enough and no PGs are degraded
-        const { up_osds, osd_tree } = this.get_osd_tree();
+        const { up_osds, levels, osd_tree } = this.get_osd_tree();
        const tree_cfg = {
            osd_tree,
            pools: this.state.config.pools,
@@ -681,6 +759,24 @@ class Mon
        {
            // Something has changed
            const etcd_request = { compare: [], success: [] };
+            for (const pool_id in (this.state.config.pgs||{}).items||{})
+            {
+                if (!this.state.config.pools[pool_id])
+                {
+                    // Pool deleted. Delete all PGs, but first stop them.
+                    if (!await this.stop_all_pgs(pool_id))
+                    {
+                        this.schedule_recheck();
+                        return;
+                    }
+                    const prev_pgs = [];
+                    for (const pg in this.state.config.pgs.items[pool_id]||{})
+                    {
+                        prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
+                    }
+                    this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, [], []);
+                }
+            }
            for (const pool_id in this.state.config.pools)
            {
                const pool_cfg = this.state.config.pools[pool_id];
@@ -688,6 +784,10 @@ class Mon
                {
                    continue;
                }
+                let pool_tree = osd_tree[pool_cfg.root_node || ''];
+                pool_tree = pool_tree ? pool_tree.children : [];
+                pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
+                this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
                const prev_pgs = [];
                for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{})||{})
                {
@@ -710,7 +810,7 @@ class Mon
                    }
                    optimize_result = await LPOptimizer.optimize_change({
                        prev_pgs,
-                        osd_tree: tree_cfg.osd_tree,
+                        osd_tree: pool_tree,
                        pg_size: pool_cfg.pg_size,
                        pg_minsize: pool_cfg.pg_minsize,
                        max_combinations: pool_cfg.max_osd_combinations,
@@ -719,7 +819,7 @@ class Mon
                else
                {
                    optimize_result = await LPOptimizer.optimize_initial({
-                        osd_tree: tree_cfg.osd_tree,
+                        osd_tree: pool_tree,
                        pg_count: pool_cfg.pg_count,
                        pg_size: pool_cfg.pg_size,
                        pg_minsize: pool_cfg.pg_minsize,
@@ -753,7 +853,7 @@ class Mon
                const replicated = pool_cfg.scheme === 'replicated';
                for (const pg_num in ((this.state.config.pgs.items||{})[pool_id]||{})||{})
                {
-                    const pg_cfg = this.state.config.pgs.items[pool_id][pg];
+                    const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
                    if (!Number(pg_cfg.primary) || !up_osds[pg_cfg.primary])
                    {
                        let alive_set;
@@ -1005,7 +1105,7 @@ class Mon
        }
        catch (e)
        {
-            console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
+            console.log('Bad value in etcd: '+kv.key+' = '+kv.value);
            return;
        }
        key = key.split('/');
@@ -1166,4 +1266,7 @@ function sha1hex(str)
    return hash.digest('hex');
 }

+Mon.etcd_allow = etcd_allow;
+Mon.etcd_tree = etcd_tree;
+
 module.exports = Mon;
--- a/msgr_receive.cpp
+++ b/msgr_receive.cpp
@@ -8,21 +8,22 @@ void osd_messenger_t::read_requests()
    for (int i = 0; i < read_ready_clients.size(); i++)
    {
        int peer_fd = read_ready_clients[i];
-        auto & cl = clients[peer_fd];
-        if (cl.read_remaining < receive_buffer_size)
+        osd_client_t *cl = clients[peer_fd];
+        if (cl->read_remaining < receive_buffer_size)
        {
-            cl.read_iov.iov_base = cl.in_buf;
-            cl.read_iov.iov_len = receive_buffer_size;
-            cl.read_msg.msg_iov = &cl.read_iov;
-            cl.read_msg.msg_iovlen = 1;
+            cl->read_iov.iov_base = cl->in_buf;
+            cl->read_iov.iov_len = receive_buffer_size;
+            cl->read_msg.msg_iov = &cl->read_iov;
+            cl->read_msg.msg_iovlen = 1;
        }
        else
        {
-            cl.read_iov.iov_base = 0;
-            cl.read_iov.iov_len = cl.read_remaining;
-            cl.read_msg.msg_iov = cl.recv_list.get_iovec();
-            cl.read_msg.msg_iovlen = cl.recv_list.get_size();
+            cl->read_iov.iov_base = 0;
+            cl->read_iov.iov_len = cl->read_remaining;
+            cl->read_msg.msg_iov = cl->recv_list.get_iovec();
+            cl->read_msg.msg_iovlen = cl->recv_list.get_size();
        }
+        cl->refs++;
        if (ringloop && !use_sync_send_recv)
        {
            io_uring_sqe* sqe = ringloop->get_sqe();
@@ -32,111 +33,115 @@ void osd_messenger_t::read_requests()
                return;
            }
            ring_data_t* data = ((ring_data_t*)sqe->user_data);
-            data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data->res, peer_fd); };
-            my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
+            data->callback = [this, cl](ring_data_t *data) { handle_read(data->res, cl); };
+            my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
        }
        else
        {
-            int result = recvmsg(peer_fd, &cl.read_msg, 0);
+            int result = recvmsg(peer_fd, &cl->read_msg, 0);
            if (result < 0)
            {
                result = -errno;
            }
-            handle_read(result, peer_fd);
+            handle_read(result, cl);
        }
    }
    read_ready_clients.clear();
 }

-bool osd_messenger_t::handle_read(int result, int peer_fd)
+bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
 {
    bool ret = false;
-    auto cl_it = clients.find(peer_fd);
-    if (cl_it != clients.end())
+    cl->refs--;
+    if (cl->peer_state == PEER_STOPPED)
    {
-        auto & cl = cl_it->second;
-        if (result <= 0 && result != -EAGAIN)
+        if (cl->refs <= 0)
        {
-            // this is a client socket, so don't panic on error. just disconnect it
-            if (result != 0)
-            {
-                printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
-            }
-            stop_client(peer_fd);
-            return false;
+            delete cl;
        }
-        if (result == -EAGAIN || result < cl.read_iov.iov_len)
+        return false;
+    }
+    if (result <= 0 && result != -EAGAIN)
+    {
+        // this is a client socket, so don't panic on error. just disconnect it
+        if (result != 0)
        {
-            cl.read_ready--;
-            if (cl.read_ready > 0)
-                read_ready_clients.push_back(peer_fd);
+            printf("Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
+        }
+        stop_client(cl->peer_fd);
+        return false;
+    }
+    if (result == -EAGAIN || result < cl->read_iov.iov_len)
+    {
+        cl->read_ready--;
+        if (cl->read_ready > 0)
+            read_ready_clients.push_back(cl->peer_fd);
+    }
+    else
+    {
+        read_ready_clients.push_back(cl->peer_fd);
+    }
+    if (result > 0)
+    {
+        if (cl->read_iov.iov_base == cl->in_buf)
+        {
+            // Compose operation(s) from the buffer
+            int remain = result;
+            void *curbuf = cl->in_buf;
+            while (remain > 0)
+            {
+                if (!cl->read_op)
+                {
+                    cl->read_op = new osd_op_t;
+                    cl->read_op->peer_fd = cl->peer_fd;
+                    cl->read_op->op_type = OSD_OP_IN;
+                    cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
+                    cl->read_remaining = OSD_PACKET_SIZE;
+                    cl->read_state = CL_READ_HDR;
+                }
+                while (cl->recv_list.done < cl->recv_list.count && remain > 0)
+                {
+                    iovec* cur = cl->recv_list.get_iovec();
+                    if (cur->iov_len > remain)
+                    {
+                        memcpy(cur->iov_base, curbuf, remain);
+                        cl->read_remaining -= remain;
+                        cur->iov_len -= remain;
+                        cur->iov_base += remain;
+                        remain = 0;
+                    }
+                    else
+                    {
+                        memcpy(cur->iov_base, curbuf, cur->iov_len);
+                        curbuf += cur->iov_len;
+                        cl->read_remaining -= cur->iov_len;
+                        remain -= cur->iov_len;
+                        cur->iov_len = 0;
+                        cl->recv_list.done++;
+                    }
+                }
+                if (cl->recv_list.done >= cl->recv_list.count)
+                {
+                    if (!handle_finished_read(cl))
+                    {
+                        goto fin;
+                    }
+                }
+            }
        }
        else
        {
-            read_ready_clients.push_back(peer_fd);
+            // Long data
+            cl->read_remaining -= result;
+            cl->recv_list.eat(result);
+            if (cl->recv_list.done >= cl->recv_list.count)
+            {
+                handle_finished_read(cl);
+            }
        }
-        if (result > 0)
+        if (result >= cl->read_iov.iov_len)
        {
-            if (cl.read_iov.iov_base == cl.in_buf)
-            {
-                // Compose operation(s) from the buffer
-                int remain = result;
-                void *curbuf = cl.in_buf;
-                while (remain > 0)
-                {
-                    if (!cl.read_op)
-                    {
-                        cl.read_op = new osd_op_t;
-                        cl.read_op->peer_fd = peer_fd;
-                        cl.read_op->op_type = OSD_OP_IN;
-                        cl.recv_list.push_back(cl.read_op->req.buf, OSD_PACKET_SIZE);
-                        cl.read_remaining = OSD_PACKET_SIZE;
-                        cl.read_state = CL_READ_HDR;
-                    }
-                    while (cl.recv_list.done < cl.recv_list.count && remain > 0)
-                    {
-                        iovec* cur = cl.recv_list.get_iovec();
-                        if (cur->iov_len > remain)
-                        {
-                            memcpy(cur->iov_base, curbuf, remain);
-                            cl.read_remaining -= remain;
-                            cur->iov_len -= remain;
-                            cur->iov_base += remain;
-                            remain = 0;
-                        }
-                        else
-                        {
-                            memcpy(cur->iov_base, curbuf, cur->iov_len);
-                            curbuf += cur->iov_len;
-                            cl.read_remaining -= cur->iov_len;
-                            remain -= cur->iov_len;
-                            cur->iov_len = 0;
-                            cl.recv_list.done++;
-                        }
-                    }
-                    if (cl.recv_list.done >= cl.recv_list.count)
-                    {
-                        if (!handle_finished_read(cl))
-                        {
-                            goto fin;
-                        }
-                    }
-                }
-            }
-            else
-            {
-                // Long data
-                cl.read_remaining -= result;
-                cl.recv_list.eat(result);
-                if (cl.recv_list.done >= cl.recv_list.count)
-                {
-                    handle_finished_read(cl);
-                }
-            }
-            if (result >= cl.read_iov.iov_len)
-            {
-                ret = true;
-            }
+            ret = true;
        }
    }
 fin:
@@ -148,30 +153,30 @@ fin:
    return ret;
 }

-bool osd_messenger_t::handle_finished_read(osd_client_t & cl)
+bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
 {
-    cl.recv_list.reset();
-    if (cl.read_state == CL_READ_HDR)
+    cl->recv_list.reset();
+    if (cl->read_state == CL_READ_HDR)
    {
-        if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
-            return handle_reply_hdr(&cl);
+        if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
+            return handle_reply_hdr(cl);
        else
-            handle_op_hdr(&cl);
+            handle_op_hdr(cl);
    }
-    else if (cl.read_state == CL_READ_DATA)
+    else if (cl->read_state == CL_READ_DATA)
    {
        // Operation is ready
-        cl.received_ops.push_back(cl.read_op);
-        set_immediate.push_back([this, op = cl.read_op]() { exec_op(op); });
-        cl.read_op = NULL;
-        cl.read_state = 0;
+        cl->received_ops.push_back(cl->read_op);
+        set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
+        cl->read_op = NULL;
+        cl->read_state = 0;
    }
-    else if (cl.read_state == CL_READ_REPLY_DATA)
+    else if (cl->read_state == CL_READ_REPLY_DATA)
    {
        // Reply is ready
-        handle_reply_ready(cl.read_op);
-        cl.read_op = NULL;
-        cl.read_state = 0;
+        handle_reply_ready(cl->read_op);
+        cl->read_op = NULL;
+        cl->read_state = 0;
    }
    else
    {
@@ -247,6 +252,14 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
    {
        // Read data. In this case we assume that the buffer is preallocated by the caller (!)
        assert(op->iov.count > 0);
+        if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len))
+        {
+            // Check reply length to not overflow the buffer
+            printf("Client %d read reply of different length\n", cl->peer_fd);
+            cl->sent_ops[op->req.hdr.id] = op;
+            stop_client(cl->peer_fd);
+            return false;
+        }
        cl->recv_list.append(op->iov);
        delete cl->read_op;
        cl->read_op = op;
--- a/msgr_send.cpp
+++ b/msgr_send.cpp
@@ -1,12 +1,15 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

+#define _XOPEN_SOURCE
+#include <limits.h>
+
 #include "messenger.h"

 void osd_messenger_t::outbox_push(osd_op_t *cur_op)
 {
    assert(cur_op->peer_fd);
-    auto & cl = clients.at(cur_op->peer_fd);
+    osd_client_t *cl = clients.at(cur_op->peer_fd);
    if (cur_op->op_type == OSD_OP_OUT)
    {
        clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
@@ -14,13 +17,14 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
    else
    {
        // Check that operation actually belongs to this client
+        // FIXME: Review if this is still needed
        bool found = false;
-        for (auto it = cl.received_ops.begin(); it != cl.received_ops.end(); it++)
+        for (auto it = cl->received_ops.begin(); it != cl->received_ops.end(); it++)
        {
            if (*it == cur_op)
            {
                found = true;
-                cl.received_ops.erase(it, it+1);
+                cl->received_ops.erase(it, it+1);
                break;
            }
        }
@@ -30,85 +34,97 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
            return;
        }
    }
-    cl.outbox.push_back(cur_op);
+    auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
+    auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
+    if (cur_op->op_type == OSD_OP_IN)
+    {
+        measure_exec(cur_op);
+        to_send_list.push_back((iovec){ .iov_base = cur_op->reply.buf, .iov_len = OSD_PACKET_SIZE });
+    }
+    else
+    {
+        to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
+        cl->sent_ops[cur_op->req.hdr.id] = cur_op;
+    }
+    // Pre-defined send_lists
+    if ((cur_op->op_type == OSD_OP_IN
+        ? (cur_op->req.hdr.opcode == OSD_OP_READ ||
+        cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
+        cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
+        cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
+        : (cur_op->req.hdr.opcode == OSD_OP_WRITE ||
+        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
+        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
+        cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
+        cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
+    {
+        to_outbox.push_back(NULL);
+        for (int i = 0; i < cur_op->iov.count; i++)
+        {
+            assert(cur_op->iov.buf[i].iov_base);
+            to_send_list.push_back(cur_op->iov.buf[i]);
+            to_outbox.push_back(i == cur_op->iov.count-1 ? cur_op : NULL);
+        }
+    }
+    else
+    {
+        to_outbox.push_back(cur_op);
+    }
    if (!ringloop)
    {
-        while (cl.write_op || cl.outbox.size())
+        // FIXME: It's worse because it doesn't allow batching
+        while (cl->outbox.size())
        {
            try_send(cl);
        }
    }
-    else if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
+    else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
    {
-        if (cl.write_state == 0)
+        if (cl->write_state == 0)
        {
-            cl.write_state = CL_WRITE_READY;
+            cl->write_state = CL_WRITE_READY;
            write_ready_clients.push_back(cur_op->peer_fd);
        }
        ringloop->wakeup();
    }
 }

-bool osd_messenger_t::try_send(osd_client_t & cl)
+void osd_messenger_t::measure_exec(osd_op_t *cur_op) // Updates the per-opcode counters in stats (op count, latency sum, byte count) for cur_op
 {
-    int peer_fd = cl.peer_fd;
-    if (!cl.write_op)
+    // Measure execution latency
+    timespec tv_end;
+    clock_gettime(CLOCK_REALTIME, &tv_end);
+    stats.op_stat_count[cur_op->req.hdr.opcode]++;
+    if (!stats.op_stat_count[cur_op->req.hdr.opcode]) // Count is 0 right after the increment (presumably the counter wrapped around): restart this opcode's statistics
    {
-        // pick next command
-        cl.write_op = cl.outbox.front();
-        cl.outbox.pop_front();
-        cl.write_state = CL_WRITE_REPLY;
-        if (cl.write_op->op_type == OSD_OP_IN)
-        {
-            // Measure execution latency
-            timespec tv_end;
-            clock_gettime(CLOCK_REALTIME, &tv_end);
-            stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
-            if (!stats.op_stat_count[cl.write_op->req.hdr.opcode])
-            {
-                stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
-                stats.op_stat_sum[cl.write_op->req.hdr.opcode] = 0;
-                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] = 0;
-            }
-            stats.op_stat_sum[cl.write_op->req.hdr.opcode] += (
-                (tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
-                (tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
-            );
-            if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
-                cl.write_op->req.hdr.opcode == OSD_OP_WRITE)
-            {
-                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.rw.len;
-            }
-            else if (cl.write_op->req.hdr.opcode == OSD_OP_SEC_READ ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
-            {
-                stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.sec_rw.len;
-            }
-            cl.send_list.push_back(cl.write_op->reply.buf, OSD_PACKET_SIZE);
-            if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_READ ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_LIST ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
-            {
-                cl.send_list.append(cl.write_op->iov);
-            }
-        }
-        else
-        {
-            cl.send_list.push_back(cl.write_op->req.buf, OSD_PACKET_SIZE);
-            if (cl.write_op->req.hdr.opcode == OSD_OP_WRITE ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
-                cl.write_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
-            {
-                cl.send_list.append(cl.write_op->iov);
-            }
-        }
+        stats.op_stat_count[cur_op->req.hdr.opcode]++; // Count the current op again so the restarted counter begins at 1
+        stats.op_stat_sum[cur_op->req.hdr.opcode] = 0;
+        stats.op_stat_bytes[cur_op->req.hdr.opcode] = 0;
+    }
+    stats.op_stat_sum[cur_op->req.hdr.opcode] += ( // Time elapsed since cur_op->tv_begin, in microseconds
+        (tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
+        (tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
+    );
+    if (cur_op->req.hdr.opcode == OSD_OP_READ || // READ/WRITE take the byte count from req.rw.len
+        cur_op->req.hdr.opcode == OSD_OP_WRITE)
+    {
+        stats.op_stat_bytes[cur_op->req.hdr.opcode] += cur_op->req.rw.len;
+    }
+    else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ || // SEC_READ/SEC_WRITE/SEC_WRITE_STABLE take it from req.sec_rw.len; other opcodes add no bytes
+        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
+        cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
+    {
+        stats.op_stat_bytes[cur_op->req.hdr.opcode] += cur_op->req.sec_rw.len;
+    }
+}
+
+bool osd_messenger_t::try_send(osd_client_t *cl)
+{
+    int peer_fd = cl->peer_fd;
+    if (!cl->send_list.size() || cl->write_msg.msg_iovlen > 0)
+    {
+        return true;
    }
-    cl.write_msg.msg_iov = cl.send_list.get_iovec();
-    cl.write_msg.msg_iovlen = cl.send_list.get_size();
    if (ringloop && !use_sync_send_recv)
    {
        io_uring_sqe* sqe = ringloop->get_sqe();
@@ -116,18 +132,24 @@ bool osd_messenger_t::try_send(osd_client_t & cl)
        {
            return false;
        }
+        cl->write_msg.msg_iov = cl->send_list.data();
+        cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->refs++;
        ring_data_t* data = ((ring_data_t*)sqe->user_data);
-        data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data->res, peer_fd); };
-        my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
+        data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
+        my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
    }
    else
    {
-        int result = sendmsg(peer_fd, &cl.write_msg, MSG_NOSIGNAL);
+        cl->write_msg.msg_iov = cl->send_list.data();
+        cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->refs++;
+        int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
        if (result < 0)
        {
            result = -errno;
        }
-        handle_send(result, peer_fd);
+        handle_send(result, cl);
    }
    return true;
 }
@@ -137,7 +159,8 @@ void osd_messenger_t::send_replies()
    for (int i = 0; i < write_ready_clients.size(); i++)
    {
        int peer_fd = write_ready_clients[i];
-        if (!try_send(clients[peer_fd]))
+        auto cl_it = clients.find(peer_fd);
+        if (cl_it != clients.end() && !try_send(cl_it->second))
        {
            write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
            return;
@@ -146,41 +169,67 @@ void osd_messenger_t::send_replies()
    write_ready_clients.clear();
 }

-void osd_messenger_t::handle_send(int result, int peer_fd)
+void osd_messenger_t::handle_send(int result, osd_client_t *cl) // Completion handler for the sendmsg issued by try_send(): result is the sendmsg/io_uring result, negative values are -errno
 {
-    auto cl_it = clients.find(peer_fd);
-    if (cl_it != clients.end())
+    cl->write_msg.msg_iovlen = 0; // No send is in flight anymore; try_send() and outbox_push() test this field
+    cl->refs--; // Drop the reference taken in try_send() before the send was issued
+    if (cl->peer_state == PEER_STOPPED) // Client is already marked stopped: do not touch its queues
     {
-        auto & cl = cl_it->second;
-        if (result < 0 && result != -EAGAIN)
+        if (!cl->refs) // No outstanding references remain, so the client object is freed here
         {
-            // this is a client socket, so don't panic. just disconnect it
-            printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -result, strerror(-result));
-            stop_client(peer_fd);
-            return;
+            delete cl;
         }
-        if (result >= 0)
+        return;
+    }
+    if (result < 0 && result != -EAGAIN)
+    {
+        // this is a client socket, so don't panic. just disconnect it
+        printf("Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
+        stop_client(cl->peer_fd);
+        return;
+    }
+    if (result >= 0)
+    {
+        int done = 0; // Number of leading send_list entries that were sent completely
+        while (result > 0 && done < cl->send_list.size())
         {
-            cl.send_list.eat(result);
-            if (cl.send_list.done >= cl.send_list.count)
+            iovec & iov = cl->send_list[done];
+            if (iov.iov_len <= result) // This buffer was sent in full
             {
-                // Done
-                cl.send_list.reset();
-                if (cl.write_op->op_type == OSD_OP_IN)
+                if (cl->outbox[done]) // outbox is indexed in parallel with send_list; outbox_push() stores the op pointer only at the op's last buffer
                 {
-                    delete cl.write_op;
+                    // Operation fully sent
+                    if (cl->outbox[done]->op_type == OSD_OP_IN) // Only OSD_OP_IN ops are freed here; other ops are not deleted by this function
+                    {
+                        delete cl->outbox[done];
+                    }
                 }
-                else
-                {
-                    cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
-                }
-                cl.write_op = NULL;
-                cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
+                result -= iov.iov_len;
+                done++;
+            }
+            else
+            {
+                iov.iov_len -= result; // Partially sent buffer: shrink it and advance past the bytes already sent
+                iov.iov_base += result;
+                break;
             }
         }
-        if (cl.write_state != 0)
+        if (done > 0) // Remove the fully sent entries from both parallel queues
         {
-            write_ready_clients.push_back(peer_fd);
+            cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);
+            cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+done);
         }
+        if (cl->next_send_list.size()) // Append entries that outbox_push() queued in next_send_list/next_outbox while msg_iovlen was non-zero
+        {
+            cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end());
+            cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end());
+            cl->next_send_list.clear();
+            cl->next_outbox.clear();
+        }
+        cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
+    }
+    if (cl->write_state != 0) // Data is still pending: queue the client in write_ready_clients, which send_replies() iterates
+    {
+        write_ready_clients.push_back(cl->peer_fd);
     }
 }
--- a/nbd_proxy.cpp
+++ b/nbd_proxy.cpp
@@ -17,6 +17,10 @@
 #include "epoll_manager.h"
 #include "cluster_client.h"

+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0
+#endif
+
 const char *exe_name = NULL;

 class nbd_proxy
@@ -107,7 +111,7 @@ public:
    {
        printf(
            "Vitastor NBD proxy\n"
-            "(c) Vitaliy Filippov, 2020 (VNPL-1.0 or GNU GPL 2.0+)\n\n"
+            "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
            "USAGE:\n"
            "  %s map --etcd_address <etcd_address> --pool <pool> --inode <inode> --size <size in bytes>\n"
            "  %s unmap /dev/nbd0\n"
--- a/osd.cpp
+++ b/osd.cpp
@@ -28,6 +28,10 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
    {
        print_stats();
    });
+    this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
+    {
+        print_slow();
+    });

    c_cli.tfd = this->tfd;
    c_cli.ringloop = this->ringloop;
@@ -49,6 +53,8 @@ osd_t::~osd_t()

 void osd_t::parse_config(blockstore_config_t & config)
 {
+    if (config.find("log_level") == config.end())
+        config["log_level"] = "1";
    // Initial startup configuration
    json11::Json json_config = json11::Json(config);
    st_cli.parse_config(json_config);
@@ -91,6 +97,9 @@ void osd_t::parse_config(blockstore_config_t & config)
    print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
    if (!print_stats_interval)
        print_stats_interval = 3;
+    slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10);
+    if (!slow_log_interval)
+        slow_log_interval = 10;
    c_cli.peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
    if (!c_cli.peer_connect_interval)
        c_cli.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
@@ -313,3 +322,78 @@ void osd_t::print_stats()
        printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
    }
 }
+
+// Log every received client/peer operation that has been running for at least
+// slow_log_interval seconds. One line per operation is formatted into a fixed-size
+// stack buffer and printed to stdout; lines that do not fit are truncated.
+void osd_t::print_slow()
+{
+    char alloc[1024];
+    timespec now;
+    clock_gettime(CLOCK_REALTIME, &now);
+    for (auto & kv: c_cli.clients)
+    {
+        for (auto op: kv.second->received_ops)
+        {
+            if ((now.tv_sec - op->tv_begin.tv_sec) >= slow_log_interval)
+            {
+                int l = sizeof(alloc), n;
+                char *buf = alloc;
+                // snprintf() returns the length the text WOULD have had, so clamp it to the space
+                // actually left; otherwise <buf> runs past alloc[] and <l> goes negative (huge size_t)
+#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : (n >= l ? l-1 : n); buf += n; l -= n; }
+                bufprintf("[OSD %lu] Slow op", osd_num);
+                if (kv.second->osd_num)
+                {
+                    bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
+                }
+                else
+                {
+                    bufprintf(" from client %d", kv.second->peer_fd);
+                }
+                bufprintf(": %s id=%lu", osd_op_names[op->req.hdr.opcode], op->req.hdr.id);
+                if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
+                    op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE)
+                {
+                    bufprintf(" %lx:%lx v", op->req.sec_rw.oid.inode, op->req.sec_rw.oid.stripe);
+                    if (op->req.sec_rw.version == UINT64_MAX)
+                    {
+                        bufprintf("%s", "max");
+                    }
+                    else
+                    {
+                        bufprintf("%lu", op->req.sec_rw.version);
+                    }
+                    if (op->req.hdr.opcode != OSD_OP_SEC_DELETE)
+                    {
+                        bufprintf(" offset=%x len=%x", op->req.sec_rw.offset, op->req.sec_rw.len);
+                    }
+                }
+                else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
+                {
+                    for (uint64_t i = 0; i < op->req.sec_stab.len && l > 1; i += sizeof(obj_ver_id))
+                    {
+                        obj_ver_id *ov = (obj_ver_id*)(op->buf + i);
+                        bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
+                    }
+                }
+                else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
+                {
+                    bufprintf(
+                        " inode=%lx-%lx pg=%u/%u, stripe=%lu",
+                        op->req.sec_list.min_inode, op->req.sec_list.max_inode,
+                        op->req.sec_list.list_pg, op->req.sec_list.pg_count,
+                        op->req.sec_list.pg_stripe_size
+                    );
+                }
+                else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
+                    op->req.hdr.opcode == OSD_OP_DELETE)
+                {
+                    bufprintf(" inode=%lx offset=%lx len=%x", op->req.rw.inode, op->req.rw.offset, op->req.rw.len);
+                }
+#undef bufprintf
+                printf("%s\n", alloc);
+            }
+        }
+    }
+}
--- a/osd.h
+++ b/osd.h
@@ -70,6 +70,7 @@ class osd_t
    int client_queue_depth = 128;
    bool allow_test_ops = true;
    int print_stats_interval = 3;
+    int slow_log_interval = 10;
    int immediate_commit = IMMEDIATE_NONE;
    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
@@ -138,6 +139,7 @@ class osd_t
    void create_osd_state();
    void renew_lease();
    void print_stats();
+    void print_slow();
    void reset_stats();
    json11::Json get_statistics();
    void report_statistics();
--- a/osd_cluster.cpp
+++ b/osd_cluster.cpp
@@ -4,6 +4,7 @@
 #include "osd.h"
 #include "base64.h"
 #include "etcd_state_client.h"
+#include "osd_rmw.h"

 // Startup sequence:
 //   Start etcd watcher -> Load global OSD configuration -> Bind socket -> Acquire lease -> Report&lock OSD state
@@ -32,12 +33,26 @@ void osd_t::init_cluster()
            }
            pgs[{ 1, 1 }] = (pg_t){
                .state = PG_PEERING,
+                .scheme = POOL_SCHEME_XOR,
                .pg_cursize = 0,
+                .pg_size = 3,
+                .pg_minsize = 2,
+                .parity_chunks = 1,
                .pool_id = 1,
                .pg_num = 1,
                .target_set = { 1, 2, 3 },
                .cur_set = { 0, 0, 0 },
            };
+            st_cli.pool_config[1] = (pool_config_t){
+                .exists = true,
+                .id = 1,
+                .name = "testpool",
+                .scheme = POOL_SCHEME_XOR,
+                .pg_size = 3,
+                .pg_minsize = 2,
+                .pg_count = 1,
+                .real_pg_count = 1,
+            };
            report_pg_state(pgs[{ 1, 1 }]);
            pg_counts[1] = 1;
        }
@@ -223,8 +238,11 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
 void osd_t::on_change_etcd_state_hook(json11::Json::object & changes)
 {
    // FIXME apply config changes in runtime (maybe, some)
-    apply_pg_count();
-    apply_pg_config();
+    if (run_primary)
+    {
+        apply_pg_count();
+        apply_pg_config();
+    }
 }

 void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
@@ -270,7 +288,6 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
    }
    parse_config(osd_config);
    bind_socket();
-    st_cli.start_etcd_watcher();
    acquire_lease();
 }

@@ -477,7 +494,11 @@ void osd_t::apply_pg_count()
            }
            if (still_active > 0)
            {
-                printf("[OSD %lu] PG count change detected, but %d PG(s) are still active. This is not allowed. Exiting\n", this->osd_num, still_active);
+                printf(
+                    "[OSD %lu] PG count change detected for pool %u (new is %lu, old is %u),"
+                    " but %u PG(s) are still active. This is not allowed. Exiting\n",
+                    this->osd_num, pool_item.first, pool_item.second.real_pg_count, pg_counts[pool_item.first], still_active
+                );
                force_stop(1);
                return;
            }
@@ -581,6 +602,7 @@ void osd_t::apply_pg_config()
                    .pg_cursize = 0,
                    .pg_size = pool_item.second.pg_size,
                    .pg_minsize = pool_item.second.pg_minsize,
+                    .parity_chunks = pool_item.second.parity_chunks,
                    .pool_id = pool_id,
                    .pg_num = pg_num,
                    .reported_epoch = pg_cfg.epoch,
@@ -588,6 +610,10 @@ void osd_t::apply_pg_config()
                    .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
                    .target_set = pg_cfg.target_set,
                };
+                if (pg.scheme == POOL_SCHEME_JERASURE)
+                {
+                    use_jerasure(pg.pg_size, pg.pg_size-pg.parity_chunks, true);
+                }
                this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
                pg.print_state();
                if (pg_cfg.cur_primary == this->osd_num)
@@ -776,6 +802,11 @@ void osd_t::report_pg_states()
                    {
                        // Remove offline PGs after reporting their state
+                        // Call use_jerasure(..., false) while pg_it is still valid: erase() invalidates it
+                        if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
+                        {
+                            use_jerasure(pg_it->second.pg_size, pg_it->second.pg_size-pg_it->second.parity_chunks, false);
+                        }
                        this->pgs.erase(pg_it);
                    }
                }
            }
--- a/osd_flush.cpp
+++ b/osd_flush.cpp
@@ -166,7 +166,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
    {
        // local
        clock_gettime(CLOCK_REALTIME, &op->tv_begin);
-        op->bs_op = new blockstore_op_t({
+        op->bs_op = new blockstore_op_t((blockstore_op_t){
            .opcode = (uint64_t)(rollback ? BS_OP_ROLLBACK : BS_OP_STABLE),
            .callback = [this, op, pool_id, pg_num, fb](blockstore_op_t *bs_op)
            {
@@ -188,7 +188,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
        op->op_type = OSD_OP_OUT;
        op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
        op->peer_fd = peer_fd;
-        op->req = {
+        op->req = (osd_any_op_t){
            .sec_stab = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -246,7 +246,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
 {
    op->osd_op = new osd_op_t();
    op->osd_op->op_type = OSD_OP_OUT;
-    op->osd_op->req = {
+    op->osd_op->req = (osd_any_op_t){
        .rw = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
@@ -258,6 +258,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
            .len = 0,
        },
    };
+    if (log_level > 2)
+    {
+        printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
+    }
    op->osd_op->callback = [this, op](osd_op_t *osd_op)
    {
        // Don't sync the write, it will be synced by our regular sync coroutine
@@ -267,6 +271,11 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
            if (osd_op->reply.hdr.retval == -EPIPE)
            {
                // PG is stopped or one of the OSDs is gone, error is harmless
+                printf(
+                    "Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
+                    op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
+                    map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
+                );
            }
            else
            {
--- a/osd_id.h
+++ b/osd_id.h
@@ -5,6 +5,7 @@

 #define POOL_SCHEME_REPLICATED 1
 #define POOL_SCHEME_XOR 2
+#define POOL_SCHEME_JERASURE 3
 #define POOL_ID_MAX 0x10000
 #define POOL_ID_BITS 16
 #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
--- a/osd_peering.cpp
+++ b/osd_peering.cpp
@@ -91,7 +91,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
            if (repeer)
            {
                // Repeer this pg
-                printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
+                printf("[PG %u/%u] Repeer because of OSD %lu\n", p.second.pool_id, p.second.pg_num, peer_osd);
                start_pg_peering(p.second);
            }
        }
@@ -141,7 +141,7 @@ void osd_t::start_pg_peering(pg_t & pg)
        std::vector<int> to_stop;
        for (auto & cp: c_cli.clients)
        {
-            if (cp.second.dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second.dirty_pgs.end())
+            if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
            {
                to_stop.push_back(cp.first);
            }
@@ -220,8 +220,7 @@ void osd_t::start_pg_peering(pg_t & pg)
            {
                // Discard the result after completion, which, chances are, will be unsuccessful
                discard_list_subop(it->second);
-                pg.peering_state->list_ops.erase(it);
-                it = pg.peering_state->list_ops.begin();
+                pg.peering_state->list_ops.erase(it++);
            }
            else
                it++;
@@ -234,8 +233,7 @@ void osd_t::start_pg_peering(pg_t & pg)
                {
                    free(it->second.buf);
                }
-                pg.peering_state->list_results.erase(it);
-                it = pg.peering_state->list_results.begin();
+                pg.peering_state->list_results.erase(it++);
            }
            else
                it++;
@@ -308,8 +306,8 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
        auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]);
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
-        op->peer_fd = cl.peer_fd;
-        op->req = {
+        op->peer_fd = cl->peer_fd;
+        op->req = (osd_any_op_t){
            .sec_sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -362,8 +360,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
            }
            add_bs_subop_stats(op);
            printf(
-                "[PG %u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
-                ps->pg_num, role_osd, bs_op->retval, bs_op->version
+                "[PG %u/%u] Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
+                ps->pool_id, ps->pg_num, role_osd, bs_op->retval, bs_op->version
            );
            ps->list_results[role_osd] = {
                .buf = (obj_ver_id*)op->bs_op->buf,
@@ -384,7 +382,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = c_cli.osd_peer_fds[role_osd];
-        op->req = {
+        op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -409,8 +407,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
                return;
            }
            printf(
-                "[PG %u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
-                ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
+                "[PG %u/%u] Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
+                ps->pool_id, ps->pg_num, role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
            );
            ps->list_results[role_osd] = {
                .buf = (obj_ver_id*)op->buf,
--- a/osd_peering_pg.cpp
+++ b/osd_peering_pg.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.0 (see README.md for details)

+#include <unordered_map>
 #include "osd_peering_pg.h"

 struct obj_ver_role
@@ -236,7 +237,7 @@ void pg_obj_state_check_t::finish_object()
    {
        if (log_level > 1)
        {
-            printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+            printf("Object is incomplete: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
        }
        state = OBJ_INCOMPLETE;
        pg->state = pg->state | PG_HAS_INCOMPLETE;
@@ -245,37 +246,27 @@ void pg_obj_state_check_t::finish_object()
    {
        if (log_level > 1)
        {
-            printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+            printf("Object is degraded: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
        }
        state = OBJ_DEGRADED;
        pg->state = pg->state | PG_HAS_DEGRADED;
    }
    else if (n_mismatched > 0)
    {
-        if (log_level > 1 && (replicated || n_roles >= pg->pg_cursize))
+        if (log_level > 2 && (replicated || n_roles >= pg->pg_cursize))
        {
-            printf("Object is misplaced: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
+            printf("Object is misplaced: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
        }
        state |= OBJ_MISPLACED;
        pg->state = pg->state | PG_HAS_MISPLACED;
    }
-    if (log_level > 1 && ((replicated ? n_copies : n_roles) < pg->pg_cursize || n_mismatched > 0))
+    if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
+        log_level > 2 && (state & OBJ_MISPLACED))
    {
-        if (log_level > 2)
+        for (int i = obj_start; i < obj_end; i++)
        {
-            for (int i = obj_start; i < obj_end; i++)
-            {
-                printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num,
-                    (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
-            }
-        }
-        else
-        {
-            for (int i = ver_start; i < ver_end; i++)
-            {
-                printf("Target version present on: osd %lu, role %ld%s\n", list[i].osd_num,
-                    (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
-            }
+            printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num,
+                (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
        }
    }
    pg->total_count++;
@@ -439,7 +430,7 @@ void pg_t::calc_object_states(int log_level)
 void pg_t::print_state()
 {
    printf(
-        "[PG %u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
+        "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
        (state & PG_STARTING) ? "starting" : "",
        (state & PG_OFFLINE) ? "offline" : "",
        (state & PG_PEERING) ? "peering" : "",
--- a/osd_peering_pg.h
+++ b/osd_peering_pg.h
@@ -2,7 +2,6 @@
 // License: VNPL-1.0 (see README.md for details)

 #include <map>
-#include <unordered_map>
 #include <vector>
 #include <algorithm>

@@ -45,8 +44,8 @@ struct osd_op_t;
 struct pg_peering_state_t
 {
    // osd_num -> list result
-    std::unordered_map<osd_num_t, osd_op_t*> list_ops;
-    std::unordered_map<osd_num_t, pg_list_result_t> list_results;
+    std::map<osd_num_t, osd_op_t*> list_ops;
+    std::map<osd_num_t, pg_list_result_t> list_results;
    pool_id_t pool_id = 0;
    pg_num_t pg_num = 0;
 };
@@ -76,7 +75,7 @@ struct pg_t
 {
    int state = 0;
    uint64_t scheme = 0;
-    uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0;
+    uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, parity_chunks = 0;
    pool_id_t pool_id = 0;
    pg_num_t pg_num = 0;
    uint64_t clean_count = 0, total_count = 0;
--- a/osd_primary.cpp
+++ b/osd_primary.cpp
@@ -16,10 +16,19 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
 {
    // PG number is calculated from the offset
    // Our EC scheme stores data in fixed chunks equal to (K*block size)
-    // K = pg_minsize in case of EC/XOR, or 1 for replicated pools
+    // K = (pg_size-parity_chunks) in case of EC/XOR, or 1 for replicated pools
    pool_id_t pool_id = INODE_POOL(cur_op->req.rw.inode);
-    auto & pool_cfg = st_cli.pool_config[pool_id];
-    uint64_t pg_block_size = bs_block_size * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize);
+    // FIXME: We have to access pool config here, so make sure that it doesn't change while its PGs are active...
+    auto pool_cfg_it = st_cli.pool_config.find(pool_id);
+    if (pool_cfg_it == st_cli.pool_config.end())
+    {
+        // Pool config is not loaded yet
+        finish_op(cur_op, -EPIPE);
+        return false;
+    }
+    auto & pool_cfg = pool_cfg_it->second;
+    uint64_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
+    uint64_t pg_block_size = bs_block_size * pg_data_size;
    object_id oid = {
        .inode = cur_op->req.rw.inode,
        // oid.stripe = starting offset of the parity stripe
@@ -30,6 +39,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
    {
        // This OSD is not primary for this PG or the PG is inactive
+        // FIXME: Allow reads from PGs degraded under pg_minsize, but don't allow writes
        finish_op(cur_op, -EPIPE);
        return false;
    }
@@ -47,9 +57,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
    op_data->oid = oid;
    op_data->stripes = ((osd_rmw_stripe_t*)(op_data+1));
    op_data->scheme = pool_cfg.scheme;
+    op_data->pg_data_size = pg_data_size;
    cur_op->op_data = op_data;
-    split_stripes((pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_minsize),
-        bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
+    split_stripes(pg_data_size, bs_block_size, (uint32_t)(cur_op->req.rw.offset - oid.stripe), cur_op->req.rw.len, op_data->stripes);
    pg_it->second.inflight++;
    return true;
 }
@@ -94,7 +104,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
    else if (op_data->st == 2) goto resume_2;
    {
        auto & pg = pgs[{ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num }];
-        for (int role = 0; role < (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize); role++)
+        for (int role = 0; role < op_data->pg_data_size; role++)
        {
            op_data->stripes[role].read_start = op_data->stripes[role].req_start;
            op_data->stripes[role].read_end = op_data->stripes[role].req_end;
@@ -105,24 +115,23 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
-            cur_op->buf = alloc_read_buffer(op_data->stripes,
-                (op_data->scheme == POOL_SCHEME_REPLICATED ? 1 : pg.pg_minsize), 0);
+            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver,
-                (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : pg.pg_minsize), pg.cur_set.data(), cur_op);
+                (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op);
            op_data->st = 1;
        }
        else
        {
            // PG may be degraded or have misplaced objects
            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-            if (extend_missing_stripes(op_data->stripes, cur_set, pg.pg_minsize, pg.pg_size) < 0)
+            if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
            {
                finish_op(cur_op, -EIO);
                return;
            }
            // Submit reads
-            op_data->pg_minsize = pg.pg_minsize;
            op_data->pg_size = pg.pg_size;
+            op_data->scheme = pg.scheme;
            op_data->degraded = 1;
            cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
            submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
@@ -140,14 +149,17 @@ resume_2:
    if (op_data->degraded)
    {
        // Reconstruct missing stripes
-        // FIXME: Always EC(k+1) by now. Add different coding schemes
        osd_rmw_stripe_t *stripes = op_data->stripes;
-        for (int role = 0; role < op_data->pg_minsize; role++)
+        if (op_data->scheme == POOL_SCHEME_XOR)
+        {
+            reconstruct_stripes_xor(stripes, op_data->pg_size);
+        }
+        else if (op_data->scheme == POOL_SCHEME_JERASURE)
+        {
+            reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size);
+        }
+        for (int role = 0; role < op_data->pg_size; role++)
        {
-            if (stripes[role].read_end != 0 && stripes[role].missing)
-            {
-                reconstruct_stripe_xor(stripes, op_data->pg_size, role);
-            }
            if (stripes[role].req_end != 0)
            {
                // Send buffer in parts to avoid copying
@@ -238,7 +250,13 @@ resume_1:
    else
    {
        cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
-            pg.pg_size, pg.pg_minsize, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
+            pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size);
+        if (!cur_op->rmw_buf)
+        {
+            // Refuse partial overwrite of an incomplete object
+            cur_op->reply.hdr.retval = -EINVAL;
+            goto continue_others;
+        }
    }
    // Read required blocks
    submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, pg.pg_size, op_data->prev_set, cur_op);
@@ -272,7 +290,14 @@ resume_3:
    else
    {
        // Recover missing stripes, calculate parity
-        calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        if (pg.scheme == POOL_SCHEME_XOR)
+        {
+            calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        }
+        else if (pg.scheme == POOL_SCHEME_JERASURE)
+        {
+            calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size);
+        }
    }
    // Send writes
    if ((op_data->fact_ver >> (64-PG_EPOCH_BITS)) < pg.epoch)
@@ -312,6 +337,13 @@ resume_5:
        pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->epipe > 0 ? -EPIPE : -EIO);
        return;
    }
+resume_6:
+resume_7:
+    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
+    {
+        // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
+        return;
+    }
    if (op_data->fact_ver == 1)
    {
        // Object is created
@@ -354,17 +386,12 @@ resume_9:
        remove_object_from_state(op_data->oid, op_data->object_state, pg);
        pg.clean_count++;
    }
+    cur_op->reply.hdr.retval = cur_op->req.rw.len;
+continue_others:
    // Remove version override
    pg.ver_override.erase(op_data->oid);
-    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
-resume_6:
-resume_7:
-    if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6))
-    {
-        return;
-    }
    object_id oid = op_data->oid;
-    finish_op(cur_op, cur_op->req.rw.len);
+    finish_op(cur_op, cur_op->reply.hdr.retval);
    // Continue other write operations to the same object
    auto next_it = pg.write_queue.find(oid);
    auto this_it = next_it;
@@ -391,6 +418,7 @@ bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t &
    {
        goto resume_7;
    }
+    // FIXME: Check for immediate_commit == IMMEDIATE_SMALL
    if (immediate_commit == IMMEDIATE_ALL)
    {
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
@@ -461,7 +489,7 @@ resume_7:
        }
        // Remember PG as dirty to drop the connection when PG goes offline
        // (this is required because of the "lazy sync")
-        c_cli.clients[cur_op->peer_fd].dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
+        c_cli.clients[cur_op->peer_fd]->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
        dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    }
    return true;
@@ -651,7 +679,7 @@ finish:
        {
            auto it = c_cli.clients.find(cur_op->peer_fd);
            if (it != c_cli.clients.end())
-                it->second.dirty_pgs.clear();
+                it->second->dirty_pgs.clear();
        }
        finish_op(cur_op, 0);
    }
--- a/osd_primary.h
+++ b/osd_primary.h
@@ -25,7 +25,7 @@ struct osd_primary_op_data_t
    uint64_t fact_ver = 0;
    uint64_t scheme = 0;
    int n_subops = 0, done = 0, errors = 0, epipe = 0;
-    int degraded = 0, pg_size, pg_minsize;
+    int degraded = 0, pg_size, pg_data_size;
    osd_rmw_stripe_t *stripes;
    osd_op_t *subops = NULL;
    uint64_t *prev_set = NULL;
--- a/osd_primary_subops.cpp
+++ b/osd_primary_subops.cpp
@@ -11,7 +11,7 @@ void osd_t::autosync()
    {
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
-        autosync_op->req = {
+        autosync_op->req = (osd_any_op_t){
            .sync = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -510,7 +510,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
        {
            clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
            subops[i].op_type = (uint64_t)cur_op;
-            subops[i].bs_op = new blockstore_op_t({
+            subops[i].bs_op = new blockstore_op_t((blockstore_op_t){
                .opcode = BS_OP_STABLE,
                .callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
                {
@@ -552,24 +552,30 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
 void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
 {
    auto st_it = pg.write_queue.find(oid), it = st_it;
-    finish_op(first_op, retval);
-    if (it != pg.write_queue.end() && it->second == first_op)
-    {
-        it++;
-    }
-    else
+    if (it == pg.write_queue.end() || it->second != first_op)
    {
        // Write queue doesn't match the first operation.
        // first_op is a leftover operation from the previous peering of the same PG.
+        finish_op(first_op, retval);
        return;
    }
-    while (it != pg.write_queue.end() && it->first == oid)
+    std::vector<osd_op_t*> cancel_ops;
+    // Collect only the operations queued for <oid> (first_op included);
+    // writes queued for other objects must stay in the queue untouched
+    while (it != pg.write_queue.end() && it->first == oid)
    {
-        finish_op(it->second, retval);
+        cancel_ops.push_back(it->second);
        it++;
    }
    if (st_it != it)
    {
+        // First erase them and then run finish_op() for the sake of reenterability
+        // Calling finish_op() on a live iterator previously triggered a bug where some
+        // of the OSDs were looping infinitely if you stopped all of them with kill -INT during recovery
        pg.write_queue.erase(st_it, it);
+        for (auto op: cancel_ops)
+        {
+            finish_op(op, retval);
+        }
    }
 }
--- a/osd_rmw.cpp
+++ b/osd_rmw.cpp
@@ -1,8 +1,12 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.0 (see README.md for details)

+#include <stdexcept>
 #include <string.h>
 #include <assert.h>
+#include <jerasure/reed_sol.h>
+#include <jerasure.h>
+#include <map>
 #include "xor.h"
 #include "osd_rmw.h"
 #include "malloc_or_die.h"
@@ -75,44 +79,189 @@ void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start,
    }
 }

-void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role)
+void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size)
 {
+    // For every stripe that was requested for reading (read_end != 0) but is missing,
+    // fill its read_buf with the XOR of all other stripes over its [read_start, read_end) range.
-    int prev = -2;
-    for (int other = 0; other < pg_size; other++)
+    for (int role = 0; role < pg_size; role++)
    {
-        if (other != role)
+        if (stripes[role].read_end != 0 && stripes[role].missing)
        {
-            if (prev == -2)
+            // Reconstruct missing stripe (XOR k+1)
+            // prev == -2: no source stripe seen yet
+            // prev >= 0:  index of the only source stripe seen so far
+            // prev == -1: read_buf already holds the XOR of the sources seen so far
+            int prev = -2;
+            for (int other = 0; other < pg_size; other++)
            {
-                prev = other;
-            }
-            else if (prev >= 0)
-            {
-                assert(stripes[role].read_start >= stripes[prev].read_start &&
-                    stripes[role].read_start >= stripes[other].read_start);
-                memxor(
-                    stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
-                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
-                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
-                );
-                prev = -1;
-            }
-            else
-            {
-                assert(stripes[role].read_start >= stripes[other].read_start);
-                memxor(
-                    stripes[role].read_buf,
-                    stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
-                    stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
-                );
+                if (other != role)
+                {
+                    if (prev == -2)
+                    {
+                        prev = other;
+                    }
+                    else if (prev >= 0)
+                    {
+                        // Second source: XOR the first two sources into the target buffer
+                        assert(stripes[role].read_start >= stripes[prev].read_start &&
+                            stripes[role].read_start >= stripes[other].read_start);
+                        memxor(
+                            stripes[prev].read_buf + (stripes[role].read_start - stripes[prev].read_start),
+                            stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                            stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
+                        );
+                        prev = -1;
+                    }
+                    else
+                    {
+                        // Further sources: accumulate into the target buffer
+                        assert(stripes[role].read_start >= stripes[other].read_start);
+                        memxor(
+                            stripes[role].read_buf,
+                            stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start),
+                            stripes[role].read_buf, stripes[role].read_end - stripes[role].read_start
+                        );
+                    }
+                }
            }
        }
    }
 }

-int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size)
+struct reed_sol_erased_t
 {
-    for (int role = 0; role < minsize; role++)
+    // Erasure pattern used as the key of the decoding matrix cache.
+    // data[i] is nonzero when chunk i can't be used as a decoding source.
+    // The struct does not own <data>.
+    int *data;
+    // Number of entries in <data>
+    int size;
+};
+
+// Lexicographic "less than" over erasure patterns, used as the std::map ordering.
+// It must return true/false: memcmp-style -1/0/1 results would convert to true
+// for both "less" and "greater" and break the strict weak ordering the map relies on.
+inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
+{
+    for (int i = 0; i < a.size && i < b.size; i++)
+    {
+        if (a.data[i] < b.data[i])
+            return true;
+        else if (a.data[i] > b.data[i])
+            return false;
+    }
+    // Equal over the common prefix: the shorter pattern sorts first, equal ones are equivalent
+    return a.size < b.size;
+}
+
+struct reed_sol_matrix_t
+{
+    // Reference count maintained by use_jerasure()
+    int refs = 0;
+    // Coding matrix returned by reed_sol_vandermonde_coding_matrix(), released with free()
+    int *data;
+    // Cached decoding matrices keyed by erasure pattern. Each value is a single malloc()ed
+    // buffer: [pg_minsize source ids | pg_minsize*pg_minsize decoding matrix | pg_size erasure flags];
+    // the key's <data> points at the erasure flags inside that same buffer.
+    std::map<reed_sol_erased_t, int*> decodings;
+};
+
+// Coding matrices in use, keyed by (pg_size | pg_minsize << 32)
+std::map<uint64_t, reed_sol_matrix_t> matrices;
+
+// Acquires (use=true) or releases (use=false) one reference to the Reed-Solomon coding
+// matrix for a PG scheme with <pg_size> chunks, <pg_minsize> of which are data chunks.
+// The matrix is created on the first acquire and destroyed, together with all cached
+// decoding matrices, when the reference count drops to zero.
+// Releasing a scheme that was never acquired is a no-op.
+// Throws std::runtime_error if the coding matrix can't be created.
+void use_jerasure(int pg_size, int pg_minsize, bool use)
+{
+    uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
+    auto rs_it = matrices.find(key);
+    if (rs_it == matrices.end())
+    {
+        if (!use)
+        {
+            return;
+        }
+        int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, 32);
+        if (!matrix)
+        {
+            // Don't cache a NULL matrix: it would be dereferenced later by encoding/decoding
+            throw std::runtime_error("reed_sol_vandermonde_coding_matrix() failed");
+        }
+        matrices[key] = (reed_sol_matrix_t){
+            .refs = 0,
+            .data = matrix,
+        };
+        rs_it = matrices.find(key);
+    }
+    rs_it->second.refs += (!use ? -1 : 1);
+    if (rs_it->second.refs <= 0)
+    {
+        free(rs_it->second.data);
+        // Each cached value owns the buffer that its key points into. Freeing the values is enough:
+        // the keys are never dereferenced again because the whole map is destroyed right below
+        for (auto & dec: rs_it->second.decodings)
+        {
+            free(dec.second);
+        }
+        matrices.erase(rs_it);
+    }
+}
+
+// Looks up the coding matrix registered for the given scheme.
+// Throws std::runtime_error when no matrix is registered for it.
+reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize)
+{
+    const uint64_t matrix_key = (((uint64_t)pg_minsize) << 32) | (uint64_t)pg_size;
+    auto found = matrices.find(matrix_key);
+    if (found != matrices.end())
+    {
+        return &found->second;
+    }
+    throw std::runtime_error("jerasure matrix not initialized");
+}
+
+// jerasure_matrix_decode() decodes all chunks at once and tries to re-encode all missing coding chunks.
+// We don't need that. It also makes an extra allocation of int *erased on every call and doesn't cache
+// the decoding matrix. This function avoids all of that.
+//
+// Returns NULL when no requested data stripe is missing. Otherwise returns a buffer laid out as
+// [pg_minsize source chunk ids | pg_minsize*pg_minsize decoding matrix | pg_size erasure flags].
+// The buffer is cached in the coding matrix entry and must not be freed by the caller.
+// Throws std::runtime_error or std::bad_alloc on failure.
+int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
+{
+    int edd = 0, alive = 0;
+    int erased[pg_size] = { 0 };
+    for (int i = 0; i < pg_size; i++)
+    {
+        // A stripe that wasn't read at all can't be a decoding source, same as a missing one
+        if (stripes[i].read_end == 0 || stripes[i].missing)
+            erased[i] = 1;
+        else
+            alive++;
+    }
+    for (int i = 0; i < pg_minsize; i++)
+        if (stripes[i].read_end != 0 && stripes[i].missing)
+            edd++;
+    if (edd == 0)
+        return NULL;
+    if (alive < pg_minsize)
+    {
+        // jerasure_make_decoding_matrix() walks erased[] until it has collected pg_minsize
+        // surviving chunks, so with fewer survivors it would run past the end of the array
+        throw std::runtime_error("not enough stripes to build a jerasure decoding matrix");
+    }
+    reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
+    auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size });
+    if (dec_it != matrix->decodings.end())
+        return dec_it->second;
+    int *dm_ids = (int*)malloc(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
+    if (!dm_ids)
+        throw std::bad_alloc();
+    // Only derive pointers from dm_ids after it is known to be non-NULL
+    int *decoding_matrix = dm_ids + pg_minsize;
+    // we always use row_k_ones=1 and w=32
+    if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, 32, matrix->data, erased, decoding_matrix, dm_ids) < 0)
+    {
+        free(dm_ids);
+        throw std::runtime_error("jerasure_make_decoding_matrix() failed");
+    }
+    // The cache key must outlive this call, so keep a copy of the pattern inside the cached buffer
+    int *erased_copy = dm_ids + pg_minsize + pg_minsize*pg_minsize;
+    memcpy(erased_copy, erased, pg_size*sizeof(int));
+    matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, dm_ids);
+    return dm_ids;
+}
+
+// Rebuilds every data stripe that was requested for reading but is missing.
+// The result is written into the missing stripe's own read_buf, computed from
+// the stripes that were actually read.
+void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
+{
+    int *source_ids = get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
+    if (source_ids == NULL)
+        return;
+    // The decoding matrix follows the pg_minsize source chunk ids in the same buffer
+    int *decoding_rows = source_ids + pg_minsize;
+    char *chunk_ptrs[pg_size] = { 0 };
+    for (int role = 0; role < pg_minsize; role++)
+    {
+        if (stripes[role].read_end == 0 || !stripes[role].missing)
+            continue;
+        const uint32_t rebuild_len = stripes[role].read_end - stripes[role].read_start;
+        for (int other = 0; other < pg_size; other++)
+        {
+            if (stripes[other].read_end == 0 || stripes[other].missing)
+                continue;
+            assert(stripes[other].read_start <= stripes[role].read_start);
+            assert(stripes[other].read_end >= stripes[role].read_end);
+            // Point each source at the offset where the rebuilt range begins
+            chunk_ptrs[other] = (char*)(stripes[other].read_buf + (stripes[role].read_start - stripes[other].read_start));
+        }
+        chunk_ptrs[role] = (char*)stripes[role].read_buf;
+        jerasure_matrix_dotprod(
+            pg_minsize, 32, decoding_rows+(role*pg_minsize), source_ids, role,
+            chunk_ptrs, chunk_ptrs+pg_minsize, rebuild_len
+        );
+    }
+}
+
+int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size)
+{
+    for (int role = 0; role < pg_minsize; role++)
    {
        if (stripes[role].read_end != 0 && osd_set[role] == 0)
        {
@@ -121,21 +270,21 @@ int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int mi
            // We need at least pg_minsize stripes to recover the lost part.
            // FIXME: LRC EC and similar don't require to read all other stripes.
            int exist = 0;
-            for (int j = 0; j < size; j++)
+            for (int j = 0; j < pg_size; j++)
            {
                if (osd_set[j] != 0)
                {
                    extend_read(stripes[role].read_start, stripes[role].read_end, stripes[j]);
                    exist++;
-                    if (exist >= minsize)
+                    if (exist >= pg_minsize)
                    {
                        break;
                    }
                }
            }
-            if (exist < minsize)
+            if (exist < pg_minsize)
            {
-                // Less than minsize stripes are available for this object
+                // Less than pg_minsize stripes are available for this object
                return -1;
            }
        }
@@ -193,6 +342,16 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
        if (write_osd_set[role] != 0)
        {
            write_parity = 1;
+            if (write_osd_set[role] != read_osd_set[role])
+            {
+                start = 0;
+                end = chunk_size;
+                for (int r2 = pg_minsize; r2 < role; r2++)
+                {
+                    stripes[r2].write_start = start;
+                    stripes[r2].write_end = end;
+                }
+            }
            stripes[role].write_start = start;
            stripes[role].write_end = end;
        }
@@ -210,7 +369,7 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
        // Object is degraded/misplaced and will be moved to <write_osd_set>
        for (int role = 0; role < pg_size; role++)
        {
-            if (write_osd_set[role] != read_osd_set[role] && write_osd_set[role] != 0)
+            if (role < pg_minsize && write_osd_set[role] != read_osd_set[role] && write_osd_set[role] != 0)
            {
                // We need to get data for any moved / recovered chunk
                // And we need a continuous write buffer so we'll only optimize
@@ -251,8 +410,8 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
                    }
                    if (found < pg_minsize)
                    {
-                        // FIXME Object is incomplete - refuse partial overwrite
-                        assert(0);
+                        // Object is incomplete - refuse partial overwrite
+                        return NULL;
                    }
                }
            }
@@ -359,19 +518,9 @@ static void xor_multiple_buffers(buf_len_t *xor1, int n1, buf_len_t *xor2, int n
    }
 }

-void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
+static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t &start, uint32_t &end)
 {
-    int pg_minsize = pg_size-1;
-    for (int role = 0; role < pg_size; role++)
-    {
-        if (stripes[role].read_end != 0 && stripes[role].missing)
-        {
-            // Reconstruct missing stripe (XOR k+1)
-            reconstruct_stripe_xor(stripes, pg_size, role);
-            break;
-        }
-    }
-    uint32_t start = 0, end = 0;
    if (write_osd_set[pg_minsize] != 0 || write_osd_set != read_osd_set)
    {
        // Required for the next two if()s
@@ -383,6 +532,14 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
                end = std::max(stripes[role].req_end, end);
            }
        }
+        for (int role = pg_minsize; role < pg_size; role++)
+        {
+            if (write_osd_set[role] != 0 && write_osd_set[role] != read_osd_set[role])
+            {
+                start = 0;
+                end = chunk_size;
+            }
+        }
    }
    if (write_osd_set != read_osd_set)
    {
@@ -403,6 +560,53 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
            }
        }
    }
+}
+
+// Final step of parity calculation. When a coding chunk is written to a different OSD
+// than it was read from, but only [start, end) of it was recalculated, the new range is
+// merged into the chunk's read buffer and the whole chunk [0, chunk_size) is written.
+// With RMW_DEBUG defined, also prints the resulting state of every stripe.
+static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end)
+{
+    const bool partial_range = (start != 0 || end != chunk_size);
+    if (write_osd_set != read_osd_set && partial_range)
+    {
+        for (int role = pg_minsize; role < pg_size; role++)
+        {
+            if (write_osd_set[role] == read_osd_set[role])
+                continue;
+            osd_rmw_stripe_t & parity = stripes[role];
+            // Copy new parity into the read buffer to write it back
+            memcpy(parity.read_buf + start, parity.write_buf, end - start);
+            parity.write_buf = parity.read_buf;
+            parity.write_start = 0;
+            parity.write_end = chunk_size;
+        }
+    }
+#ifdef RMW_DEBUG
+    printf("calc_rmw_parity:\n");
+    for (int role = 0; role < pg_size; role++)
+    {
+        osd_rmw_stripe_t & s = stripes[role];
+        printf(
+            "Tr=%lu Tw=%lu Q=%x-%x R=%x-%x W=%x-%x Rb=%lx Wb=%lx\n",
+            read_osd_set[role], write_osd_set[role],
+            s.req_start, s.req_end,
+            s.read_start, s.read_end,
+            s.write_start, s.write_end,
+            (uint64_t)s.read_buf,
+            (uint64_t)s.write_buf
+        );
+    }
+#endif
+}
+
+void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
+{
+    int pg_minsize = pg_size-1;
+    reconstruct_stripes_xor(stripes, pg_size);
+    uint32_t start = 0, end = 0;
+    calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
    if (write_osd_set[pg_minsize] != 0 && end != 0)
    {
        // Calculate new parity (XOR k+1)
@@ -431,22 +635,71 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
            }
        }
    }
-    if (write_osd_set != read_osd_set)
+    calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
+}
+
+void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size)
+{
+    // Reed-Solomon counterpart of calc_rmw_parity_xor():
+    // 1) reconstruct missing data stripes that were read,
+    // 2) let calc_rmw_parity_copy_mod() determine the modified range [start, end),
+    // 3) re-encode the coding chunks over that range with jerasure_matrix_encode(),
+    // 4) let calc_rmw_parity_copy_parity() finalize the write buffers.
+    // Throws (from get_jerasure_matrix()) if the scheme wasn't registered with use_jerasure().
+    reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
+    reconstruct_stripes_jerasure(stripes, pg_size, pg_minsize);
+    uint32_t start = 0, end = 0;
+    calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
+    if (end != 0)
    {
-        for (int role = pg_minsize; role < pg_size; role++)
+        // Find the first coding chunk that has a target OSD; encoding is skipped when there is none
+        int i;
+        for (i = pg_minsize; i < pg_size; i++)
        {
-            if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size))
+            if (write_osd_set[i] != 0)
+                break;
+        }
+        if (i < pg_size)
+        {
+            // Calculate new coding chunks
+            // bufs[i]: up to 3 consecutive buffers that together hold chunk i over [start, end)
+            // curbuf[i]: index of the buffer of chunk i that contains the current position
+            // positions[i]: chunk offset at which bufs[i][curbuf[i]] begins
+            buf_len_t bufs[pg_size][3];
+            int nbuf[pg_size] = { 0 }, curbuf[pg_size] = { 0 };
+            uint32_t positions[pg_size];
+            void *data_ptrs[pg_size] = { 0 };
+            for (int i = 0; i < pg_minsize; i++)
            {
-                // Copy new parity into the read buffer to write it back
-                memcpy(
-                    stripes[role].read_buf + start,
-                    stripes[role].write_buf,
-                    end - start
+                get_old_new_buffers(stripes[i], start, end, bufs[i], nbuf[i]);
+                positions[i] = start;
+            }
+            // Each coding chunk is a single output buffer covering the whole range.
+            // NOTE(review): the assert below requires write_buf to be non-NULL for every coding chunk,
+            // including chunks whose write_osd_set entry is 0 - confirm that calc_rmw() guarantees it
+            for (int i = pg_minsize; i < pg_size; i++)
+            {
+                bufs[i][nbuf[i]++] = { .buf = stripes[i].write_buf, .len = end-start };
+                positions[i] = start;
+            }
+            uint32_t pos = start;
+            while (pos < end)
+            {
+                // Encode the longest piece starting at <pos> that stays inside
+                // the current buffer of every chunk
+                uint32_t next_end = end;
+                for (int i = 0; i < pg_size; i++)
+                {
+                    assert(curbuf[i] < nbuf[i]);
+                    assert(bufs[i][curbuf[i]].buf);
+                    data_ptrs[i] = bufs[i][curbuf[i]].buf + pos-positions[i];
+                    uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
+                    if (next_end > this_end)
+                        next_end = this_end;
+                }
+                assert(next_end > pos);
+                // Switch to the next buffer for every chunk whose current buffer ends at <next_end>
+                for (int i = 0; i < pg_size; i++)
+                {
+                    uint32_t this_end = bufs[i][curbuf[i]].len + positions[i];
+                    if (next_end >= this_end)
+                    {
+                        positions[i] += bufs[i][curbuf[i]].len;
+                        curbuf[i]++;
+                    }
+                }
+                // data_ptrs was filled before the buffers were advanced, so it still refers to [pos, next_end)
+                jerasure_matrix_encode(
+                    pg_minsize, pg_size-pg_minsize, 32, matrix->data,
+                    (char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
                );
-                stripes[role].write_buf = stripes[role].read_buf;
-                stripes[role].write_start = 0;
-                stripes[role].write_end = chunk_size;
+                pos = next_end;
            }
        }
    }
+    calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
 }
--- a/osd_rmw.h
+++ b/osd_rmw.h
@@ -26,11 +26,13 @@ struct osd_rmw_stripe_t
    bool missing;
 };

+// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
+
 void split_stripes(uint64_t pg_minsize, uint32_t bs_block_size, uint32_t start, uint32_t len, osd_rmw_stripe_t *stripes);

-void reconstruct_stripe_xor(osd_rmw_stripe_t *stripes, int pg_size, int role);
+void reconstruct_stripes_xor(osd_rmw_stripe_t *stripes, int pg_size);

-int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int minsize, int size);
+int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size);

 void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t add_size);

@@ -38,3 +40,10 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
    uint64_t pg_size, uint64_t pg_minsize, uint64_t pg_cursize, uint64_t *write_osd_set, uint64_t chunk_size);

 void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
+
+void use_jerasure(int pg_size, int pg_minsize, bool use);
+
+void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize);
+
+void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
+    uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size);
--- a/osd_rmw_test.cpp
+++ b/osd_rmw_test.cpp
@@ -1,6 +1,8 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.0 (see README.md for details)

+#define RMW_DEBUG
+
 #include <string.h>
 #include "osd_rmw.cpp"
 #include "test_pattern.h"
@@ -13,85 +15,11 @@ void test6();
 void test7();
 void test8();
 void test9();
-
-/***
-
-Cases:
-
-1. split(offset=128K-4K, len=8K)
-   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
-
-2. read(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
-
-3. cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
-   = { read: [ 0, 128K-4K ] }
-
-4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
-   = {
-     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   + check write2 buffer
-
-5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-
-6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
-   = {
-     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
-     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-
-7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity_xor(): {
-     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write1==read1,
-   }
-   + check write1 buffer
-   + check write2 buffer
-
-8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
-   = {
-     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
-     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
-     input buffer: [ write0, write1 ],
-     rmw buffer: [ write2, read1 ],
-   }
-   + check write2 buffer
-
-9. object recovery case:
-   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
-   = {
-     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
-     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
-     input buffer: NULL,
-     rmw buffer: [ read0, read1, read2 ],
-   }
-   then, after calc_rmw_parity_xor(): {
-     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
-     write0==read0,
-   }
-   + check write0 buffer
-
-***/
+void test10();
+void test11();
+void test12();
+void test13();
+void test14();

 int main(int narg, char *args[])
 {
@@ -109,6 +37,16 @@ int main(int narg, char *args[])
    test8();
    // Test 9
    test9();
+    // Test 10
+    test10();
+    // Test 11
+    test11();
+    // Test 12
+    test12();
+    // Test 13
+    test13();
+    // Test 14
+    test14();
    // End
    printf("all ok\n");
    return 0;
@@ -136,6 +74,19 @@ void dump_stripes(osd_rmw_stripe_t *stripes, int pg_size)
    printf("\n");
 }

+/***
+
+1. split(offset=128K-4K, len=8K)
+   = [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 0 ] ]
+
+   read(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = { read: [ [ 0, 128K ], [ 0, 4K ], [ 0, 4K ] ] }
+
+   cover_read(0, 128K, { req: [ 128K-4K, 4K ] })
+   = { read: [ 0, 128K-4K ] }
+
+***/
+
 void test1()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -155,11 +106,24 @@ void test1()
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 4096);
    // Test 1.3
-    stripes[0] = { .req_start = 128*1024-4096, .req_end = 128*1024 };
+    stripes[0] = (osd_rmw_stripe_t){ .req_start = 128*1024-4096, .req_end = 128*1024 };
    cover_read(0, 128*1024, stripes[0]);
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
 }

+/***
+
+4. write(offset=128K-4K, len=8K, osd_set=[1,0,3])
+   = {
+     read: [ [ 0, 128K ], [ 4K, 128K ], [ 4K, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   + check write2 buffer
+
+***/
+
 void test4()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -193,6 +157,19 @@ void test4()
    free(write_buf);
 }

+/***
+
+5. write(offset=0, len=128K+64K, osd_set=[1,0,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 64K, 128K ], [ 64K, 128K ], [ 64K, 128K ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+
+***/
+
 void test5()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -221,6 +198,19 @@ void test5()
    free(write_buf);
 }

+/***
+
+6. write(offset=0, len=128K+64K, osd_set=[1,2,3])
+   = {
+     req: [ [ 0, 128K ], [ 0, 64K ], [ 0, 0 ] ],
+     read: [ [ 0, 0 ], [ 64K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 64K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+
+***/
+
 void test6()
 {
    osd_num_t osd_set[3] = { 1, 2, 3 };
@@ -245,6 +235,24 @@ void test6()
    free(write_buf);
 }

+/***
+
+7. calc_rmw(offset=128K-4K, len=8K, osd_set=[1,0,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity_xor(): {
+     write: [ [ 128K-4K, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write1==read1,
+   }
+   + check write1 buffer
+   + check write2 buffer
+
+***/
+
 void test7()
 {
    osd_num_t osd_set[3] = { 1, 0, 3 };
@@ -285,6 +293,19 @@ void test7()
    free(write_buf);
 }

+/***
+
+8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read1 ],
+   }
+   + check write2 buffer
+
+***/
+
 void test8()
 {
    osd_num_t osd_set[3] = { 0, 2, 3 };
@@ -322,6 +343,24 @@ void test8()
    free(write_buf);
 }

+/***
+
+9. object recovery case:
+   calc_rmw(offset=0, len=0, read_osd_set=[0,2,3], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     input buffer: NULL,
+     rmw buffer: [ read0, read1, read2 ],
+   }
+   then, after calc_rmw_parity_xor(): {
+     write: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write0==read0,
+   }
+   + check write0 buffer
+
+***/
+
 void test9()
 {
    osd_num_t osd_set[3] = { 0, 2, 3 };
@@ -361,3 +400,383 @@ void test9()
    check_pattern(stripes[0].write_buf, 128*1024, PATTERN1);
    free(rmw_buf);
 }
+
+/***
+
+10. full overwrite/recovery case:
+   calc_rmw(offset=0, len=256K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 0 ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer (expected to be PATTERN1^PATTERN2, i.e. write0 XOR write1)
+
+***/
+
+void test10()
+{
+    osd_num_t osd_set[3] = { 1, 0, 0 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 10.0 - split_stripes(): the 256K request covers both data stripes completely
+    split_stripes(2, 128*1024, 0, 256*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    // Test 10.1 - calc_rmw(): nothing is read, data is written from write_buf, parity from rmw_buf
+    void *write_buf = malloc(256*1024);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 0);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == NULL);
+    assert(stripes[1].read_buf == NULL);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+128*1024);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 10.2 - calc_rmw_parity_xor(): ranges and buffers must not change, parity must be filled
+    set_pattern(stripes[0].write_buf, 128*1024, PATTERN1);
+    set_pattern(stripes[1].write_buf, 128*1024, PATTERN2);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+128*1024);
+    assert(stripes[2].write_buf == rmw_buf);
+    check_pattern(stripes[2].write_buf, 128*1024, PATTERN1^PATTERN2);
+    free(rmw_buf);
+    free(write_buf);
+}
+
+/***
+
+11. partial recovery case:
+   calc_rmw(offset=128K, len=128K, read_osd_set=[1,0,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 0, 0 ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write1 ],
+     rmw buffer: [ write2, read0 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer (expected to be PATTERN1^PATTERN2, i.e. read0 XOR write1)
+
+***/
+
+void test11()
+{
+    osd_num_t osd_set[3] = { 1, 0, 0 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 11.0 - split_stripes(): offset=128K, len=128K touches only the second data stripe
+    split_stripes(2, 128*1024, 128*1024, 128*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    // Test 11.1 - calc_rmw(): stripe 0 is read into rmw_buf after the parity area, stripe 1 comes from write_buf
+    void *write_buf = malloc(256*1024);
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+128*1024);
+    assert(stripes[1].read_buf == NULL);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == write_buf);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 11.2 - calc_rmw_parity_xor(): ranges and buffers must not change, parity must be filled
+    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    set_pattern(stripes[1].write_buf, 128*1024, PATTERN2);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 128*1024);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == write_buf);
+    assert(stripes[2].write_buf == rmw_buf);
+    check_pattern(stripes[2].write_buf, 128*1024, PATTERN1^PATTERN2);
+    free(rmw_buf);
+    free(write_buf);
+}
+
+/***
+
+12. parity recovery case:
+   calc_rmw(offset=0, len=0, read_osd_set=[1,2,0], write_osd_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 0 ] ],
+     write: [ [ 0, 0 ], [ 0, 0 ], [ 0, 128K ] ],
+     input buffer: [],
+     rmw buffer: [ write2, read0, read1 ],
+   }
+   then, after calc_rmw_parity_xor(): all the same
+   + check write2 buffer (expected to be PATTERN1^PATTERN2, i.e. read0 XOR read1)
+
+***/
+
+void test12()
+{
+    osd_num_t osd_set[3] = { 1, 2, 0 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 12.0 - split_stripes(): a zero-length request leaves all request ranges empty
+    split_stripes(2, 128*1024, 0, 0, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    // Test 12.1 - calc_rmw(): no input buffer; both data stripes are read, only parity is written
+    void *rmw_buf = calc_rmw(NULL, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+128*1024);
+    assert(stripes[1].read_buf == rmw_buf+2*128*1024);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 12.2 - calc_rmw_parity_xor(): ranges and buffers must not change, parity must be filled
+    set_pattern(stripes[0].read_buf, 128*1024, PATTERN1);
+    set_pattern(stripes[1].read_buf, 128*1024, PATTERN2);
+    calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 0 && stripes[0].write_end == 0);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].write_buf == NULL);
+    assert(stripes[1].write_buf == NULL);
+    assert(stripes[2].write_buf == rmw_buf);
+    check_pattern(stripes[2].write_buf, 128*1024, PATTERN1^PATTERN2);
+    free(rmw_buf);
+}
+
+/***
+
+13. basic jerasure 2+2 test
+   calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0,0], write_set=[1,2,3,4])
+   = {
+     read: [ [ 0, 128K-4K ], [ 4K, 128K ], [ 0, 0 ], [ 0, 0 ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, write3, read0, read1 ],
+   }
+   then, after calc_rmw_parity_jerasure(): all the same
+   then simulate read with read_osd_set=[0,0,3,4] and check read0,read1 buffers
+
+***/
+
+void test13()
+{
+    use_jerasure(4, 2, true);
+    osd_num_t osd_set[4] = { 1, 2, 0, 0 };
+    osd_num_t write_osd_set[4] = { 1, 2, 3, 4 };
+    osd_rmw_stripe_t stripes[4] = { 0 };
+    // Test 13.0 - split_stripes(): the 8K request straddles the boundary between the two data stripes
+    void *write_buf = malloc_or_die(8192);
+    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
+    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    // Test 13.1 - calc_rmw(): the unmodified parts of both data stripes are read, both parity stripes are written
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 4, write_osd_set, 128*1024);
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
+    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+2*128*1024);
+    assert(stripes[1].read_buf == rmw_buf+3*128*1024-4096);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[3].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    assert(stripes[3].write_buf == rmw_buf+128*1024);
+    // Test 13.2 - encode
+    set_pattern(write_buf, 8192, PATTERN3);
+    set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
+    calc_rmw_parity_jerasure(stripes, 4, 2, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[3].write_start == 0 && stripes[3].write_end == 128*1024);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    assert(stripes[3].write_buf == rmw_buf+128*1024);
+    // Test 13.3 - full decode and verify
+    osd_num_t read_osd_set[4] = { 0, 0, 3, 4 };
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 256*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    for (int role = 0; role < 4; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
+    void *read_buf = alloc_read_buffer(stripes, 4, 0);
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == read_buf+128*1024);
+    assert(stripes[2].read_buf == read_buf+2*128*1024);
+    assert(stripes[3].read_buf == read_buf+3*128*1024);
+    memcpy(read_buf+2*128*1024, rmw_buf, 128*1024);
+    memcpy(read_buf+3*128*1024, rmw_buf+128*1024, 128*1024);
+    reconstruct_stripes_jerasure(stripes, 4, 2);
+    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
+    check_pattern(stripes[1].read_buf, 4096, PATTERN3);
+    check_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
+    free(read_buf);
+    // Test 13.4 - partial decode (only 1st chunk) and verify
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 128*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
+    for (int role = 0; role < 4; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, read_osd_set, 2, 4) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 0);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    assert(stripes[3].read_start == 0 && stripes[3].read_end == 128*1024);
+    read_buf = alloc_read_buffer(stripes, 4, 0);
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == NULL);
+    assert(stripes[2].read_buf == read_buf+128*1024);
+    assert(stripes[3].read_buf == read_buf+2*128*1024);
+    memcpy(read_buf+128*1024, rmw_buf, 128*1024);
+    memcpy(read_buf+2*128*1024, rmw_buf+128*1024, 128*1024);
+    reconstruct_stripes_jerasure(stripes, 4, 2);
+    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
+    free(read_buf);
+    // Done - release the buffers shared by all sub-tests
+    free(rmw_buf);
+    free(write_buf);
+    use_jerasure(4, 2, false);
+}
+
+/***
+
+14. basic jerasure 2+1 test
+   calc_rmw(offset=128K-4K, len=8K, osd_set=[1,2,0], write_set=[1,2,3])
+   = {
+     read: [ [ 0, 128K-4K ], [ 4K, 128K ], [ 0, 0 ] ],
+     write: [ [ 128K-4K, 128K ], [ 0, 4K ], [ 0, 128K ] ],
+     input buffer: [ write0, write1 ],
+     rmw buffer: [ write2, read0, read1 ],
+   }
+   then, after calc_rmw_parity_jerasure(): all the same
+   then simulate read with read_osd_set=[0,2,3] and check read0 buffer
+
+***/
+
+void test14()
+{
+    use_jerasure(3, 2, true);
+    osd_num_t osd_set[3] = { 1, 2, 0 };
+    osd_num_t write_osd_set[3] = { 1, 2, 3 };
+    osd_rmw_stripe_t stripes[3] = { 0 };
+    // Test 14.0 - split_stripes(): the 8K request straddles the boundary between the two data stripes
+    void *write_buf = malloc_or_die(8192);
+    split_stripes(2, 128*1024, 128*1024-4096, 8192, stripes);
+    assert(stripes[0].req_start == 128*1024-4096 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 4096);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    // Test 14.1 - calc_rmw(): the unmodified parts of both data stripes are read, the parity stripe is written
+    void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024);
+    assert(rmw_buf);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024-4096);
+    assert(stripes[1].read_start == 4096 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 0);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].read_buf == rmw_buf+128*1024);
+    assert(stripes[1].read_buf == rmw_buf+2*128*1024-4096);
+    assert(stripes[2].read_buf == NULL);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 14.2 - encode
+    set_pattern(write_buf, 8192, PATTERN3);
+    set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
+    calc_rmw_parity_jerasure(stripes, 3, 2, osd_set, write_osd_set, 128*1024);
+    assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
+    assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
+    assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
+    assert(stripes[0].write_buf == write_buf);
+    assert(stripes[1].write_buf == write_buf+4096);
+    assert(stripes[2].write_buf == rmw_buf);
+    // Test 14.3 - decode and verify
+    osd_num_t read_osd_set[3] = { 0, 2, 3 };
+    memset(stripes, 0, sizeof(stripes));
+    split_stripes(2, 128*1024, 0, 128*1024, stripes);
+    assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024);
+    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
+    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
+    for (int role = 0; role < 3; role++)
+    {
+        stripes[role].read_start = stripes[role].req_start;
+        stripes[role].read_end = stripes[role].req_end;
+    }
+    assert(extend_missing_stripes(stripes, read_osd_set, 2, 3) == 0);
+    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
+    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
+    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
+    void *read_buf = alloc_read_buffer(stripes, 3, 0);
+    assert(read_buf);
+    assert(stripes[0].read_buf == read_buf);
+    assert(stripes[1].read_buf == read_buf+128*1024);
+    assert(stripes[2].read_buf == read_buf+2*128*1024);
+    set_pattern(stripes[1].read_buf, 4096, PATTERN3);
+    set_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
+    memcpy(stripes[2].read_buf, rmw_buf, 128*1024);
+    reconstruct_stripes_jerasure(stripes, 3, 2);
+    check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
+    check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
+    free(read_buf);
+    // Done - release the buffers shared by all sub-tests
+    free(rmw_buf);
+    free(write_buf);
+    use_jerasure(3, 2, false);
+}
--- a/qemu-3.1-vitastor.patch
+++ b/qemu-3.1-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu-3.1+dfsg/qapi/block-core.json
+===================================================================
+--- qemu-3.1+dfsg.orig/qapi/block-core.json
+++ qemu-3.1+dfsg/qapi/block-core.json
+@@ -2617,7 +2617,7 @@
+ ##
+ { 'enum': 'BlockdevDriver',
+   'data': [ 'blkdebug', 'blklogwrites', 'blkverify', 'bochs', 'cloop',
+-            'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster',
+            'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'vitastor',
+             'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks',
+             'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
+             'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'replication', 'sheepdog',
+@@ -3367,6 +3367,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -3713,6 +3731,7 @@
+       'rbd':        'BlockdevOptionsRbd',
+       'replication':'BlockdevOptionsReplication',
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4158,6 +4177,17 @@
+             '*block-state-zero':    'bool' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVpcSubformat:
+ #
+ # @dynamic: Growing image file
+@@ -4212,6 +4242,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu-3.1+dfsg/scripts/modules/module_block.py
+===================================================================
+--- qemu-3.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-3.1+dfsg/scripts/modules/module_block.py
+@@ -88,6 +88,7 @@ def print_bottom(fheader):
+ output_file = sys.argv[1]
+ with open(output_file, 'w') as fheader:
+     print_top(fheader)
+    add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+     for filename in sys.argv[2:]:
+         if os.path.isfile(filename):
--- a/qemu-4.2-vitastor.patch
+++ b/qemu-4.2-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu/qapi/block-core.json
+===================================================================
+--- qemu.orig/qapi/block-core.json	2020-11-07 22:57:38.932613674 +0000
+++ qemu/qapi/block-core.json	2020-11-07 22:59:49.890722862 +0000
+@@ -2907,7 +2907,7 @@
+             'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
+             'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
+-            'sheepdog',
+            'sheepdog', 'vitastor',
+             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+ 
+ ##
+@@ -3725,6 +3725,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -4084,6 +4102,7 @@
+       'replication': { 'type': 'BlockdevOptionsReplication',
+                        'if': 'defined(CONFIG_REPLICATION)' },
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4461,6 +4480,17 @@
+             '*cluster-size' :   'size' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -4722,6 +4752,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu/scripts/modules/module_block.py
+===================================================================
+--- qemu.orig/scripts/modules/module_block.py	2020-11-07 22:57:38.936613739 +0000
+++ qemu/scripts/modules/module_block.py	2020-11-07 22:59:49.890722862 +0000
+@@ -86,6 +86,7 @@ def print_bottom(fheader):
+ output_file = sys.argv[1]
+ with open(output_file, 'w') as fheader:
+     print_top(fheader)
+    add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+     for filename in sys.argv[2:]:
+         if os.path.isfile(filename):
--- a/qemu-5.0-vitastor.patch
+++ b/qemu-5.0-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu/qapi/block-core.json
+===================================================================
+--- qemu.orig/qapi/block-core.json
+++ qemu/qapi/block-core.json
+@@ -2798,7 +2798,7 @@
+             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
+             'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
+-            'sheepdog',
+            'sheepdog', 'vitastor',
+             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+ 
+ ##
+@@ -3635,6 +3635,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -3995,6 +4013,7 @@
+       'replication': { 'type': 'BlockdevOptionsReplication',
+                        'if': 'defined(CONFIG_REPLICATION)' },
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4365,6 +4384,17 @@
+             '*cluster-size' :   'size' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -4626,6 +4656,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu/scripts/modules/module_block.py
+===================================================================
+--- qemu.orig/scripts/modules/module_block.py
+++ qemu/scripts/modules/module_block.py
+@@ -85,6 +85,7 @@ def print_bottom(fheader):
+ output_file = sys.argv[1]
+ with open(output_file, 'w') as fheader:
+     print_top(fheader)
+    add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+     for filename in sys.argv[2:]:
+         if os.path.isfile(filename):
--- a/qemu-5.1-vitastor.patch
+++ b/qemu-5.1-vitastor.patch
@@ -0,0 +1,84 @@
+Index: qemu-5.1+dfsg/qapi/block-core.json
+===================================================================
+--- qemu-5.1+dfsg.orig/qapi/block-core.json
+++ qemu-5.1+dfsg/qapi/block-core.json
+@@ -2807,7 +2807,7 @@
+             'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
+             'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+             { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
+-            'sheepdog',
+            'sheepdog', 'vitastor',
+             'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 
+ ##
+@@ -3644,6 +3644,24 @@
+             '*tag': 'str' } }
+ 
+ ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @inode:       Inode number
+# @pool:        Pool ID
+# @size:        Desired image size in bytes
+# @etcd_host:   etcd connection address
+# @etcd_prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+  'data': { 'inode': 'uint64',
+            'pool': 'uint64',
+            'size': 'uint64',
+            'etcd_host': 'str',
+            '*etcd_prefix': 'str' } }
+
+##
+ # @ReplicationMode:
+ #
+ # An enumeration of replication modes.
+@@ -3988,6 +4006,7 @@
+       'replication': { 'type': 'BlockdevOptionsReplication',
+                        'if': 'defined(CONFIG_REPLICATION)' },
+       'sheepdog':   'BlockdevOptionsSheepdog',
+      'vitastor':   'BlockdevOptionsVitastor',
+       'ssh':        'BlockdevOptionsSsh',
+       'throttle':   'BlockdevOptionsThrottle',
+       'vdi':        'BlockdevOptionsGenericFormat',
+@@ -4376,6 +4395,17 @@
+             '*cluster-size' :   'size' } }
+ 
+ ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+  'data': { 'location':         'BlockdevOptionsVitastor',
+            'size':             'size' } }
+
+##
+ # @BlockdevVmdkSubformat:
+ #
+ # Subformat options for VMDK images
+@@ -4637,6 +4667,7 @@
+       'qed':            'BlockdevCreateOptionsQed',
+       'rbd':            'BlockdevCreateOptionsRbd',
+       'sheepdog':       'BlockdevCreateOptionsSheepdog',
+      'vitastor':       'BlockdevCreateOptionsVitastor',
+       'ssh':            'BlockdevCreateOptionsSsh',
+       'vdi':            'BlockdevCreateOptionsVdi',
+       'vhdx':           'BlockdevCreateOptionsVhdx',
+Index: qemu-5.1+dfsg/scripts/modules/module_block.py
+===================================================================
+--- qemu-5.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-5.1+dfsg/scripts/modules/module_block.py
+@@ -86,6 +86,7 @@ if __name__ == '__main__':
+     output_file = sys.argv[1]
+     with open(output_file, 'w') as fheader:
+         print_top(fheader)
+        add_module(fheader, "vitastor", "vitastor", "vitastor")
+ 
+         for filename in sys.argv[2:]:
+             if os.path.isfile(filename):
--- a/qemu_driver.c
+++ b/qemu_driver.c
@@ -3,11 +3,10 @@

 // QEMU block driver

+#define BUILD_DSO
 #define _GNU_SOURCE
 #include "qemu/osdep.h"
-#include "qemu/units.h"
 #include "block/block_int.h"
-#include "block/qdict.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qerror.h"
@@ -15,10 +14,28 @@
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 #include "qemu/option.h"
+
+#if QEMU_VERSION_MAJOR >= 3
+#include "qemu/units.h"
+#include "block/qdict.h"
 #include "qemu/cutils.h"
+#else
+#include "qapi/qmp/qint.h"
+#define qdict_put_int(options, name, num_val) qdict_put_obj(options, name, QOBJECT(qint_from_int(num_val)))
+#define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value)))
+#define qobject_unref QDECREF
+#endif

 #include "qemu_proxy.h"

+void qemu_module_dummy(void)
+{
+}
+
+void DSO_STAMP_FUN(void)
+{
+}
+
 typedef struct VitastorClient
 {
    void *proxy;
@@ -176,12 +193,14 @@ static void vitastor_close(BlockDriverState *bs)
        g_free(client->etcd_prefix);
 }

+#if QEMU_VERSION_MAJOR >= 3
 static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 {
    bsz->phys = 4096;
    bsz->log = 4096;
    return 0;
 }
+#endif

 static int coroutine_fn vitastor_co_create_opts(
 #if QEMU_VERSION_MAJOR >= 4
@@ -208,11 +227,16 @@ out:
    return ret;
 }

+#if QEMU_VERSION_MAJOR >= 3
 static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset,
 #if QEMU_VERSION_MAJOR >= 4
    bool exact,
 #endif
-    PreallocMode prealloc, Error **errp)
+    PreallocMode prealloc,
+#if QEMU_VERSION_MAJOR >= 5 && QEMU_VERSION_MINOR >= 1 || QEMU_VERSION_MAJOR > 5
+    BdrvRequestFlags flags,
+#endif
+    Error **errp)
 {
    VitastorClient *client = bs->opaque;

@@ -227,6 +251,7 @@ static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offse

    return 0;
 }
+#endif

 static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
@@ -240,11 +265,22 @@ static int64_t vitastor_getlength(BlockDriverState *bs)
    return client->size;
 }

+#if QEMU_VERSION_MAJOR >= 3
 static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp)
+#else
+static int vitastor_refresh_limits(BlockDriverState *bs)
+#endif
 {
+#if QEMU_VERSION_MAJOR >= 4
    bs->bl.request_alignment = 4096;
    bs->bl.min_mem_alignment = 4096;
+#else
+    bs->request_alignment = 4096;
+#endif
    bs->bl.opt_mem_alignment = 4096;
+#if QEMU_VERSION_MAJOR < 3
+    return 0;
+#endif
 }

 static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs)
@@ -267,7 +303,12 @@ static void vitastor_co_generic_bh_cb(int retval, void *opaque)
    task->complete = 1;
    if (qemu_coroutine_self() != task->co)
    {
+#if QEMU_VERSION_MAJOR >= 3
        aio_co_wake(task->co);
+#else
+        qemu_coroutine_enter(task->co, NULL);
+        qemu_aio_release(task);
+#endif
    }
 }

@@ -309,6 +350,18 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs, uint64_t offse
    return task.ret;
 }

+#if QEMU_VERSION_MAJOR < 3
+static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
+{
+    return vitastor_co_preadv(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
+}
+
+static int coroutine_fn vitastor_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
+{
+    return vitastor_co_pwritev(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
+}
+#endif
+
 static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
 {
    VitastorClient *client = bs->opaque;
@@ -327,6 +380,7 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
    return task.ret;
 }

+#if QEMU_VERSION_MAJOR >= 3
 static QemuOptsList vitastor_create_opts = {
    .name = "vitastor-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head),
@@ -339,6 +393,16 @@ static QemuOptsList vitastor_create_opts = {
        { /* end of list */ }
    }
 };
+#else
+static QEMUOptionParameter vitastor_create_opts[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+#endif

 static const char *vitastor_strong_runtime_opts[] = {
    "inode",
@@ -357,12 +421,11 @@ static BlockDriver bdrv_vitastor = {
    .bdrv_parse_filename            = vitastor_parse_filename,

    .bdrv_has_zero_init             = bdrv_has_zero_init_1,
-#if QEMU_VERSION_MAJOR >= 4
-    .bdrv_has_zero_init_truncate    = bdrv_has_zero_init_1,
-#endif
    .bdrv_get_info                  = vitastor_get_info,
    .bdrv_getlength                 = vitastor_getlength,
+#if QEMU_VERSION_MAJOR >= 3
    .bdrv_probe_blocksizes          = vitastor_probe_blocksizes,
+#endif
    .bdrv_refresh_limits            = vitastor_refresh_limits,

    // FIXME: Implement it along with per-inode statistics
@@ -372,12 +435,17 @@ static BlockDriver bdrv_vitastor = {
    .bdrv_close                     = vitastor_close,

    // Option list for the create operation
+#if QEMU_VERSION_MAJOR >= 3
    .create_opts                    = &vitastor_create_opts,
+#else
+    .create_options                 = vitastor_create_opts,
+#endif

    // For qmp_blockdev_create(), used by the qemu monitor / QAPI
    // Requires patching QAPI IDL, thus unimplemented
    //.bdrv_co_create                 = vitastor_co_create,

+#if QEMU_VERSION_MAJOR >= 3
    // For bdrv_create(), used by qemu-img
    .bdrv_co_create_opts            = vitastor_co_create_opts,

@@ -385,6 +453,11 @@ static BlockDriver bdrv_vitastor = {

    .bdrv_co_preadv                 = vitastor_co_preadv,
    .bdrv_co_pwritev                = vitastor_co_pwritev,
+#else
+    .bdrv_co_readv                  = vitastor_co_readv,
+    .bdrv_co_writev                 = vitastor_co_writev,
+#endif
+
    .bdrv_co_flush_to_disk          = vitastor_co_flush,

 #if QEMU_VERSION_MAJOR >= 4
--- a/ringloop.cpp
+++ b/ringloop.cpp
@@ -1,6 +1,10 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)

+#include <stdlib.h>
+
+#include <stdexcept>
+
 #include "ringloop.h"

 ring_loop_t::ring_loop_t(int qd)
@@ -11,7 +15,7 @@ ring_loop_t::ring_loop_t(int qd)
        throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
    }
    free_ring_data_ptr = *ring.cq.kring_entries;
-    ring_datas = (struct ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr);
+    ring_datas = (struct ring_data_t*)calloc(free_ring_data_ptr, sizeof(ring_data_t));
    free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
    if (!ring_datas || !free_ring_data)
    {
@@ -62,10 +66,18 @@ void ring_loop_t::loop()
        struct ring_data_t *d = (struct ring_data_t*)cqe->user_data;
        if (d->callback)
        {
-            d->res = cqe->res;
-            d->callback(d);
+            // First free ring_data item, then call the callback
+            // so it has at least 1 free slot for the next event
+            // which is required for EPOLLET to function properly
+            struct ring_data_t dl;
+            dl.iov = d->iov;
+            dl.res = cqe->res;
+            dl.callback.swap(d->callback);
+            free_ring_data[free_ring_data_ptr++] = d - ring_datas;
+            dl.callback(&dl);
        }
-        free_ring_data[free_ring_data_ptr++] = d - ring_datas;
+        else
+            free_ring_data[free_ring_data_ptr++] = d - ring_datas;
        io_uring_cqe_seen(&ring, cqe);
    }
    while (get_sqe_queue.size() > 0)
--- a/ringloop.h
+++ b/ringloop.h
@@ -11,6 +11,7 @@
 #include <assert.h>
 #include <liburing.h>

+#include <string>
 #include <functional>
 #include <vector>

--- a/rm_inode.cpp
+++ b/rm_inode.cpp
@@ -0,0 +1,326 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+/**
+ * Inode removal tool
+ * May be included into a bigger "command-line management interface" in the future
+ */
+
+#include <algorithm>
+
+#include "epoll_manager.h"
+#include "cluster_client.h"
+#include "pg_states.h"
+
+#define RM_NO_LIST 1
+#define RM_LIST_SENT 2
+#define RM_REMOVING 3
+#define RM_END 4
+
+const char *exe_name = NULL;
+
+struct rm_pg_osd_t // Removal progress of one PG, processed through the OSD recorded as its current primary
+{
+    pg_num_t pg_num; // PG number inside the pool being processed
+    osd_num_t osd_num; // OSD that the list and delete requests for this PG are sent to
+    int state = 0; // one of RM_NO_LIST / RM_LIST_SENT / RM_REMOVING / RM_END once queued (0 = not set yet)
+    obj_ver_id *obj_list = NULL; // listing buffer taken over from the list reply; released with free() in send_ops()
+    uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0; // listed entries / next entry to send / answered deletes / obj_prev_done is only ever reset to 0 in this file
+    int in_flight = 0; // delete requests sent but not answered yet
+};
+
+class rm_inode_t
+{
+protected:
+    uint64_t inode = 0;
+    pool_id_t pool_id = 0;
+    uint64_t iodepth = 0, parallel_osds = 0;
+
+    ring_loop_t *ringloop = NULL;
+    epoll_manager_t *epmgr = NULL;
+    cluster_client_t *cli = NULL;
+    ring_consumer_t consumer;
+
+    std::vector<rm_pg_osd_t*> lists;
+    uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
+    uint64_t pgs_to_list = 0;
+    bool started = false;
+    bool progress = true;
+    int log_level = 0;
+
+public:
+    // Builds the configuration object from "--name value" command-line pairs.
+    // "--json" and a trailing option that has no value after it are stored as "1".
+    // "-h" / "--help" print the usage text and exit; any other argument is ignored.
+    // "progress" defaults to "1".
+    static json11::Json::object parse_args(int narg, const char *args[])
+    {
+        json11::Json::object cfg;
+        cfg["progress"] = "1";
+        int i = 1;
+        while (i < narg)
+        {
+            const char *arg = args[i];
+            if (strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0)
+            {
+                help();
+            }
+            else if (arg[0] == '-' && arg[1] == '-')
+            {
+                const char *opt = arg+2;
+                bool is_flag = strcmp(opt, "json") == 0 || i == narg-1;
+                if (is_flag)
+                    cfg[opt] = "1";
+                else
+                    cfg[opt] = args[++i]; // consume the following argument as the value
+            }
+            i++;
+        }
+        return cfg;
+    }
+
+    static void help() // Prints the usage text to stdout, then terminates the process with status 0
+    {
+        printf(
+            "Vitastor inode removal tool\n"
+            "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
+            "USAGE:\n"
+            "  %s --etcd_address <etcd_address> --pool <pool> --inode <inode>\n",
+            exe_name // substituted for %s; the global is assigned from args[0] in main()
+        );
+        exit(0); // never returns to the caller
+    }
+
+    // Validates the configuration, creates the cluster client and runs the event loop.
+    // Reads "etcd_address", "inode", "pool", "iodepth" (default 32), "parallel_osds"
+    // (default 4), "log_level" and "progress" from cfg.
+    // Never returns: the loop at the end has no exit condition, the process ends through
+    // exit() either here (configuration error) or in the deletion methods.
+    void run(json11::Json cfg)
+    {
+        if (cfg["etcd_address"].string_value() == "")
+        {
+            fprintf(stderr, "etcd_address is missing\n");
+            exit(1);
+        }
+        inode = cfg["inode"].uint64_value();
+        pool_id = cfg["pool"].uint64_value();
+        // An explicit pool replaces the upper POOL_ID_BITS bits of the inode number.
+        // The mask is built from a uint64_t so the shift width does not depend on sizeof(long)
+        if (pool_id)
+            inode = (inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)pool_id) << (64-POOL_ID_BITS));
+        pool_id = INODE_POOL(inode);
+        if (!pool_id)
+        {
+            fprintf(stderr, "pool is missing\n");
+            exit(1);
+        }
+        iodepth = cfg["iodepth"].uint64_value();
+        if (!iodepth)
+            iodepth = 32;
+        parallel_osds = cfg["parallel_osds"].uint64_value();
+        if (!parallel_osds)
+            parallel_osds = 4;
+        log_level = cfg["log_level"].int64_value();
+        progress = cfg["progress"].uint64_value() ? true : false;
+        // Create client
+        ringloop = new ring_loop_t(512);
+        epmgr = new epoll_manager_t(ringloop);
+        cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
+        cli->on_ready([this]() { start_delete(); });
+        // Initialize job: once start_delete() has run, every ring loop pass also pushes the job forward
+        consumer.loop = [this]()
+        {
+            if (started)
+                continue_delete();
+            ringloop->submit();
+        };
+        ringloop->register_consumer(&consumer);
+        // Loop until it completes (continue_delete() calls exit() when nothing is left)
+        while (1)
+        {
+            ringloop->loop();
+            ringloop->wait();
+        }
+    }
+
+    // Builds the per-PG work list for the pool and starts processing it.
+    // Exits with status 1 when the pool is not present in the client's pool_config.
+    // PGs that are paused, have no current primary or are not in PG_ACTIVE state are
+    // reported on stderr and left out of the list.
+    void start_delete()
+    {
+        if (cli->st_cli.pool_config.find(pool_id) == cli->st_cli.pool_config.end())
+        {
+            fprintf(stderr, "Pool %u does not exist\n", pool_id);
+            exit(1);
+        }
+        // Iterate over the stored configuration in place; copying it would duplicate the whole PG map
+        auto & pool_cfg = cli->st_cli.pool_config[pool_id];
+        for (auto & pg_item: pool_cfg.pg_config)
+        {
+            auto & pg = pg_item.second;
+            if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE)
+            {
+                // FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command
+                fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first);
+                continue;
+            }
+            rm_pg_osd_t *r = new rm_pg_osd_t();
+            r->pg_num = pg_item.first;
+            r->osd_num = pg.cur_primary;
+            r->state = RM_NO_LIST;
+            lists.push_back(r);
+        }
+        // Order by OSD number: continue_delete() walks the list and only picks entries
+        // whose osd_num is greater than the last one it picked
+        std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b)
+        {
+            return a->osd_num < b->osd_num;
+        });
+        pgs_to_list = lists.size();
+        started = true;
+        continue_delete();
+    }
+
+    void send_list(rm_pg_osd_t *cur_list) // Sends an OSD_OP_SEC_LIST request for this PG to cur_list->osd_num, or starts connecting when no peer fd is known for that OSD
+    {
+        if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
+            cli->msgr.osd_peer_fds.end())
+        {
+            // Initiate connection
+            cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
+            return; // state stays RM_NO_LIST, so continue_delete() calls send_list() again later
+        }
+        osd_op_t *op = new osd_op_t(); // deleted inside the callback below
+        op->op_type = OSD_OP_OUT;
+        op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
+        op->req = (osd_any_op_t){
+            .sec_list = {
+                .header = {
+                    .magic = SECONDARY_OSD_OP_MAGIC,
+                    .id = cli->msgr.next_subop_id++,
+                    .opcode = OSD_OP_SEC_LIST,
+                },
+                .list_pg = cur_list->pg_num,
+                .pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
+                .pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
+                .min_inode = inode, // min == max: the listing is limited to the single inode being removed
+                .max_inode = inode,
+            },
+        };
+        op->callback = [this, cur_list](osd_op_t *op)
+        {
+            pgs_to_list--; // this PG's listing has been answered, successfully or not
+            if (op->reply.hdr.retval < 0)
+            {
+                fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n",
+                    cur_list->osd_num, op->reply.hdr.retval);
+                cli->msgr.stop_client(cur_list->osd_num); // NOTE(review): an OSD number is passed here - confirm stop_client() does not expect a peer fd
+                delete op;
+                cur_list->state = RM_END; // the entry is dropped by continue_delete() without deleting anything
+                continue_delete();
+                return;
+            }
+            if (log_level > 0)
+            {
+                printf(
+                    "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
+                    pool_id, cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval
+                );
+            }
+            cur_list->obj_list = (obj_ver_id*)op->buf; // take over the reply buffer as the array of entries to delete
+            cur_list->obj_count = (uint64_t)op->reply.hdr.retval; // a non-negative retval is used as the entry count
+            cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
+            total_count += cur_list->obj_count;
+            total_prev_pct = 0; // reset the remembered progress value because total_count changed
+            // set op->buf to NULL so it doesn't get freed
+            op->buf = NULL;
+            delete op;
+            cur_list->state = RM_REMOVING;
+            continue_delete();
+        };
+        cur_list->state = RM_LIST_SENT; // prevents continue_delete() from sending a second list request for this PG
+        cli->msgr.outbox_push(op);
+    }
+
+    void send_ops(rm_pg_osd_t *cur_list) // Tops up the PG's in-flight OSD_OP_DELETE requests to iodepth; marks the PG RM_END once every listed entry has been sent and answered
+    {
+        if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
+            cli->msgr.osd_peer_fds.end())
+        {
+            // Initiate connection
+            cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
+            return; // state stays RM_REMOVING, so continue_delete() calls send_ops() again later
+        }
+        while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count)
+        {
+            osd_op_t *op = new osd_op_t(); // deleted inside the callback below
+            op->op_type = OSD_OP_OUT;
+            op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
+            op->req = (osd_any_op_t){
+                .rw = {
+                    .header = {
+                        .magic = SECONDARY_OSD_OP_MAGIC,
+                        .id = cli->msgr.next_subop_id++,
+                        .opcode = OSD_OP_DELETE,
+                    },
+                    .inode = cur_list->obj_list[cur_list->obj_pos].oid.inode,
+                    .offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK), // stripe value with the STRIPE_MASK bits cleared
+                    .len = 0,
+                },
+            };
+            op->callback = [this, cur_list](osd_op_t *op)
+            {
+                cur_list->in_flight--;
+                if (op->reply.hdr.retval < 0)
+                {
+                    fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
+                        cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval);
+                }
+                delete op;
+                cur_list->obj_done++; // a failed delete is only reported above; it is still counted as done
+                total_done++;
+                continue_delete();
+            };
+            cli->msgr.outbox_push(op);
+            cur_list->obj_pos++;
+            cur_list->in_flight++;
+        }
+        if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count) // everything sent and answered (also true for an empty listing)
+        {
+            free(cur_list->obj_list);
+            cur_list->obj_list = NULL;
+            cur_list->obj_count = 0;
+            cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
+            cur_list->state = RM_END; // the entry itself is deleted by a later continue_delete() pass
+        }
+    }
+
+    // Advances the whole job; called after every state change and from the ring loop consumer.
+    // One pass over the list (sorted by osd_num in start_delete()):
+    //  - entries in RM_END state are deleted and removed from the list;
+    //  - an entry is picked only if its osd_num is greater than the last picked one, so at
+    //    most one PG per OSD is worked on, and at most parallel_osds entries are picked;
+    //  - picked entries get their list request or their next delete requests sent.
+    // Then prints the progress line (if enabled and changed) and exits with status 0 once the list is empty.
+    void continue_delete()
+    {
+        uint64_t par_osd = 0;
+        osd_num_t max_seen_osd = 0;
+        // Unsigned index matching lists.size(); it only advances when nothing was erased at position i
+        for (size_t i = 0; i < lists.size(); )
+        {
+            if (lists[i]->state == RM_END)
+            {
+                delete lists[i];
+                lists.erase(lists.begin()+i);
+                continue;
+            }
+            if (lists[i]->osd_num > max_seen_osd)
+            {
+                if (lists[i]->state == RM_NO_LIST)
+                {
+                    send_list(lists[i]);
+                }
+                else if (lists[i]->state == RM_REMOVING)
+                {
+                    send_ops(lists[i]);
+                }
+                // Entries waiting for their listing (RM_LIST_SENT) also occupy a slot
+                par_osd++;
+                max_seen_osd = lists[i]->osd_num;
+                if (par_osd >= parallel_osds)
+                {
+                    break;
+                }
+            }
+            i++;
+        }
+        if (progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
+        {
+            printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
+            total_prev_pct = total_done*1000/total_count;
+        }
+        if (!lists.size())
+        {
+            // Print the inode number without the pool bits; the mask is built from a uint64_t
+            // so the shift width does not depend on sizeof(long)
+            printf("Done, inode %lu in pool %u removed\n", (inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)), pool_id);
+            exit(0);
+        }
+    }
+};
+
+// Entry point: switches stdout and stderr to unbuffered mode, remembers the program
+// name for help() and passes the parsed command-line options to rm_inode_t::run().
+int main(int narg, const char *args[])
+{
+    exe_name = args[0];
+    setvbuf(stdout, NULL, _IONBF, 0);
+    setvbuf(stderr, NULL, _IONBF, 0);
+    rm_inode_t *remover = new rm_inode_t();
+    json11::Json::object options = rm_inode_t::parse_args(narg, args);
+    // run() contains an endless event loop; the process terminates through exit()
+    remover->run(options);
+    return 0;
+}
--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Vitastor depends on QEMU and FIO headers, but QEMU and FIO don't have -devel packages
+# So we have to copy their headers into the source tarball
+
+set -e
+
+VITASTOR=$(dirname $0)
+VITASTOR=$(realpath "$VITASTOR/..")
+
+if [ -d /opt/rh/gcc-toolset-9 ]; then
+    # CentOS 8
+    EL=8
+    . /opt/rh/gcc-toolset-9/enable
+else
+    # CentOS 7
+    EL=7
+    . /opt/rh/devtoolset-9/enable
+fi
+cd ~/rpmbuild/SPECS
+rpmbuild -bp fio.spec
+perl -i -pe 's/^make V=1/exit 0; make V=1/' qemu*.spec
+rpmbuild -bc qemu*.spec
+perl -i -pe 's/^exit 0; make V=1/make V=1/' qemu*.spec
+cd ~/rpmbuild/BUILD/qemu*/
+rm -rf $VITASTOR/qemu $VITASTOR/fio
+mkdir -p $VITASTOR/qemu/b/qemu
+make -j8 config-host.h
+cp config-host.h $VITASTOR/qemu/b/qemu
+cp -r include $VITASTOR/qemu
+if [ -f qapi-schema.json ]; then
+    # QEMU 2.0
+    make qapi-types.h
+    cp qapi-types.h $VITASTOR/qemu/b/qemu
+else
+    # QEMU 3.0+
+    make qapi
+    cp -r qapi $VITASTOR/qemu/b/qemu
+fi
+cd $VITASTOR
+sh copy-qemu-includes.sh
+rm -rf qemu
+mv qemu-copy qemu
+ln -s ~/rpmbuild/BUILD/fio*/ fio
+sh copy-fio-includes.sh
+rm fio
+mv fio-copy fio
+FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
+QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
+perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
+perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
+tar --transform 's#^#vitastor-0.5.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.5.1$(rpm --eval '%dist').tar.gz *
--- a/rpm/qemu-el8.Dockerfile
+++ b/rpm/qemu-el8.Dockerfile
@@ -0,0 +1,31 @@
+# Build packages for CentOS 8 inside a container
+# cd ..; podman build -t qemu-el8 -v `pwd`/build:/root/build -f rpm/qemu-el8.Dockerfile .
+
+FROM centos:8
+
+WORKDIR /root
+
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core rpm-build
+RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='centos-advanced-virtualization-source' --source qemu-kvm
+RUN rpm --nomd5 -i qemu*.src.rpm
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=PowerTools --spec qemu-kvm.spec
+
+ADD qemu-*-vitastor.patch /root/vitastor/
+
+RUN set -e; \
+    mkdir -p /root/build/qemu-el8; \
+    rm -rf /root/build/qemu-el8/*; \
+    rpm --nomd5 -i /root/qemu*.src.rpm; \
+    cd ~/rpmbuild/SPECS; \
+    PN=$(grep ^Patch qemu-kvm.spec | tail -n1 | perl -pe 's/Patch(\d+).*/$1/'); \
+    csplit qemu-kvm.spec "/^Patch$PN/"; \
+    cat xx00 > qemu-kvm.spec; \
+    head -n 1 xx01 >> qemu-kvm.spec; \
+    echo "Patch$((PN+1)): qemu-4.2-vitastor.patch" >> qemu-kvm.spec; \
+    tail -n +2 xx01 >> qemu-kvm.spec; \
+    perl -i -pe 's/(^Release:\s*\d+)/$1.vitastor/' qemu-kvm.spec; \
+    cp /root/vitastor/qemu-4.2-vitastor.patch ~/rpmbuild/SOURCES; \
+    rpmbuild --nocheck -ba qemu-kvm.spec; \
+    cp ~/rpmbuild/RPMS/*/*qemu* /root/build/qemu-el8/; \
+    cp ~/rpmbuild/SRPMS/*qemu* /root/build/qemu-el8/
--- a/rpm/qemu-kvm.spec.patch
+++ b/rpm/qemu-kvm.spec.patch
@@ -0,0 +1,29 @@
+--- qemu-kvm.spec	2020-12-05 13:13:54.388623517 +0000
+++ qemu-kvm.spec	2020-12-05 13:13:58.728696598 +0000
+@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
+ Summary: QEMU is a machine emulator and virtualizer
+ Name: qemu-kvm
+ Version: 4.2.0
+-Release: 29%{?dist}.6
+Release: 29.vitastor%{?dist}.6
+ # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
+ Epoch: 15
+ License: GPLv2 and GPLv2+ and CC-BY
+@@ -825,6 +825,7 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
+ Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
+ # For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
+ Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
+Patch335: qemu-4.2-vitastor.patch
+ 
+ BuildRequires: wget
+ BuildRequires: rpm-build
+@@ -1192,9 +1193,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
+ %endif
+   --python=%{__python3} \
+   --target-list="%{buildarch}" \
+-  --block-drv-rw-whitelist=%{block_drivers_list} \
+   --audio-drv-list= \
+-  --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
+   --with-coroutine=ucontext \
+   --tls-priority=NORMAL \
+   --disable-bluez \
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -0,0 +1,47 @@
+# Build packages for CentOS 7 inside a container
+# cd ..; podman build -t vitastor-el7 -v `pwd`/build:/root/build -f rpm/vitastor-el7.Dockerfile .
+# localedef -i ru_RU -f UTF-8 ru_RU.UTF-8
+
+FROM centos:7
+
+WORKDIR /root
+
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
+RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
+RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel qemu-kvm fio rh-nodejs12 jerasure-devel gf-complete-devel
+RUN yumdownloader --disablerepo=centos-sclo-rh --source qemu-kvm
+RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
+RUN rpm --nomd5 -i qemu*.src.rpm
+RUN rpm --nomd5 -i fio*.src.rpm
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing qemu-kvm.spec
+RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing fio.spec
+
+ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
+
+RUN set -e; \
+    rpm -i liburing*.src.rpm; \
+    cd ~/rpmbuild/SPECS/; \
+    . /opt/rh/devtoolset-9/enable; \
+    rpmbuild -ba liburing.spec; \
+    mkdir -p /root/build/liburing-el7; \
+    rm -rf /root/build/liburing-el7/*; \
+    cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el7/; \
+    cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el7/
+
+RUN rpm -i `ls /root/build/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
+
+ADD . /root/vitastor
+
+RUN set -e; \
+    cd /root/vitastor/rpm; \
+    sh build-tarball.sh; \
+    cp /root/vitastor-0.5.1.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
+    cd ~/rpmbuild/SPECS/; \
+    rpmbuild -ba vitastor.spec; \
+    mkdir -p /root/build/vitastor-el7; \
+    rm -rf /root/build/vitastor-el7/*; \
+    cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el7/; \
+    cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el7/
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -0,0 +1,62 @@
+Name:           vitastor
+Version:        0.5.1
+Release:        2%{?dist}
+Summary:        Vitastor, a fast software-defined clustered block storage
+
+License:        Vitastor Network Public License 1.0
+URL:            https://vitastor.io/
+Source0:        vitastor-0.5.1.el7.tar.gz
+
+BuildRequires:  liburing-devel >= 0.6
+BuildRequires:  gperftools-devel
+BuildRequires:  devtoolset-9-gcc-c++
+BuildRequires:  rh-nodejs12
+BuildRequires:  rh-nodejs12-npm
+BuildRequires:  jerasure-devel
+BuildRequires:  gf-complete-devel
+Requires:       fio = 3.7-1.el7
+Requires:       qemu-kvm = 2.0.0-1.el7.6
+Requires:       rh-nodejs12
+Requires:       rh-nodejs12-npm
+Requires:       liburing >= 0.6
+Requires:       libJerasure2
+
+%description
+Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
+architecturally similar to Ceph which means strong consistency, primary-replication,
+symmetric clustering and automatic data distribution over any number of drives of any
+size with configurable redundancy (replication or erasure codes/XOR).
+
+
+%prep
+%setup -q
+
+
+%build
+. /opt/rh/devtoolset-9/enable
+make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+
+
+%install
+rm -rf $RPM_BUILD_ROOT
+%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+. /opt/rh/rh-nodejs12/enable
+cd mon
+npm install
+cd ..
+mkdir -p %buildroot/usr/lib/vitastor
+cp -r mon %buildroot/usr/lib/vitastor/mon
+
+
+%files
+%doc
+%_bindir/vitastor-dump-journal
+%_bindir/vitastor-nbd
+%_bindir/vitastor-osd
+%_bindir/vitastor-rm
+%_libdir/qemu-kvm/block-vitastor.so
+%_libdir/vitastor
+/usr/lib/vitastor
+
+
+%changelog
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -0,0 +1,45 @@
+# Build packages for CentOS 8 inside a container
+# cd ..; podman build -t vitastor-el8 -v `pwd`/build:/root/build -f rpm/vitastor-el8.Dockerfile .
+
+FROM centos:8
+
+WORKDIR /root
+
+RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
+RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core
+RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
+RUN dnf --enablerepo='centos-advanced-virtualization' -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel qemu-kvm fio nodejs rpm-build jerasure-devel gf-complete-devel
+RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='vitastor' --source qemu-kvm
+RUN dnf download --source fio
+RUN rpm --nomd5 -i qemu*.src.rpm
+RUN rpm --nomd5 -i fio*.src.rpm
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=PowerTools --spec qemu-kvm.spec
+RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=PowerTools --spec fio.spec
+
+# The liburing source package is shared with the el7 build and rebuilt here with gcc-toolset-9
+ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
+
+RUN set -e; \
+    rpm -i liburing*.src.rpm; \
+    cd ~/rpmbuild/SPECS/; \
+    . /opt/rh/gcc-toolset-9/enable; \
+    rpmbuild -ba liburing.spec; \
+    mkdir -p /root/build/liburing-el8; \
+    rm -rf /root/build/liburing-el8/*; \
+    cp ~/rpmbuild/RPMS/*/liburing* /root/build/liburing-el8/; \
+    cp ~/rpmbuild/SRPMS/liburing* /root/build/liburing-el8/
+
+# Install the packages built above (debug packages excluded). They are in liburing-el8;
+# the liburing-el7 directory is not created by this Dockerfile
+RUN rpm -i `ls /root/build/liburing-el8/liburing-*.x86_64.rpm | grep -v debug`
+
+ADD . /root/vitastor
+
+RUN set -e; \
+    cd /root/vitastor/rpm; \
+    sh build-tarball.sh; \
+    cp /root/vitastor-0.5.1.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
+    cd ~/rpmbuild/SPECS/; \
+    rpmbuild -ba vitastor.spec; \
+    mkdir -p /root/build/vitastor-el8; \
+    rm -rf /root/build/vitastor-el8/*; \
+    cp ~/rpmbuild/RPMS/*/vitastor* /root/build/vitastor-el8/; \
+    cp ~/rpmbuild/SRPMS/vitastor* /root/build/vitastor-el8/
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -0,0 +1,59 @@
+Name:           vitastor
+Version:        0.5.1
+Release:        2%{?dist}
+Summary:        Vitastor, a fast software-defined clustered block storage
+
+License:        Vitastor Network Public License 1.0
+URL:            https://vitastor.io/
+Source0:        vitastor-0.5.1.el8.tar.gz
+
+BuildRequires:  liburing-devel >= 0.6
+BuildRequires:  gperftools-devel
+BuildRequires:  gcc-toolset-9-gcc-c++
+BuildRequires:  nodejs >= 10
+BuildRequires:  jerasure-devel
+BuildRequires:  gf-complete-devel
+Requires:       fio = 3.7-3.el8
+Requires:       qemu-kvm = 4.2.0-29.el8.6
+Requires:       nodejs >= 10
+Requires:       liburing >= 0.6
+Requires:       libJerasure2
+
+%description
+Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
+architecturally similar to Ceph which means strong consistency, primary-replication,
+symmetric clustering and automatic data distribution over any number of drives of any
+size with configurable redundancy (replication or erasure codes/XOR).
+
+
+%prep
+%setup -q
+
+
+%build
+. /opt/rh/gcc-toolset-9/enable
+make %{?_smp_mflags} BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+
+
+%install
+rm -rf $RPM_BUILD_ROOT
+%make_install BINDIR=%_bindir LIBDIR=%_libdir QEMU_PLUGINDIR=%_libdir/qemu-kvm
+cd mon
+npm install
+cd ..
+mkdir -p %buildroot/usr/lib/vitastor
+cp -r mon %buildroot/usr/lib/vitastor
+
+
+%files
+%doc
+%_bindir/vitastor-dump-journal
+%_bindir/vitastor-nbd
+%_bindir/vitastor-osd
+%_bindir/vitastor-rm
+%_libdir/qemu-kvm/block-vitastor.so
+%_libdir/vitastor
+/usr/lib/vitastor
+
+
+%changelog
--- a/test-build-el7.sh
+++ b/test-build-el7.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Cheatsheet for CentOS 7 packaging (not a build script)
+
+set -e
+rm -f /etc/yum.repos.d/CentOS-Media.repo
+yum -y --enablerepo=extras install centos-release-scl epel-release
+yum -y --enablerepo='*' install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel
+yumdownloader --source qemu
+yumdownloader --source fio
+yum-builddep -y --enablerepo='*' qemu
+yum -y install rpm-build
+. /opt/rh/devtoolset-9/enable
+rpm --nomd5 -i qemu*.src.rpm
+rpm --nomd5 -i fio*.src.rpm
+cd ~/rpmbuild/SPECS
+rpmbuild -bp fio.spec
+# Stop the %build section right before "make V=1". The injected command has to exit with 0:
+# a non-zero status makes rpmbuild fail, and with "set -e" that aborts this script
+perl -i -pe 's/^make V=1/exit 0; make V=1/' qemu.spec
+rpmbuild -bc qemu.spec
+perl -i -pe 's/^exit 0; make V=1/make V=1/' qemu.spec
+cd ~/rpmbuild/BUILD/qemu*/
+make qapi-types.h
+mkdir -p ~/vitastor/qemu/b/qemu
+cp config-host.h ~/vitastor/qemu/b/qemu 
+cp qapi-types.h ~/vitastor/qemu/b/qemu
+cp -r include ~/vitastor/qemu
+cd ~/vitastor
+sh copy-qemu-includes.sh
+mv qemu qemu-old
+mv qemu-copy qemu
+ln -s ~/rpmbuild/BUILD/fio*/ fio
+sh copy-fio-includes.sh
+rm fio
+mv fio-copy fio
--- a/test_pattern.h
+++ b/test_pattern.h
@@ -12,4 +12,4 @@
 #define PATTERN3 0x426bd7854eb08509

 #define set_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { *(uint64_t*)((void*)buf + i) = pattern; }
-#define check_pattern(buf, len, pattern) for (uint64_t i = 0; i < len; i += 8) { assert(*(uint64_t*)(buf + i) == pattern); }
+#define check_pattern(buf, len, pattern) { uint64_t bad = UINT64_MAX; for (uint64_t i = 0; i < len; i += 8) { if ((*(uint64_t*)(buf + i)) != (pattern)) { bad = i; break; } } if (bad != UINT64_MAX) { printf("mismatch at %lx\n", bad); } assert(bad == UINT64_MAX); }
--- a/test_shit.cpp
+++ b/test_shit.cpp
@@ -30,7 +30,7 @@

 #include "blockstore.h"
 #include "blockstore_impl.h"
-#include "osd_peering_pg.h"
+#include "osd_peering_pg.cpp"
 //#include "cpp-btree/btree_map.h"

 static int setup_context(unsigned entries, struct io_uring *ring)
@@ -168,7 +168,7 @@ int main0(int argc, char *argv[])
            },
            .version = 1,
        }] = (dirty_entry){
-            .state = ST_D_SYNCED,
+            .state = BS_ST_SYNCED | BS_ST_BIG_WRITE,
            .flags = 0,
            .location = (uint64_t)i << 17,
            .offset = 0,
--- a/timerfd_manager.cpp
+++ b/timerfd_manager.cpp
@@ -7,6 +7,8 @@
 #include <unistd.h>
 #include <errno.h>
 #include <string.h>
+#include <string>
+#include <stdexcept>
 #include "timerfd_manager.h"

 timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler)
Author	SHA1	Message	Date
Vitaliy Filippov	44a53d8352	Huh. Fix rpath for packages	2020-12-05 20:16:39 +03:00
Vitaliy Filippov	9d80bd2d98	Build with jerasure, split some build scripts	2020-12-05 19:02:23 +03:00
Vitaliy Filippov	322a38a144	Fix non-preserved real_pg_count leading to inability to change pools online	2020-12-04 23:46:48 +03:00
Vitaliy Filippov	1018764c91	Fix write->delete->write bugs, add & fix some debugging output	2020-12-04 23:21:58 +03:00
Vitaliy Filippov	a45e0e5e67	Use custom decoding instead of just jerasure_matrix_decode() - Cache the decoding matrix - Don't do unnecessary erasures->erased conversion during decoding - Avoid extra memory allocations during decoding - Don't always reconstruct coding chunks - Reconstruct chunks one-by-one, without overlapping ranges	2020-12-04 17:43:48 +03:00
Vitaliy Filippov	44656fbf67	Allow writes with low version numbers after a delete	2020-12-04 11:54:41 +03:00
Vitaliy Filippov	089f138e0c	Allow situations where the journal contains a big_write(v1) after delete(v2) and v1 < v2 Fixes a crash in the following scenario: - client issues a delete request (object version is at least 2) - OSD has time to flush it to the metadata, but doesn't have time to move the journal start pointer on disk - client overwrites the same object and it gets the version number 1 again - OSD is restarted and sees delete(v=2), big_write(v=1) in the journal - dirty_db sequence gets broken and OSD crashes with assert("Writes and deletes shouldn't happen at the same time")	2020-12-04 11:47:27 +03:00
Vitaliy Filippov	bcc8e697f9	Delete PGs when deleting pools (All OSD crash with "Online PG count change not allowed" if you try to delete an active pool though)	2020-12-04 11:47:27 +03:00
Vitaliy Filippov	a4c46ba745	Add jerasure EC support (reed_sol_van, others are slower) (not tested yet)	2020-12-04 11:47:27 +03:00
Vitaliy Filippov	5596ad8997	Use custom QEMU build for CentOS 7	2020-12-04 11:47:05 +03:00
Vitaliy Filippov	59c29b0cee	Fix RPATH for CentOS builds, add additional repos into the CentOS installation instructions	2020-12-04 11:47:04 +03:00
Vitaliy Filippov	959089b919	Enable progress_notify=true for etcd watches	2020-11-17 16:29:42 +03:00
Vitaliy Filippov	d3e7749616	Final fixes for packaging	2020-11-10 23:33:07 +03:00
Vitaliy Filippov	b56f8820ec	Container packaging for Debian 11 Bullseye, CentOS 7 and CentOS 8	2020-11-10 00:02:53 +03:00
Vitaliy Filippov	4bd2bd48eb	Build Vitastor packages, too	2020-11-09 14:41:39 +03:00
Vitaliy Filippov	a3fc9f8d7d	Add a Dockerfile to build patched QEMU for Debian (Buster)	2020-11-09 02:30:41 +03:00
Vitaliy Filippov	530975aed7	Make it also build with GCC 8 and on Debian Buster	2020-11-09 00:07:07 +03:00
Vitaliy Filippov	1446aad107	Simple patch for qemu-kvm .spec	2020-11-08 02:14:53 +03:00
Vitaliy Filippov	46479e2456	Add RPM build scripts for CentOS 8	2020-11-08 01:55:17 +03:00
Vitaliy Filippov	e41bee72a5	Lower node.js requirement to 10.x	2020-11-08 01:54:12 +03:00
Vitaliy Filippov	2e0f223ddb	Add RPM build scripts for CentOS 7	2020-11-07 01:52:10 +03:00
Vitaliy Filippov	3be7bc29d8	Make it build with QEMU 2.0, too Also begin to work on rpms	2020-11-06 20:05:00 +03:00
Vitaliy Filippov	0c43ff9daf	Add scripts to copy fio and qemu includes to the source package	2020-11-06 18:40:42 +03:00
Vitaliy Filippov	64d471cf53	Add simple Debian packaging	2020-11-06 18:40:42 +03:00
Vitaliy Filippov	809b2ad8cd	Add install target	2020-11-06 01:12:22 +03:00
Vitaliy Filippov	550d4af151	Rename test.cpp to test_shit.cpp (random shit)	2020-11-06 01:12:22 +03:00
Vitaliy Filippov	cf0f23ab8e	Add patches for QEMU QAPI IDL	2020-11-04 23:30:51 +03:00
Vitaliy Filippov	a516fefa8c	Add qemu_module_dummy and qemu_stamp_xxx to qemu_driver.c	2020-11-04 23:10:29 +03:00
Vitaliy Filippov	3b7279b376	Add Ceph EC 2+1 test results	2020-11-01 14:13:35 +03:00
Vitaliy Filippov	824ea507d0	Do not try to push more segments than IOV_MAX at once as it leads to EMSGSIZE	2020-10-30 01:25:43 +03:00
Vitaliy Filippov	23ea409081	Fix "can't get SQE, will fall out of sync with EPOLLET" when overflowing the ring OSDs shouldn't crash or hang with long iodepths anymore	2020-10-30 01:06:36 +03:00
Vitaliy Filippov	2ccb75974b	Fix a rare crash caused by a stopped client still being in write_ready_clients	2020-10-30 01:04:58 +03:00
Vitaliy Filippov	6561d4e040	Validate pool ID before executing the operation Reply -EPIPE for non-existing pools because we assume that it means that pool config isn't loaded yet. Previously OSD crashed on such operations	2020-10-30 01:02:46 +03:00
Vitaliy Filippov	1eda7f529d	Note about Linux 5.8+	2020-10-28 19:17:22 +03:00
Vitaliy Filippov	0a174bb313	Return success for already finished rollback operations There was a FIXME and I actually hit it during tests :)	2020-10-24 18:46:19 +03:00
Vitaliy Filippov	720985e4c7	Fix NULL rmw buffer after the latest changes and add a testcase for it	2020-10-24 18:29:19 +03:00
Vitaliy Filippov	4872f617a4	Clear connect timeout in stop_client() to stop races during disconnections	2020-10-24 10:37:16 +03:00
Vitaliy Filippov	e8ac08be14	Allow to overwrite incomplete objects or parts of objects to recover them	2020-10-24 02:14:41 +03:00
Vitaliy Filippov	660c2412fb	Improve debugging output for incomplete/degraded	2020-10-24 01:28:47 +03:00
Vitaliy Filippov	faa5e1436f	Attempt journal trim even without new flushes This is the new guaranteed unblocking method which replaces old trims in init and rollback, and also fixes a possible stall when just several writes in the beginning of the journal are flushed without triggering a subsequent trim.	2020-10-24 01:28:47 +03:00
Vitaliy Filippov	5fbe36198a	Fix journal trimming 1) Update journal's used_start in memory only after updating journal superblock. Doing the opposite is incorrect because part of the journal will be lost if writers overwrite its old beginning. 2) Sync journal device after updating the superblock. 3) Do not trim in rollback and init because trimming there would also require updating the superblock. And the only reason to trim in both those places was to unblock writers. And a guaranteed unblocking method will follow in the next commit :)	2020-10-24 01:08:33 +03:00
Vitaliy Filippov	99c45bb5ed	Fix debugging output during journal loading	2020-10-24 01:08:33 +03:00
Vitaliy Filippov	701eb79422	Stabilize writes before deleting extra chunks to not stall peer journals	2020-10-23 22:45:05 +03:00
Vitaliy Filippov	220bda0667	Fix possible buffer over(under)flow when handling LIST	2020-10-23 02:17:44 +03:00
Vitaliy Filippov	1e8f0328e0	Cancel outbound operations after re-peering PGs	2020-10-22 22:54:38 +00:00
Vitaliy Filippov	f011e0c675	Do not block stabilize by list and list by write	2020-10-22 22:13:40 +00:00
Vitaliy Filippov	1a694c387e	Print slow ops in log	2020-10-20 23:41:23 +00:00
Vitaliy Filippov	738ad5af79	Fix infinite looping in continue_recovery_op() when pg_cancel_write_queue() is called	2020-10-20 22:23:15 +00:00
Vitaliy Filippov	9abf3c17c9	Correct fix for "Pool %u PG %u configuration is invalid" during startup Establish watcher connection after loading PGs	2020-10-20 21:09:14 +00:00
Vitaliy Filippov	d2b901aa09	Fix default auto-created failure domains	2020-10-20 21:07:40 +00:00
Vitaliy Filippov	befff09370	Fix possible crash due to uninitialized ring_data_t in ringloop	2020-10-20 10:44:38 +03:00
Vitaliy Filippov	d1645551d4	Implement write batching Also fix possible race condition which could in theory lead to "command out of sync" and a buffer overflow that could happen on incorrect server response.	2020-10-20 03:29:17 +03:00
Vitaliy Filippov	7cb561f95a	Add etcd to the example service generator	2020-10-20 01:50:56 +03:00
Vitaliy Filippov	ae480196e2	Add a note about etcd bug, fix simple-offsets.js cmdline	2020-10-19 17:05:45 +03:00
Vitaliy Filippov	398c86f943	Improve PG-related log messages	2020-10-18 12:17:22 +00:00
Vitaliy Filippov	bec5f921a6	Fix buffer overflows in the no_same_sector_overwrites mode	2020-10-17 23:30:16 +00:00
Vitaliy Filippov	5335c8de8e	Do not use unordered_map for list_ops/list_results	2020-10-17 23:30:16 +00:00
Vitaliy Filippov	c696a82083	Replace assert with if + error message (may happen on metadata corruption)	2020-10-17 23:30:16 +00:00
Vitaliy Filippov	900171586b	XOR 2+1 test results	2020-10-17 14:58:08 +03:00
Vitaliy Filippov	70612e5df0	Do not handle change events before loading config	2020-10-17 11:18:39 +00:00
Vitaliy Filippov	d952c24979	Use timeout in rw callback	2020-10-17 11:00:55 +00:00
Vitaliy Filippov	776fe954a5	Fix crashes on multiple OSD reconnects Identify clients by pointers instead of peer_fd as peer may be dropped and reconnected between callbacks Yeah maybe I need some Rust, but ... maybe in the future :)	2020-10-17 10:53:04 +00:00
Vitaliy Filippov	9350656af6	Fix osd tags	2020-10-16 23:28:48 +00:00
Vitaliy Filippov	ece14a7d65	Hide "Connected with..." client messages by default	2020-10-11 02:22:46 +03:00
Vitaliy Filippov	be5f314c32	Change notes about gcc requirement to 9+, fio to 3.16+	2020-10-11 02:00:39 +03:00
Vitaliy Filippov	15dba96375	Implement inode removal tool. Removes multiple objects from multiple OSDs in parallel	2020-10-10 01:08:19 +03:00
Vitaliy Filippov	3d05aa9362	Make it build with GCC 10, fio 3.20+ (atomics...) and QEMU 5.1	2020-10-06 02:35:11 +03:00
Vitaliy Filippov	94efb54feb	Implement OSD tags (device classes), fix pool failure_domain configuration	2020-10-04 17:31:50 +03:00
Vitaliy Filippov	aa2a0ee00f	Do not group adjacent stripes by default as it's pointless on SSDs	2020-10-02 10:17:54 +03:00