Support RDMA devices without Implicit ODP using mlockall()

UPD: It seems this won't work, because ibv_reg_mr() takes an access (permissions) argument and doesn't allow more permissions than the kernel allows for the underlying memory mappings. So the only way to register all memory is probably to iterate over /proc/PID/maps... :) Mellanox docs mention that older MLNX_OFED versions emulated ODP, so it may still be possible to use that, but this is not confirmed.
2022-02-02 01:40:29 +03:00
96 changed files with 1485 additions and 4468 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,6 +4,3 @@
 [submodule "json11"]
 	path = json11
 	url = ../json11.git
-[submodule "libnfs"]
-	path = libnfs
-	url = ../libnfs.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)

 project(vitastor)

-set(VERSION "0.6.16")
+set(VERSION "0.6.12")

 add_subdirectory(src)
--- a/README-ru.md
+++ b/README-ru.md
@@ -407,7 +407,6 @@ Vitastor с однопоточной NBD прокси на том же стен
 - На хостах мониторов:
  - Пропишите нужные вам значения в файле `/usr/lib/vitastor/mon/make-units.sh`
  - Создайте юниты systemd для etcd и мониторов: `/usr/lib/vitastor/mon/make-units.sh`
- Запустите etcd и мониторы: `systemctl start etcd vitastor-mon`
 - Пропишите etcd_address и osd_network в `/etc/vitastor/vitastor.conf`. Например:
  ```
  {
@@ -415,14 +414,7 @@ Vitastor с однопоточной NBD прокси на том же стен
    "osd_network": "10.200.1.0/24"
  }
  ```
- Инициализуйте OSD:
-  - SSD: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
-  - Гибридные, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - передайте
-    все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит разделы под
-    журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы
-    или вообще какие-то данные, поэтому если диски непустые, сначала очистите их с помощью
-    `wipefs -a`. SSD с таблицей разделов не пропускаются, но так как скрипт создаёт новые разделы
-    для журналов, на SSD должно быть доступно свободное нераспределённое место.
+- Создайте юниты systemd для OSD: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
 - Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Смысл некоторых параметров:
  - `disable_data_fsync 1` - отключает fsync, используется с SSD с конденсаторами.
  - `immediate_commit all` - используется с SSD с конденсаторами.
@@ -438,6 +430,7 @@ Vitastor с однопоточной NBD прокси на том же стен
    диски, используемые на одном из тестовых стендов - Intel D3-S4510 - очень сильно не любят такую
    перезапись, и для них была добавлена эта опция. Когда данный режим включён, также нужно поднимать
    значение `journal_sector_buffer_count`, так как иначе Vitastor не хватит буферов для записи в журнал.
+- Запустите все etcd: `systemctl start etcd`
 - Создайте глобальную конфигурацию в etcd: `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
  (если все ваши диски - серверные с конденсаторами).
 - Создайте пулы: `etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
--- a/README.md
+++ b/README.md
@@ -360,7 +360,6 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
 - On the monitor hosts:
  - Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values.
  - Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
- Start etcd and monitors: `systemctl start etcd vitastor-mon`
 - Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
  ```
  {
@@ -368,13 +367,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
    "osd_network": "10.200.1.0/24"
  }
  ```
- Initialize OSDs:
-  - Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
-  - Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` - pass all your
-    devices (HDD and SSD) to this script - it will partition disks and initialize journals on its own.
-    This script skips HDDs which are already partitioned so if you want to use non-empty disks for
-    Vitastor you should first wipe them with `wipefs -a`. SSDs with GPT partition table are not skipped,
-    but some free unpartitioned space must be available because the script creates new partitions for journals.
+- Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
 - You can change OSD configuration in units or in `vitastor.conf`. Notable configuration variables:
  - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
  - `immediate_commit all` - use this if all your drives are server-grade.
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v0.6.16
+VERSION ?= v0.6.12

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v0.6.16
+          image: vitalif/vitastor-csi:v0.6.12
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -116,7 +116,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v0.6.16
+          image: vitalif/vitastor-csi:v0.6.12
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/example-pvc-block.yaml
+++ b/csi/deploy/example-pvc-block.yaml
@@ -1,13 +0,0 @@
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: test-vitastor-pvc-block
-spec:
-  storageClassName: vitastor
-  volumeMode: Block
-  accessModes:
-    - ReadWriteMany
-  resources:
-    requests:
-      storage: 10Gi
--- a/csi/deploy/example-test-pod-block.yaml
+++ b/csi/deploy/example-test-pod-block.yaml
@@ -1,17 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
-  name: vitastor-test-block-pvc
-  namespace: default
-spec:
-  containers:
-  - name: vitastor-test-block-pvc
-    image: nginx
-    volumeDevices:
-      - name: data
-        devicePath: /dev/xvda
-  volumes:
-  - name: data
-    persistentVolumeClaim:
-      claimName: test-vitastor-pvc-block
-      readOnly: false
--- a/csi/deploy/example-test-pod.yaml
+++ b/csi/deploy/example-test-pod.yaml
@@ -1,17 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
-  name: vitastor-test-nginx
-  namespace: default
-spec:
-  containers:
-   - name: vitastor-test-nginx
-     image: nginx
-     volumeMounts:
-       - mountPath: /usr/share/nginx/html/s3
-         name: data
-  volumes:
-   - name: data
-     persistentVolumeClaim:
-       claimName: test-vitastor-pvc
-       readOnly: false
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "0.6.16"
+    vitastorCSIDriverVersion = "0.6.12"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/nodeserver.go
+++ b/csi/src/nodeserver.go
@@ -67,44 +67,29 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
    klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))

    targetPath := req.GetTargetPath()
-    isBlock := req.GetVolumeCapability().GetBlock() != nil

    // Check that it's not already mounted
-    _, error := mount.IsNotMountPoint(ns.mounter, targetPath)
+    free, error := mount.IsNotMountPoint(ns.mounter, targetPath)
    if (error != nil)
    {
        if (os.IsNotExist(error))
        {
-            if (isBlock)
+            error := os.MkdirAll(targetPath, 0777)
+            if (error != nil)
            {
-                pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
-                }
-                err = pathFile.Close()
-                if (err != nil)
-                {
-                    klog.Errorf("failed to close %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
-                }
-            }
-            else
-            {
-                err := os.MkdirAll(targetPath, 0777)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
-                    return nil, status.Error(codes.Internal, err.Error())
-                }
+                return nil, status.Error(codes.Internal, error.Error())
            }
+            free = true
        }
        else
        {
            return nil, status.Error(codes.Internal, error.Error())
        }
    }
+    if (!free)
+    {
+        return &csi.NodePublishVolumeResponse{}, nil
+    }

    ctxVars := make(map[string]string)
    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
@@ -164,6 +149,7 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis

    // Format the device (ext4 or xfs)
    fsType := req.GetVolumeCapability().GetMount().GetFsType()
+    isBlock := req.GetVolumeCapability().GetBlock() != nil
    opt := req.GetVolumeCapability().GetMount().GetMountFlags()
    opt = append(opt, "_netdev")
    if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-vitastor (0.6.16-1) unstable; urgency=medium
+vitastor (0.6.12-1) unstable; urgency=medium

  * RDMA support
  * Bugfixes
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -33,8 +33,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-0.6.16; \
-    cd vitastor-0.6.16; \
+    cp -r /root/vitastor vitastor-0.6.12; \
+    cd vitastor-0.6.12; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -47,8 +47,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.16.orig.tar.xz vitastor-0.6.16; \
-    cd vitastor-0.6.16; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.12.orig.tar.xz vitastor-0.6.12; \
+    cd vitastor-0.6.12; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/params/network.yml
+++ b/docs/params/network.yml
@@ -48,19 +48,28 @@
  type: string
  info: |
    RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-    to work. For example, Mellanox ConnectX-3 and older adapters don't have
-    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-    root to list available RDMA devices and their features.
+    "rocep5s0f0"). Please note that if your RDMA device doesn't support
+    Implicit ODP (Implicit On-Demand Paging) then all Vitastor OSDs and clients
+    will have to use mlockall() to lock all application memory to use RDMA.
+    In case of the native Vitastor QEMU driver with RDMA, all virtual machine
+    memory will be locked if your RDMA device doesn't support Implicit ODP.
+
+    Notably, Mellanox ConnectX-3 and older adapters don't support Implicit ODP,
+    while ConnectX-4 and newer do. Run `ibv_devinfo -v` as root to list
+    available RDMA devices and their features.
  info_ru: |
    Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-    параметры и возможности.
+    Имейте в виду, что если ваше устройство не поддерживает Implicit ODP
+    (Implicit On-Demand Paging), то все OSD и клиенты Vitastor будут вынуждены
+    блокировать всю память приложения с помощью mlockall(), чтобы задействовать
+    RDMA. В случае нативного QEMU-драйвера это будет означать, что при
+    использовании RDMA на устройстве без поддержки Implicit ODP блокироваться
+    от выгрузки будет вся память виртуальных машин.
+
+    В случае с адаптерами Mellanox Implicit ODP поддерживается начиная с
+    ConnectX-4. ConnectX-3 и более старые адаптеры не поддерживают Implicit ODP.
+    Чтобы посмотреть список своих RDMA-устройств и их возможностей, запустите
+    `ibv_devinfo -v` от имени суперпользователя.
 - name: rdma_port_num
  type: int
  default: 1
--- a/1
+++ b/1
--- a/mon/make-osd-hybrid.js
+++ b/mon/make-osd-hybrid.js
@@ -1,414 +0,0 @@
-#!/usr/bin/nodejs
-// systemd unit generator for hybrid (HDD+SSD) vitastor OSDs
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1
-
-// USAGE: nodejs make-osd-hybrid.js [--disable_ssd_cache 0] [--disable_hdd_cache 0] /dev/sda /dev/sdb /dev/sdc /dev/sdd ...
-// I.e. - just pass all HDDs and SSDs mixed, the script will decide where
-// to put journals on its own
-
-const fs = require('fs');
-const fsp = fs.promises;
-const child_process = require('child_process');
-
-const options = {
-    debug: 1,
-    journal_size: 1024*1024*1024,
-    min_meta_size: 1024*1024*1024,
-    object_size: 1024*1024,
-    bitmap_granularity: 4096,
-    device_block_size: 4096,
-    disable_ssd_cache: 1,
-    disable_hdd_cache: 1,
-};
-
-run().catch(console.fatal);
-
-async function run()
-{
-    const device_list = parse_options();
-    await system_or_die("mkdir -p /var/log/vitastor; chown vitastor /var/log/vitastor");
-    // Collect devices
-    const all_devices = await collect_devices(device_list);
-    const ssds = all_devices.filter(d => d.ssd);
-    const hdds = all_devices.filter(d => !d.ssd);
-    // Collect existing OSD units
-    const osd_units = await collect_osd_units();
-    // Count assigned HDD journals and unallocated space for each SSD
-    await check_journal_count(ssds, osd_units);
-    // Create new OSDs
-    await create_new_hybrid_osds(hdds, ssds, osd_units);
-    process.exit(0);
-}
-
-function parse_options()
-{
-    const devices = [];
-    const opt = {};
-    for (let i = 2; i < process.argv.length; i++)
-    {
-        const arg = process.argv[i];
-        if (arg == '--help' || arg == '-h')
-        {
-            opt.help = true;
-            break;
-        }
-        else if (arg.substr(0, 2) == '--')
-            opt[arg.substr(2)] = process.argv[++i];
-        else
-            devices.push(arg);
-    }
-    if (opt.help || !devices.length)
-    {
-        console.log(
-            'Prepare hybrid (HDD+SSD) Vitastor OSDs\n'+
-            '(c) Vitaliy Filippov, 2019+, license: VNPL-1.1\n\n'+
-            'USAGE: nodejs make-osd-hybrid.js [OPTIONS] /dev/sda /dev/sdb /dev/sdc ...\n'+
-            'Just pass all your SSDs and HDDs in any order, the script will distribute OSDs for you.\n\n'+
-            'OPTIONS (with defaults):\n'+
-            Object.keys(options).map(k => `  --${k} ${options[k]}`).join('\n')
-        );
-        process.exit(0);
-    }
-    for (const k in opt)
-        options[k] = opt[k];
-    return devices;
-}
-
-// Collect devices
-async function collect_devices(devices_to_check)
-{
-    const devices = [];
-    for (const dev of devices_to_check)
-    {
-        if (dev.substr(0, 5) != '/dev/')
-        {
-            console.log(`${dev} does not start with /dev/, skipping`);
-            continue;
-        }
-        if (!await file_exists('/sys/block/'+dev.substr(5)))
-        {
-            console.log(`${dev} is a partition, skipping`);
-            continue;
-        }
-        // Check if the device is an SSD
-        const rot = '/sys/block/'+dev.substr(5)+'/queue/rotational';
-        if (!await file_exists(rot))
-        {
-            console.log(`${dev} does not have ${rot} to check whether it's an SSD, skipping`);
-            continue;
-        }
-        const ssd = !parseInt(await fsp.readFile(rot, { encoding: 'utf-8' }));
-        // Check if the device has partition table
-        let [ has_partition_table, parts ] = await system(`sfdisk --dump ${dev} --json`);
-        if (has_partition_table != 0)
-        {
-            // Check if the device has any data
-            const [ has_data, out ] = await system(`blkid ${dev}`);
-            if (has_data == 0)
-            {
-                console.log(`${dev} contains data, skipping:\n  ${out.trim().replace(/\n/g, '\n  ')}`);
-                continue;
-            }
-        }
-        parts = parts ? JSON.parse(parts).partitiontable : null;
-        if (parts && parts.label != 'gpt')
-        {
-            console.log(`${dev} contains "${parts.label}" partition table, only GPT is supported, skipping`);
-            continue;
-        }
-        devices.push({
-            path: dev,
-            ssd,
-            parts,
-        });
-    }
-    return devices;
-}
-
-// Collect existing OSD units
-async function collect_osd_units()
-{
-    const units = [];
-    for (const unit of (await system("ls /etc/systemd/system/vitastor-osd*.service"))[1].trim().split('\n'))
-    {
-        if (!unit)
-        {
-            continue;
-        }
-        let cmd = /^ExecStart\s*=\s*(([^\n]*\\\n)*[^\n]*)/.exec(await fsp.readFile(unit, { encoding: 'utf-8' }));
-        if (!cmd)
-        {
-            console.log('ExecStart= not found in '+unit+', skipping')
-            continue;
-        }
-        let kv = {}, key;
-        cmd = cmd[1].replace(/^bash\s+-c\s+'/, '')
-            .replace(/>>\s*\S+2>\s*&1\s*'$/, '')
-            .replace(/\s*\\\n\s*/g, ' ')
-            .replace(/([^\s']+)|'([^']+)'/g, (m, m1, m2) =>
-            {
-                m1 = m1||m2;
-                if (key == null)
-                {
-                    if (m1.substr(0, 2) != '--')
-                    {
-                        console.log('Strange command line in '+unit+', stopping');
-                        process.exit(1);
-                    }
-                    key = m1.substr(2);
-                }
-                else
-                {
-                    kv[key] = m1;
-                    key = null;
-                }
-            });
-        units.push(kv);
-    }
-    return units;
-}
-
-// Count assigned HDD journals and unallocated space for each SSD
-async function check_journal_count(ssds, osd_units)
-{
-    const units_by_journal = osd_units.reduce((a, c) =>
-    {
-        if (c.journal_device)
-            a[c.journal_device] = c;
-        return a;
-    }, {});
-    for (const dev of ssds)
-    {
-        dev.journals = 0;
-        if (dev.parts)
-        {
-            for (const part of dev.parts.partitions)
-            {
-                if (part.uuid && units_by_journal['/dev/disk/by-partuuid/'+part.uuid.toLowerCase()])
-                {
-                    dev.journals++;
-                }
-            }
-            dev.free = free_from_parttable(dev.parts);
-        }
-        else
-        {
-            dev.free = parseInt(await system_or_die("blockdev --getsize64 "+dev.path));
-        }
-    }
-}
-
-async function create_new_hybrid_osds(hdds, ssds, osd_units)
-{
-    const units_by_disk = osd_units.reduce((a, c) => { a[c.data_device] = c; return a; }, {});
-    for (const dev of hdds)
-    {
-        if (!dev.parts)
-        {
-            // HDD is not partitioned yet, create a single partition
-            // + is the "default value" for sfdisk
-            await system_or_die('sfdisk '+dev.path, 'label: gpt\n\n+ +\n');
-            dev.parts = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
-        }
-        if (dev.parts.partitions.length != 1)
-        {
-            console.log(dev.path+' has more than 1 partition, skipping');
-        }
-        else if ((dev.parts.partitions[0].start + dev.parts.partitions[0].size) != (1 + dev.parts.lastlba))
-        {
-            console.log(dev.path+'1 is not a whole-disk partition, skipping');
-        }
-        else if (!dev.parts.partitions[0].uuid)
-        {
-            console.log(dev.parts.partitions[0].node+' does not have UUID. Please repartition '+dev.path+' with GPT');
-        }
-        else if (!units_by_disk['/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase()])
-        {
-            await create_hybrid_osd(dev, ssds);
-        }
-    }
-}
-
-async function create_hybrid_osd(dev, ssds)
-{
-    // Create a new OSD
-    // Calculate metadata size
-    const data_device = '/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase();
-    const data_size = dev.parts.partitions[0].size * dev.parts.sectorsize;
-    const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
-    const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
-    const object_count = Math.floor(data_size / options.object_size);
-    let meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
-    // Leave some extra space for future metadata formats and round metadata area size to multiples of 1 MB
-    meta_size = 2*meta_size;
-    meta_size = Math.ceil(meta_size/1024/1024) * 1024*1024;
-    if (meta_size < options.min_meta_size)
-        meta_size = options.min_meta_size;
-    let journal_size = Math.ceil(options.journal_size/1024/1024) * 1024*1024;
-    // Pick an SSD for journal, balancing the number of journals across SSDs
-    let selected_ssd;
-    for (const ssd of ssds)
-        if (ssd.free >= (meta_size+journal_size) && (!selected_ssd || selected_ssd.journals > ssd.journals))
-            selected_ssd = ssd;
-    if (!selected_ssd)
-    {
-        console.error('Could not find free space for SSD journal and metadata for '+dev.path);
-        process.exit(1);
-    }
-    // Allocate an OSD number
-    const osd_num = (await system_or_die("vitastor-cli alloc-osd")).trim();
-    if (!osd_num)
-    {
-        console.error('Failed to run vitastor-cli alloc-osd');
-        process.exit(1);
-    }
-    console.log('Creating OSD '+osd_num+' on '+dev.path+' (HDD) with journal and metadata on '+selected_ssd.path+' (SSD)');
-    // Add two partitions: journal and metadata
-    const new_parts = await add_partitions(selected_ssd, [ journal_size, meta_size ]);
-    selected_ssd.journals++;
-    const journal_device = '/dev/disk/by-partuuid/'+new_parts[0].uuid.toLowerCase();
-    const meta_device = '/dev/disk/by-partuuid/'+new_parts[1].uuid.toLowerCase();
-    // Wait until the device symlinks appear
-    while (!await file_exists(journal_device))
-    {
-        await new Promise(ok => setTimeout(ok, 100));
-    }
-    while (!await file_exists(meta_device))
-    {
-        await new Promise(ok => setTimeout(ok, 100));
-    }
-    // Zero out metadata and journal
-    await system_or_die("dd if=/dev/zero of="+journal_device+" bs=1M count="+(journal_size/1024/1024)+" oflag=direct");
-    await system_or_die("dd if=/dev/zero of="+meta_device+" bs=1M count="+(meta_size/1024/1024)+" oflag=direct");
-    // Create unit file for the OSD
-    const has_scsi_cache_type = options.disable_ssd_cache &&
-        (await system("ls /sys/block/"+selected_ssd.path.substr(5)+"/device/scsi_disk/*/cache_type"))[0] == 0;
-    const write_through = options.disable_ssd_cache && (
-        has_scsi_cache_type || selected_ssd.path.substr(5, 4) == 'nvme'
-        && (await system_or_die("/sys/block/"+selected_ssd.path.substr(5)+"/queue/write_cache")).trim() == "write through");
-    await fsp.writeFile('/etc/systemd/system/vitastor-osd'+osd_num+'.service',
-`[Unit]
-Description=Vitastor object storage daemon osd.${osd_num}
-After=network-online.target local-fs.target time-sync.target
-Wants=network-online.target local-fs.target time-sync.target
-PartOf=vitastor.target
-
-[Service]
-LimitNOFILE=1048576
-LimitNPROC=1048576
-LimitMEMLOCK=infinity
-ExecStart=bash -c '/usr/bin/vitastor-osd \\
-    --osd_num ${osd_num} ${write_through
-        ? "--disable_meta_fsync 1 --disable_journal_fsync 1 --immediate_commit "+(options.disable_hdd_cache ? "all" : "small")
-        : ""} \\
-    --throttle_small_writes 1 \\
-    --disk_alignment ${options.device_block_size} \\
-    --journal_block_size ${options.device_block_size} \\
-    --meta_block_size ${options.device_block_size} \\
-    --journal_no_same_sector_overwrites true \\
-    --journal_sector_buffer_count 1024 \\
-    --block_size ${options.object_size} \\
-    --data_device ${data_device} \\
-    --journal_device ${journal_device} \\
-    --meta_device ${meta_device} >>/var/log/vitastor/osd${osd_num}.log 2>&1'
-WorkingDirectory=/
-ExecStartPre=+chown vitastor:vitastor ${data_device}
-ExecStartPre=+chown vitastor:vitastor ${journal_device}
-ExecStartPre=+chown vitastor:vitastor ${meta_device}${
-    has_scsi_cache_type
-    ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+journal_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
-    : ""}${
-    options.disable_hdd_cache
-    ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+data_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
-    : ""}
-User=vitastor
-PrivateTmp=false
-TasksMax=infinity
-Restart=always
-StartLimitInterval=0
-RestartSec=10
-
-[Install]
-WantedBy=vitastor.target
-`);
-    await system_or_die("systemctl enable vitastor-osd"+osd_num);
-}
-
-async function add_partitions(dev, sizes)
-{
-    let script = 'label: gpt\n\n';
-    if (dev.parts)
-    {
-        // Old partitions
-        for (const part of dev.parts.partitions)
-        {
-            script += part.node+': '+Object.keys(part).map(k => k == 'node' ? '' : k+'='+part[k]).filter(k => k).join(', ')+'\n';
-        }
-    }
-    // New partitions
-    for (const size of sizes)
-    {
-        script += '+ '+Math.ceil(size/1024)+'KiB\n';
-    }
-    await system_or_die('sfdisk '+dev.path, script);
-    // Get new partition table and find the new partition
-    const newpt = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
-    const old_nodes = dev.parts ? dev.parts.partitions.reduce((a, c) => { a[c.uuid] = true; return a; }, {}) : {};
-    const new_nodes = newpt.partitions.filter(part => !old_nodes[part.uuid]);
-    if (new_nodes.length != sizes.length)
-    {
-        console.error('Failed to partition '+dev.path+': new partitions not found in table');
-        process.exit(1);
-    }
-    dev.parts = newpt;
-    dev.free = free_from_parttable(newpt);
-    return new_nodes;
-}
-
-function free_from_parttable(pt)
-{
-    let free = pt.lastlba + 1 - pt.firstlba;
-    for (const part of pt.partitions)
-    {
-        free -= part.size;
-    }
-    free *= pt.sectorsize;
-    return free;
-}
-
-async function system_or_die(cmd, input = '')
-{
-    let [ exitcode, stdout, stderr ] = await system(cmd, input);
-    if (exitcode != 0)
-    {
-        console.error(cmd+' failed: '+stderr);
-        process.exit(1);
-    }
-    return stdout;
-}
-
-async function system(cmd, input = '')
-{
-    if (options.debug)
-    {
-        process.stderr.write('+ '+cmd+(input ? " <<EOF\n"+input.replace(/\s*$/, '\n')+"EOF" : '')+'\n');
-    }
-    const cp = child_process.spawn(cmd, { shell: true });
-    let stdout = '', stderr = '', finish_cb;
-    cp.stdout.on('data', buf => stdout += buf.toString());
-    cp.stderr.on('data', buf => stderr += buf.toString());
-    cp.on('exit', () => finish_cb && finish_cb());
-    cp.stdin.write(input);
-    cp.stdin.end();
-    if (cp.exitCode == null)
-    {
-        await new Promise(ok => finish_cb = ok);
-    }
-    return [ cp.exitCode, stdout, stderr ];
-}
-
-async function file_exists(filename)
-{
-    return new Promise((ok, no) => fs.access(filename, fs.constants.R_OK, err => ok(!err)));
-}
--- a/mon/make-osd.sh
+++ b/mon/make-osd.sh
@@ -25,10 +25,6 @@ OPT=$(vitastor-cli simple-offsets --format options $DEV | tr '\n' ' ')
 META=$(vitastor-cli simple-offsets --format json $DEV | jq .data_offset)
 dd if=/dev/zero of=$DEV bs=1048576 count=$(((META+1048575)/1048576)) oflag=direct

-mkdir -p /var/log/vitastor
-id vitastor &>/dev/null || useradd vitastor
-chown vitastor /var/log/vitastor
-
 cat >/etc/systemd/system/vitastor-osd$OSD_NUM.service <<EOF
 [Unit]
 Description=Vitastor object storage daemon osd.$OSD_NUM
@@ -40,14 +36,14 @@ PartOf=vitastor.target
 LimitNOFILE=1048576
 LimitNPROC=1048576
 LimitMEMLOCK=infinity
-ExecStart=bash -c '/usr/bin/vitastor-osd \\
+ExecStart=/usr/bin/vitastor-osd \\
    --osd_num $OSD_NUM \\
    --disable_data_fsync 1 \\
    --immediate_commit all \\
    --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 \\
    --journal_no_same_sector_overwrites true \\
    --journal_sector_buffer_count 1024 \\
-    $OPT >>/var/log/vitastor/osd$OSD_NUM.log 2>&1'
+    $OPT
 WorkingDirectory=/
 ExecStartPre=+chown vitastor:vitastor $DEV
 User=vitastor
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -31,7 +31,6 @@ const etcd_allow = new RegExp('^'+[
    'osd/inodestats/[1-9]\\d*',
    'osd/space/[1-9]\\d*',
    'mon/master',
-    'mon/member/[a-f0-9]+',
    'pg/state/[1-9]\\d*/[1-9]\\d*',
    'pg/stats/[1-9]\\d*/[1-9]\\d*',
    'pg/history/[1-9]\\d*/[1-9]\\d*',
@@ -238,10 +237,7 @@ const etcd_tree = {
    },
    mon: {
        master: {
-            /* ip: [ string ], id: uint64_t */
-        },
-        standby: {
-            /* <uint64_t>: { ip: [ string ] }, */
+            /* ip: [ string ], */
        },
    },
    pg: {
@@ -272,7 +268,7 @@ const etcd_tree = {
                <pg_id>: {
                    osd_sets: osd_num_t[][],
                    all_peers: osd_num_t[],
-                    epoch: uint64_t,
+                    epoch: uint32_t,
                },
            }, */
        },
@@ -677,25 +673,11 @@ class Mon
        }, this.etcd_start_timeout, 0);
    }

-    get_mon_state()
-    {
-        return { ip: this.local_ips(), hostname: os.hostname() };
-    }
-
    async get_lease()
    {
        const max_ttl = this.config.etcd_mon_ttl + this.config.etcd_mon_timeout/1000*this.config.etcd_mon_retries;
-        // Get lease
-        let res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
+        const res = await this.etcd_call('/lease/grant', { TTL: max_ttl }, this.config.etcd_mon_timeout, -1);
        this.etcd_lease_id = res.ID;
-        // Register in /mon/member, just for the information
-        const state = this.get_mon_state();
-        res = await this.etcd_call('/kv/put', {
-            key: b64(this.etcd_prefix+'/mon/member/'+this.etcd_lease_id),
-            value: b64(JSON.stringify(state)),
-            lease: ''+this.etcd_lease_id
-        }, this.etcd_start_timeout, 0);
-        // Set refresh timer
        this.lease_timer = setInterval(async () =>
        {
            const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
@@ -721,7 +703,7 @@ class Mon

    async become_master()
    {
-        const state = { ...this.get_mon_state(), id: ''+this.etcd_lease_id };
+        const state = { ip: this.local_ips() };
        while (1)
        {
            const res = await this.etcd_call('/kv/txn', {
@@ -1363,30 +1345,25 @@ class Mon
        const tm = prev_stats ? BigInt(timestamp - prev_stats.timestamp) : 0;
        for (const op in op_stats)
        {
-            if (prev_stats && prev_stats.op_stats && prev_stats.op_stats[op])
-            {
-                op_stats[op].bps = (op_stats[op].bytes - prev_stats.op_stats[op].bytes) * 1000n / tm;
-                op_stats[op].iops = (op_stats[op].count - prev_stats.op_stats[op].count) * 1000n / tm;
-                op_stats[op].lat = (op_stats[op].usec - prev_stats.op_stats[op].usec)
-                    / ((op_stats[op].count - prev_stats.op_stats[op].count) || 1n);
-            }
+            // An op may be missing from the previous sample: report zero rates instead of dereferencing undefined
+            const prev = prev_stats && prev_stats.op_stats && prev_stats.op_stats[op];
+            op_stats[op].bps = prev ? (op_stats[op].bytes - prev.bytes) * 1000n / tm : 0;
+            op_stats[op].iops = prev ? (op_stats[op].count - prev.count) * 1000n / tm : 0;
+            op_stats[op].lat = prev ? (op_stats[op].usec - prev.usec)
+                / ((op_stats[op].count - prev.count) || 1n) : 0;
        }
        for (const op in subop_stats)
        {
-            if (prev_stats && prev_stats.subop_stats && prev_stats.subop_stats[op])
-            {
-                subop_stats[op].iops = (subop_stats[op].count - prev_stats.subop_stats[op].count) * 1000n / tm;
-                subop_stats[op].lat = (subop_stats[op].usec - prev_stats.subop_stats[op].usec)
-                    / ((subop_stats[op].count - prev_stats.subop_stats[op].count) || 1n);
-            }
+            const prev = prev_stats && prev_stats.subop_stats && prev_stats.subop_stats[op];
+            subop_stats[op].iops = prev ? (subop_stats[op].count - prev.count) * 1000n / tm : 0;
+            subop_stats[op].lat = prev ? (subop_stats[op].usec - prev.usec)
+                / ((subop_stats[op].count - prev.count) || 1n) : 0;
        }
        for (const op in recovery_stats)
        {
-            if (prev_stats && prev_stats.recovery_stats && prev_stats.recovery_stats[op])
-            {
-                recovery_stats[op].bps = (recovery_stats[op].bytes - prev_stats.recovery_stats[op].bytes) * 1000n / tm;
-                recovery_stats[op].iops = (recovery_stats[op].count - prev_stats.recovery_stats[op].count) * 1000n / tm;
-            }
+            const prev = prev_stats && prev_stats.recovery_stats && prev_stats.recovery_stats[op];
+            recovery_stats[op].bps = prev ? (recovery_stats[op].bytes - prev.bytes) * 1000n / tm : 0;
+            recovery_stats[op].iops = prev ? (recovery_stats[op].count - prev.count) * 1000n / tm : 0;
        }
        return { op_stats, subop_stats, recovery_stats };
    }
--- a/mon/simple-offsets.js
+++ b/mon/simple-offsets.js
@@ -49,8 +49,7 @@ async function run()
    }
    options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
    const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
-    const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
-    const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
+    const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
    const object_count = Math.floor((device_size-meta_offset)/options.object_size);
    const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
    const data_offset = meta_offset + meta_size;
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '0.6.16'
+VERSION = '0.6.12'

 LOG = logging.getLogger(__name__)

@@ -355,25 +355,7 @@ class VitastorDriver(driver.CloneableImageVD,
    def revert_to_snapshot(self, context, volume, snapshot):
        """Revert a volume to a given snapshot."""

-        vol_name = utils.convert_str(snapshot.volume_name)
-        snap_name = utils.convert_str(snapshot.name)
-
-        # Delete the image and recreate it from the snapshot
-        args = [ 'vitastor-cli', 'rm', vol_name, *(self._vitastor_args()) ]
-        try:
-            self._execute(*args)
-        except processutils.ProcessExecutionError as exc:
-            LOG.error("Failed to delete image "+vol_name+": "+exc)
-            raise exception.VolumeBackendAPIException(data = exc.stderr)
-        args = [
-            'vitastor-cli', 'create', '--parent', vol_name+'@'+snap_name,
-            vol_name, *(self._vitastor_args())
-        ]
-        try:
-            self._execute(*args)
-        except processutils.ProcessExecutionError as exc:
-            LOG.error("Failed to recreate image "+vol_name+" from "+vol_name+"@"+snap_name+": "+exc)
-            raise exception.VolumeBackendAPIException(data = exc.stderr)
+        raise NotImplementedError()  # FIXME Delete the image, then recreate it from the snapshot (until then, do not report success)

    def delete_snapshot(self, snapshot):
        """Deletes a snapshot."""
@@ -381,15 +363,24 @@ class VitastorDriver(driver.CloneableImageVD,
        vol_name = utils.convert_str(snapshot.volume_name)
        snap_name = utils.convert_str(snapshot.name)

-        args = [
-            'vitastor-cli', 'rm', vol_name+'@'+snap_name,
-            *(self._vitastor_args())
-        ]
-        try:
-            self._execute(*args)
-        except processutils.ProcessExecutionError as exc:
-            LOG.error("Failed to remove snapshot "+vol_name+'@'+snap_name+": "+exc)
-            raise exception.VolumeBackendAPIException(data = exc.stderr)
+        # Find the snapshot
+        resp = self._etcd_txn({ 'success': [
+            { 'request_range': { 'key': 'index/image/'+vol_name+'@'+snap_name } },
+        ] })
+        if len(resp['responses'][0]['kvs']) == 0:
+            raise exception.SnapshotNotFound(snapshot_id = snap_name)
+        inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])
+        pool_id = int(resp['responses'][0]['kvs'][0]['value']['pool_id'])
+        parents = {}
+        parents[(pool_id << 48) | (inode_id & 0xffffffffffff)] = True
+
+        # Check if there are child volumes
+        children = self._child_count(parents)
+        if children > 0:
+            raise exception.SnapshotIsBusy(snapshot_name = snap_name)
+
+        # FIXME: We can't delete snapshots because we can't merge layers yet
+        raise exception.VolumeBackendAPIException(data = 'Snapshot delete (layer merge) is not implemented yet')

    def _child_count(self, parents):
        children = 0
--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@@ -25,4 +25,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.6.16/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.16$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.6.12/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.12$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@@ -34,7 +34,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.6.16.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.6.12.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.6.16
+Version:        0.6.12
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.6.16.el7.tar.gz
+Source0:        vitastor-0.6.12.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@@ -33,7 +33,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-0.6.16.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-0.6.12.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        0.6.16
+Version:        0.6.12
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-0.6.16.el8.tar.gz
+Source0:        vitastor-0.6.12.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="0.6.16")
+add_definitions(-DVERSION="0.6.12")
 add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
 	add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -152,71 +152,10 @@ target_link_libraries(vitastor-nbd
 	vitastor_client
 )

-# vitastor-nfs
-add_executable(vitastor-nfs
-	nfs_proxy.cpp
-	nfs_conn.cpp
-	nfs_portmap.cpp
-	sha256.c
-	../libnfs/lib/init.c
-	../libnfs/lib/pdu.c
-	../libnfs/lib/libnfs-zdr.c
-	../libnfs/lib/socket.c
-	../libnfs/portmap/libnfs-raw-portmap.c
-	../libnfs/nfs/libnfs-raw-nfs.c
-	../libnfs/mount/libnfs-raw-mount.c
-)
-set_source_files_properties(
-	../libnfs/nfs/libnfs-raw-nfs.c
-	PROPERTIES
-	COMPILE_FLAGS "-Wno-unused-but-set-variable"
-)
-# Simplified static configuration
-# The other option is to build patched libnfs packages until all distros get my fixes
-target_compile_options(vitastor-nfs
-	PRIVATE
-	-DHAVE_ARPA_INET_H
-	-DHAVE_INTTYPES_H
-	-DHAVE_MEMORY_H
-	-DHAVE_NETDB_H
-	-DHAVE_NETINET_IN_H
-	-DHAVE_NETINET_TCP_H
-	-DHAVE_NET_IF_H
-	-DHAVE_POLL_H
-	-DHAVE_STDINT_H
-	-DHAVE_STDLIB_H
-	-DHAVE_STRINGS_H
-	-DHAVE_STRING_H
-	-DHAVE_SYS_IOCTL_H
-	-DHAVE_SYS_SOCKET_H
-	-DHAVE_SYS_STATVFS_H
-	-DHAVE_SYS_STAT_H
-	-DHAVE_SYS_SYSMACROS_H
-	-DHAVE_SYS_TIME_H
-	-DHAVE_SYS_TYPES_H
-	-DHAVE_SYS_VFS_H
-	-DHAVE_UNISTD_H
-	-DHAVE_UTIME_H
-	-DHAVE_SOCKADDR_STORAGE
-	-DHAVE_STRUCT_STAT_ST_MTIM_TV_NSEC
-	-D_U_=
-)
-target_include_directories(vitastor-nfs
-	PRIVATE
-	../libnfs/include
-	../libnfs/include/nfsc
-	../libnfs/portmap
-	../libnfs/nfs
-	../libnfs/mount
-)
-target_link_libraries(vitastor-nfs
-	vitastor_client
-)
-
 # vitastor-cli
 add_executable(vitastor-cli
-	cli.cpp cli_common.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_status.cpp cli_df.cpp
-	cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm_data.cpp cli_rm.cpp
+	cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_df.cpp
+	cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
 )
 target_link_libraries(vitastor-cli
 	vitastor_client
--- a/src/addr_util.cpp
+++ b/src/addr_util.cpp
@@ -1,5 +1,3 @@
-#include <sys/socket.h>
-#include <unistd.h>
 #include <arpa/inet.h>
 #include <net/if.h>
 #include <sys/types.h>
@@ -11,7 +9,7 @@

 #include "addr_util.h"

-bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr)
+bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr)
 {
    if (parse_port)
    {
@@ -27,7 +25,7 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
    }
    if (inet_pton(AF_INET, str.c_str(), &((struct sockaddr_in*)addr)->sin_addr) == 1)
    {
-        addr->ss_family = AF_INET;
+        addr->sa_family = AF_INET;
        ((struct sockaddr_in*)addr)->sin_port = htons(default_port);
        return true;
    }
@@ -35,30 +33,30 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
        str = str.substr(1, str.length()-2);
    if (inet_pton(AF_INET6, str.c_str(), &((struct sockaddr_in6*)addr)->sin6_addr) == 1)
    {
-        addr->ss_family = AF_INET6;
+        addr->sa_family = AF_INET6;
        ((struct sockaddr_in6*)addr)->sin6_port = htons(default_port);
        return true;
    }
    return false;
 }

-std::string addr_to_string(const sockaddr_storage &addr)
+std::string addr_to_string(const sockaddr &addr)
 {
    char peer_str[256];
    bool ok = false;
    int port;
-    if (addr.ss_family == AF_INET)
+    if (addr.sa_family == AF_INET)
    {
        ok = !!inet_ntop(AF_INET, &((sockaddr_in*)&addr)->sin_addr, peer_str, 256);
        port = ntohs(((sockaddr_in*)&addr)->sin_port);
    }
-    else if (addr.ss_family == AF_INET6)
+    else if (addr.sa_family == AF_INET6)
    {
        ok = !!inet_ntop(AF_INET6, &((sockaddr_in6*)&addr)->sin6_addr, peer_str, 256);
        port = ntohs(((sockaddr_in6*)&addr)->sin6_port);
    }
    else
-        throw std::runtime_error("Unknown address family "+std::to_string(addr.ss_family));
+        throw std::runtime_error("Unknown address family "+std::to_string(addr.sa_family));
    if (!ok)
        throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
    return std::string(peer_str)+":"+std::to_string(port);
@@ -188,51 +186,3 @@ std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool
    freeifaddrs(list);
    return addresses;
 }
-
-int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port)
-{
-    sockaddr_storage addr;
-    if (!string_to_addr(bind_address, 0, bind_port, &addr))
-    {
-        throw std::runtime_error("bind address "+bind_address+" is not valid");
-    }
-
-    int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
-    if (listen_fd < 0)
-    {
-        throw std::runtime_error(std::string("socket: ") + strerror(errno));
-    }
-    int enable = 1;
-    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
-
-    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("bind: ") + strerror(errno));
-    }
-    if (listening_port)
-    {
-        if (bind_port == 0)
-        {
-            socklen_t len = sizeof(addr);
-            if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
-            {
-                close(listen_fd);
-                throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
-            }
-            *listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
-        }
-        else
-        {
-            *listening_port = bind_port;
-        }
-    }
-
-    if (listen(listen_fd, listen_backlog ? listen_backlog : 128) < 0)
-    {
-        close(listen_fd);
-        throw std::runtime_error(std::string("listen: ") + strerror(errno));
-    }
-
-    return listen_fd;
-}
--- a/src/addr_util.h
+++ b/src/addr_util.h
@@ -4,7 +4,6 @@
 #include <string>
 #include <vector>

-bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
-std::string addr_to_string(const sockaddr_storage &addr);
+bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr);
+std::string addr_to_string(const sockaddr &addr);
 std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
-int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -25,7 +25,7 @@ allocator::allocator(uint64_t blocks)
    size = free = blocks;
    last_one_mask = (blocks % 64) == 0
        ? UINT64_MAX
-        : (((uint64_t)1 << (blocks % 64)) - 1);
+        : ((1ull << (blocks % 64)) - 1); // 1ull: keep the shift (count up to 63) in unsigned 64-bit arithmetic
    for (uint64_t i = 0; i < total; i++)
    {
        mask[i] = 0;
@@ -79,7 +79,7 @@ void allocator::set(uint64_t addr, bool value)
            }
            if (value)
            {
-                mask[last] = mask[last] | ((uint64_t)1 << bit);
+                mask[last] = mask[last] | (1ull << bit); // 1ull keeps the shift in unsigned 64-bit arithmetic
                if (mask[last] != (!is_last || cur_addr/64 < size/64
                    ? UINT64_MAX : last_one_mask))
                {
@@ -88,7 +88,7 @@ void allocator::set(uint64_t addr, bool value)
            }
            else
            {
-                mask[last] = mask[last] & ~((uint64_t)1 << bit);
+                mask[last] = mask[last] & ~(1ull << bit); // 1ull keeps the shift in unsigned 64-bit arithmetic
            }
            is_last = false;
            if (p2 > 1)
--- a/src/blockstore.h
+++ b/src/blockstore.h
@@ -21,7 +21,7 @@
 // Memory alignment for direct I/O (usually 512 bytes)
 // All other alignments must be a multiple of this one
 #ifndef MEM_ALIGNMENT
-#define MEM_ALIGNMENT 4096
+#define MEM_ALIGNMENT 512
 #endif

 // Default block size is 128 KB, current allowed range is 4K - 128M
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@@ -415,11 +415,8 @@ stop_flusher:
        flusher->active_flushers++;
 resume_1:
        // Find it in clean_db
-        {
-            auto & clean_db = bs->clean_db_shard(cur.oid);
-            auto clean_it = clean_db.find(cur.oid);
-            old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
-        }
+        clean_it = bs->clean_db.find(cur.oid);
+        old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
        // Scan dirty versions of the object
        if (!scan_dirty(1))
        {
@@ -873,11 +870,10 @@ void journal_flusher_co::update_clean_db()
 #endif
        bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
    }
-    auto & clean_db = bs->clean_db_shard(cur.oid);
    if (has_delete)
    {
-        auto clean_it = clean_db.find(cur.oid);
-        clean_db.erase(clean_it);
+        auto clean_it = bs->clean_db.find(cur.oid);
+        bs->clean_db.erase(clean_it);
 #ifdef BLOCKSTORE_DEBUG
        printf("Free block %lu from %lx:%lx v%lu (delete)\n",
            clean_loc >> bs->block_order,
@@ -888,7 +884,7 @@ void journal_flusher_co::update_clean_db()
    }
    else
    {
-        clean_db[cur.oid] = {
+        bs->clean_db[cur.oid] = {
            .version = cur.version,
            .location = clean_loc,
        };
--- a/src/blockstore_flush.h
+++ b/src/blockstore_flush.h
@@ -49,6 +49,7 @@ class journal_flusher_co
    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;

    bool skip_copy, has_delete, has_writes;
+    blockstore_clean_db_t::iterator clean_it;
    std::vector<copy_buffer_t> v;
    std::vector<copy_buffer_t>::iterator it;
    int copy_count;
--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@@ -118,7 +118,7 @@ void blockstore_impl_t::loop()
        // has_writes == 0 - no writes before the current queue item
        // has_writes == 1 - some writes in progress
        // has_writes == 2 - tried to submit some writes, but failed
-        int has_writes = 0, op_idx = 0, new_idx = 0, done_lists = 0;
+        int has_writes = 0, op_idx = 0, new_idx = 0;
        for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
        {
            auto op = submit_queue[op_idx];
@@ -198,14 +198,9 @@ void blockstore_impl_t::loop()
            }
            else if (op->opcode == BS_OP_LIST)
            {
-                // LIST doesn't have to be blocked by previous modifications
-                // But don't do a lot of LISTs at once, because they're blocking and potentially slow
-                if (single_tick_list_limit <= 0 || done_lists < single_tick_list_limit)
-                {
-                    process_list(op);
-                    done_lists++;
-                    wr_st = 2;
-                }
+                // LIST doesn't need to be blocked by previous modifications
+                process_list(op);
+                wr_st = 2;
            }
            if (wr_st == 2)
            {
@@ -428,104 +423,22 @@ static bool replace_stable(object_id oid, uint64_t version, int search_start, in
    return false;
 }

-blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
-{
-    uint64_t pg_num = 0;
-    uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
-    auto sh_it = clean_db_settings.find(pool_id);
-    if (sh_it != clean_db_settings.end())
-    {
-        // like map_to_pg()
-        pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
-    }
-    return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
-}
-
-void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
-{
-    uint64_t pool_id = (uint64_t)pool;
-    std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
-    auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
-    while (sh_it != clean_db_shards.end() &&
-        (sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
-    {
-        for (auto & pair: sh_it->second)
-        {
-            // like map_to_pg()
-            uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
-            uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
-            new_shards[shard_id][pair.first] = pair.second;
-        }
-        clean_db_shards.erase(sh_it++);
-    }
-    for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
-    {
-        auto & to = clean_db_shards[sh_it->first];
-        to.swap(sh_it->second);
-    }
-    clean_db_settings[pool_id] = (pool_shard_settings_t){
-        .pg_count = pg_count,
-        .pg_stripe_size = pg_stripe_size,
-    };
-}
-
 void blockstore_impl_t::process_list(blockstore_op_t *op)
 {
-    uint32_t list_pg = op->offset+1;
+    uint32_t list_pg = op->offset;
    uint32_t pg_count = op->len;
    uint64_t pg_stripe_size = op->oid.stripe;
    uint64_t min_inode = op->oid.inode;
    uint64_t max_inode = op->version;
    // Check PG
-    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
+    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
    {
        op->retval = -EINVAL;
        FINISH_OP(op);
        return;
    }
-    // Check if the DB needs resharding
-    // (we don't know about PGs from the beginning, we only create "shards" here)
-    uint64_t first_shard = 0, last_shard = UINT64_MAX;
-    if (min_inode != 0 &&
-        // Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
-        (min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
-    {
-        pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
-        if (pg_count > 1)
-        {
-            // Per-pg listing
-            auto sh_it = clean_db_settings.find(pool_id);
-            if (sh_it == clean_db_settings.end() ||
-                sh_it->second.pg_count != pg_count ||
-                sh_it->second.pg_stripe_size != pg_stripe_size)
-            {
-                reshard_clean_db(pool_id, pg_count, pg_stripe_size);
-            }
-            first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
-        }
-        else
-        {
-            // Per-pool listing
-            first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
-            last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
-        }
-    }
-    // Copy clean_db entries
-    int stable_count = 0, stable_alloc = 0;
-    if (min_inode != max_inode)
-    {
-        for (auto shard_it = clean_db_shards.lower_bound(first_shard);
-            shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
-            shard_it++)
-        {
-            auto & clean_db = shard_it->second;
-            stable_alloc += clean_db.size();
-        }
-    }
-    else
-    {
-        stable_alloc = 32768;
-    }
+    // Copy clean_db entries (sorted)
+    int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
    obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
    if (!stable)
    {
@@ -533,11 +446,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        FINISH_OP(op);
        return;
    }
-    for (auto shard_it = clean_db_shards.lower_bound(first_shard);
-        shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
-        shard_it++)
    {
-        auto & clean_db = shard_it->second;
        auto clean_it = clean_db.begin(), clean_end = clean_db.end();
        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
        {
@@ -552,28 +461,26 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        }
        for (; clean_it != clean_end; clean_it++)
        {
-            if (stable_count >= stable_alloc)
+            if (!pg_count || ((clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
            {
-                stable_alloc *= 2;
-                stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
-                if (!stable)
+                if (stable_count >= stable_alloc)
                {
-                    op->retval = -ENOMEM;
-                    FINISH_OP(op);
-                    return;
+                    stable_alloc += 32768;
+                    stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
+                    if (!stable)
+                    {
+                        op->retval = -ENOMEM;
+                        FINISH_OP(op);
+                        return;
+                    }
                }
+                stable[stable_count++] = {
+                    .oid = clean_it->first,
+                    .version = clean_it->second.version,
+                };
            }
-            stable[stable_count++] = {
-                .oid = clean_it->first,
-                .version = clean_it->second.version,
-            };
        }
    }
-    if (first_shard != last_shard)
-    {
-        // If that's not a per-PG listing, sort clean entries
-        std::sort(stable, stable+stable_count);
-    }
    int clean_stable_count = stable_count;
    // Copy dirty_db entries (sorted, too)
    int unstable_count = 0, unstable_alloc = 0;
@@ -599,7 +506,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
        }
        for (; dirty_it != dirty_end; dirty_it++)
        {
-            if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
+            if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg) // like map_to_pg()
            {
                if (IS_DELETE(dirty_it->second.state))
                {
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@@ -204,17 +204,6 @@ typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;

 #include "blockstore_flush.h"

-typedef uint32_t pool_id_t;
-typedef uint64_t pool_pg_id_t;
-
-#define POOL_ID_BITS 16
-
-struct pool_shard_settings_t
-{
-    uint32_t pg_count;
-    uint32_t pg_stripe_size;
-};
-
 class blockstore_impl_t
 {
    /******* OPTIONS *******/
@@ -252,14 +241,11 @@ class blockstore_impl_t
    int throttle_target_parallelism = 1;
    // Minimum difference in microseconds between target and real execution times to throttle the response
    int throttle_threshold_us = 50;
-    // Maximum number of LIST operations to be processed between
-    int single_tick_list_limit = 1;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;

-    std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
-    std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
+    blockstore_clean_db_t clean_db;
    uint8_t *clean_bitmap = NULL;
    blockstore_dirty_db_t dirty_db;
    std::vector<blockstore_op_t*> submit_queue;
@@ -308,9 +294,6 @@ class blockstore_impl_t
    void open_journal();
    uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);

-    blockstore_clean_db_t& clean_db_shard(object_id oid);
-    void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
-
    // Journaling
    void prepare_journal_sector_write(int sector, blockstore_op_t *op);
    void handle_journal_write(ring_data_t *data, uint64_t flush_id);
--- a/src/blockstore_init.cpp
+++ b/src/blockstore_init.cpp
@@ -222,11 +222,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
        }
        if (entry->oid.inode > 0)
        {
-            auto & clean_db = bs->clean_db_shard(entry->oid);
-            auto clean_it = clean_db.find(entry->oid);
-            if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
+            auto clean_it = bs->clean_db.find(entry->oid);
+            if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
            {
-                if (clean_it != clean_db.end())
+                if (clean_it != bs->clean_db.end())
                {
                    // free the previous block
 #ifdef BLOCKSTORE_DEBUG
@@ -246,7 +245,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
                printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
 #endif
                bs->data_alloc->set(done_cnt+i, true);
-                clean_db[entry->oid] = (struct clean_entry){
+                bs->clean_db[entry->oid] = (struct clean_entry){
                    .version = entry->version,
                    .location = (done_cnt+i) << block_order,
                };
@@ -657,9 +656,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    init_write_sector = proc_pos;
                    return 0;
                }
-                auto & clean_db = bs->clean_db_shard(je->small_write.oid);
-                auto clean_it = clean_db.find(je->small_write.oid);
-                if (clean_it == clean_db.end() ||
+                auto clean_it = bs->clean_db.find(je->small_write.oid);
+                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->small_write.version)
                {
                    obj_ver_id ov = {
@@ -737,9 +735,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                        erase_dirty_object(dirty_it);
                    }
                }
-                auto & clean_db = bs->clean_db_shard(je->big_write.oid);
-                auto clean_it = clean_db.find(je->big_write.oid);
-                if (clean_it == clean_db.end() ||
+                auto clean_it = bs->clean_db.find(je->big_write.oid);
+                if (clean_it == bs->clean_db.end() ||
                    clean_it->second.version < je->big_write.version)
                {
                    // oid, version, block
@@ -844,9 +841,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    dirty_it--;
                    dirty_exists = dirty_it->first.oid == je->del.oid;
                }
-                auto & clean_db = bs->clean_db_shard(je->del.oid);
-                auto clean_it = clean_db.find(je->del.oid);
-                bool clean_exists = (clean_it != clean_db.end() &&
+                auto clean_it = bs->clean_db.find(je->del.oid);
+                bool clean_exists = (clean_it != bs->clean_db.end() &&
                    clean_it->second.version < je->del.version);
                if (!clean_exists && dirty_exists)
                {
@@ -905,9 +901,8 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
            break;
        }
    }
-    auto & clean_db = bs->clean_db_shard(oid);
-    auto clean_it = clean_db.find(oid);
-    uint64_t clean_loc = clean_it != clean_db.end()
+    auto clean_it = bs->clean_db.find(oid);
+    uint64_t clean_loc = clean_it != bs->clean_db.end()
        ? clean_it->second.location : UINT64_MAX;
    if (exists && clean_loc == UINT64_MAX)
    {
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@@ -111,7 +111,6 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse

 int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
 {
-    auto & clean_db = clean_db_shard(read_op->oid);
    auto clean_it = clean_db.find(read_op->oid);
    auto dirty_it = dirty_db.upper_bound((obj_ver_id){
        .oid = read_op->oid,
@@ -298,7 +297,6 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
            dirty_it--;
        }
    }
-    auto & clean_db = clean_db_shard(oid);
    auto clean_it = clean_db.find(oid);
    if (clean_it != clean_db.end())
    {
--- a/src/blockstore_stable.cpp
+++ b/src/blockstore_stable.cpp
@@ -54,7 +54,6 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
        auto dirty_it = dirty_db.find(*v);
        if (dirty_it == dirty_db.end())
        {
-            auto & clean_db = clean_db_shard(v->oid);
            auto clean_it = clean_db.find(v->oid);
            if (clean_it == clean_db.end() || clean_it->second.version < v->version)
            {
@@ -189,7 +188,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
                    }
                    if (exists == -1)
                    {
-                        auto & clean_db = clean_db_shard(v.oid);
                        auto clean_it = clean_db.find(v.oid);
                        exists = clean_it != clean_db.end() ? 1 : 0;
                    }
@@ -217,7 +215,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
                        break;
                    }
                }
-                auto & clean_db = clean_db_shard(v.oid);
                auto clean_it = clean_db.find(v.oid);
                uint64_t clean_loc = clean_it != clean_db.end()
                    ? clean_it->second.location : UINT64_MAX;
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@@ -41,7 +41,6 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
    }
    if (!found)
    {
-        auto & clean_db = clean_db_shard(op->oid);
        auto clean_it = clean_db.find(op->oid);
        if (clean_it != clean_db.end())
        {
@@ -544,13 +543,12 @@ resume_4:
            if (ref_us > exec_us + throttle_threshold_us)
            {
                // Pause reply
-                PRIV(op)->op_state = 5;
-                // Remember that the timer can in theory be called right here
                tfd->set_timer_us(ref_us-exec_us, false, [this, op](int timer_id)
                {
                    PRIV(op)->op_state++;
                    ringloop->wakeup();
                });
+                PRIV(op)->op_state = 5;
                return 1;
            }
        }
--- a/src/cli.cpp
+++ b/src/cli.cpp
@@ -2,7 +2,8 @@
 // License: VNPL-1.1 (see README.md for details)

 /**
- * CLI tool and also a library for administrative tasks
+ * CLI tool
+ * Currently can (a) remove inodes and (b) merge snapshot/clone layers
 */

 #include <vector>
@@ -16,9 +17,7 @@

 static const char *exe_name = NULL;

-static void help();
-
-static json11::Json::object parse_args(int narg, const char *args[])
+json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
 {
    json11::Json::object cfg;
    json11::Json::array cmd;
@@ -80,16 +79,13 @@ static json11::Json::object parse_args(int narg, const char *args[])
    return cfg;
 }

-static void help()
+void cli_tool_t::help()
 {
    printf(
        "Vitastor command-line tool\n"
        "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
        "\n"
        "USAGE:\n"
-        "%s status\n"
-        "  Show cluster status\n"
-        "\n"
        "%s df\n"
        "  Show pool space statistics\n"
        "\n"
@@ -159,132 +155,223 @@ static void help()
        "  --no-color          Disable colored output\n"
        "  --json              JSON output\n"
        ,
-        exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
+        exe_name, exe_name, exe_name, exe_name, exe_name, exe_name,
        exe_name, exe_name, exe_name, exe_name, exe_name, exe_name
    );
    exit(0);
 }

-static int run(cli_tool_t *p, json11::Json cfg)
+void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
+{
+    auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
+    if (cur_cfg_it == cli->st_cli.inode_config.end())
+    {
+        fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
+        exit(1);
+    }
+    inode_config_t new_cfg = cur_cfg_it->second;
+    std::string cur_name = new_cfg.name;
+    std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
+        "/config/inode/"+std::to_string(INODE_POOL(cur))+
+        "/"+std::to_string(INODE_NO_POOL(cur)));
+    new_cfg.parent_id = new_parent;
+    json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
+    waiting++;
+    cli->st_cli.etcd_txn_slow(json11::Json::object {
+        { "compare", json11::Json::array {
+            json11::Json::object {
+                { "target", "MOD" },
+                { "key", cur_cfg_key },
+                { "result", "LESS" },
+                { "mod_revision", new_cfg.mod_revision+1 },
+            },
+        } },
+        { "success", json11::Json::array {
+            json11::Json::object {
+                { "request_put", json11::Json::object {
+                    { "key", cur_cfg_key },
+                    { "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
+                } }
+            },
+        } },
+    }, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
+    {
+        if (err != "")
+        {
+            fprintf(stderr, "Error changing parent of %s: %s\n", cur_name.c_str(), err.c_str());
+            exit(1);
+        }
+        if (!res["succeeded"].bool_value())
+        {
+            fprintf(stderr, "Inode %s was modified during snapshot deletion\n", cur_name.c_str());
+            exit(1);
+        }
+        if (new_parent)
+        {
+            auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
+            std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
+                ? new_parent_it->second.name : "<unknown>";
+            printf(
+                "Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
+                cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
+                new_parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
+            );
+        }
+        else
+        {
+            printf(
+                "Parent of layer %s (inode %lu in pool %u) detached\n",
+                cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
+            );
+        }
+        waiting--;
+        ringloop->wakeup();
+    });
+}
+
+void cli_tool_t::etcd_txn(json11::Json txn)
+{
+    waiting++;
+    cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
+    {
+        waiting--;
+        if (err != "")
+        {
+            fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
+            exit(1);
+        }
+        etcd_result = res;
+        ringloop->wakeup();
+    });
+}
+
+inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
+{
+    for (auto & ic: cli->st_cli.inode_config)
+    {
+        if (ic.second.name == name)
+        {
+            return &ic.second;
+        }
+    }
+    fprintf(stderr, "Layer %s not found\n", name.c_str());
+    exit(1);
+}
+
+void cli_tool_t::run(json11::Json cfg)
 {
-    p->parse_config(cfg);
    json11::Json::array cmd = cfg["command"].array_items();
-    std::function<bool(cli_result_t &)> action_cb;
    if (!cmd.size())
    {
        fprintf(stderr, "command is missing\n");
-        return EINVAL;
-    }
-    else if (cmd[0] == "status")
-    {
-        // Show cluster status
-        action_cb = p->start_status(cfg);
+        exit(1);
    }
    else if (cmd[0] == "df")
    {
        // Show pool space stats
-        action_cb = p->start_df(cfg);
+        action_cb = start_df(cfg);
    }
    else if (cmd[0] == "ls")
    {
        // List images
-        action_cb = p->start_ls(cfg);
+        action_cb = start_ls(cfg);
    }
    else if (cmd[0] == "create" || cmd[0] == "snap-create")
    {
        // Create image/snapshot
-        action_cb = p->start_create(cfg);
+        action_cb = start_create(cfg);
    }
    else if (cmd[0] == "modify")
    {
        // Modify image
-        action_cb = p->start_modify(cfg);
+        action_cb = start_modify(cfg);
    }
    else if (cmd[0] == "rm-data")
    {
        // Delete inode data
-        action_cb = p->start_rm(cfg);
+        action_cb = start_rm(cfg);
    }
    else if (cmd[0] == "merge-data")
    {
        // Merge layer data without affecting metadata
-        action_cb = p->start_merge(cfg);
+        action_cb = start_merge(cfg);
    }
    else if (cmd[0] == "flatten")
    {
        // Merge layer data without affecting metadata
-        action_cb = p->start_flatten(cfg);
+        action_cb = start_flatten(cfg);
    }
    else if (cmd[0] == "rm")
    {
        // Remove multiple snapshots and rebase their children
-        action_cb = p->start_snap_rm(cfg);
+        action_cb = start_snap_rm(cfg);
    }
    else if (cmd[0] == "alloc-osd")
    {
        // Allocate a new OSD number
-        action_cb = p->start_alloc_osd(cfg);
+        action_cb = start_alloc_osd(cfg);
    }
    else if (cmd[0] == "simple-offsets")
    {
        // Calculate offsets for simple & stupid OSD deployment without superblock
-        action_cb = p->simple_offsets(cfg);
+        action_cb = simple_offsets(cfg);
    }
    else
    {
        fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str());
-        return EINVAL;
+        exit(1);
    }
    if (action_cb == NULL)
    {
-        return 0;
+        return;
    }
+    color = !cfg["no-color"].bool_value();
+    json_output = cfg["json"].bool_value();
+    iodepth = cfg["iodepth"].uint64_value();
+    if (!iodepth)
+        iodepth = 32;
+    parallel_osds = cfg["parallel_osds"].uint64_value();
+    if (!parallel_osds)
+        parallel_osds = 4;
+    log_level = cfg["log_level"].int64_value();
+    progress = cfg["progress"].uint64_value() ? true : false;
+    list_first = cfg["wait-list"].uint64_value() ? true : false;
    // Create client
-    p->ringloop = new ring_loop_t(512);
-    p->epmgr = new epoll_manager_t(p->ringloop);
-    p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg);
-    // Smaller timeout by default for more interactiveness
-    p->cli->st_cli.etcd_slow_timeout = p->cli->st_cli.etcd_quick_timeout;
-    ring_consumer_t consumer;
-    cli_result_t result;
-    p->cli->on_ready([&]()
+    ringloop = new ring_loop_t(512);
+    epmgr = new epoll_manager_t(ringloop);
+    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
+    cli->on_ready([this]()
    {
        // Initialize job
-        consumer.loop = [&]()
+        consumer.loop = [this]()
        {
            if (action_cb != NULL)
            {
-                bool done = action_cb(result);
+                bool done = action_cb();
                if (done)
                {
                    action_cb = NULL;
                }
            }
-            p->ringloop->submit();
+            ringloop->submit();
        };
-        p->ringloop->register_consumer(&consumer);
+        ringloop->register_consumer(&consumer);
        consumer.loop();
    });
    // Loop until it completes
    while (action_cb != NULL)
    {
-        p->ringloop->loop();
+        ringloop->loop();
        if (action_cb != NULL)
-            p->ringloop->wait();
-    }
-    // Print result
-    if (result.text != "")
-    {
-        fprintf(stderr, "%s\n", result.text.c_str());
+            ringloop->wait();
    }
    // Destroy the client
-    delete p->cli;
-    delete p->epmgr;
-    delete p->ringloop;
-    p->cli = NULL;
-    p->epmgr = NULL;
-    p->ringloop = NULL;
-    return result.err;
+    delete cli;
+    delete epmgr;
+    delete ringloop;
+    cli = NULL;
+    epmgr = NULL;
+    ringloop = NULL;
 }

 int main(int narg, const char *args[])
@@ -293,7 +380,7 @@ int main(int narg, const char *args[])
    setvbuf(stderr, NULL, _IONBF, 0);
    exe_name = args[0];
    cli_tool_t *p = new cli_tool_t();
-    int r = run(p, parse_args(narg, args));
+    p->run(cli_tool_t::parse_args(narg, args));
    delete p;
-    return r;
+    return 0;
 }
--- a/src/cli.h
+++ b/src/cli.h
@@ -19,13 +19,6 @@ class epoll_manager_t;
 class cluster_client_t;
 struct inode_config_t;

-struct cli_result_t
-{
-    int err;
-    std::string text;
-    json11::Json data;
-};
-
 class cli_tool_t
 {
 public:
@@ -42,28 +35,32 @@ public:

    int waiting = 0;
    json11::Json etcd_result;
+    ring_consumer_t consumer;
+    std::function<bool(void)> action_cb;

-    void parse_config(json11::Json cfg);
+    void run(json11::Json cfg);

    void change_parent(inode_t cur, inode_t new_parent);
    inode_config_t* get_inode_cfg(const std::string & name);

+    static json11::Json::object parse_args(int narg, const char *args[]);
+    static void help();
+
    friend struct rm_inode_t;
    friend struct snap_merger_t;
    friend struct snap_flattener_t;
    friend struct snap_remover_t;

-    std::function<bool(cli_result_t &)> start_status(json11::Json);
-    std::function<bool(cli_result_t &)> start_df(json11::Json);
-    std::function<bool(cli_result_t &)> start_ls(json11::Json);
-    std::function<bool(cli_result_t &)> start_create(json11::Json);
-    std::function<bool(cli_result_t &)> start_modify(json11::Json);
-    std::function<bool(cli_result_t &)> start_rm(json11::Json);
-    std::function<bool(cli_result_t &)> start_merge(json11::Json);
-    std::function<bool(cli_result_t &)> start_flatten(json11::Json);
-    std::function<bool(cli_result_t &)> start_snap_rm(json11::Json);
-    std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
-    std::function<bool(cli_result_t &)> simple_offsets(json11::Json cfg);
+    std::function<bool(void)> start_df(json11::Json);
+    std::function<bool(void)> start_ls(json11::Json);
+    std::function<bool(void)> start_create(json11::Json);
+    std::function<bool(void)> start_modify(json11::Json);
+    std::function<bool(void)> start_rm(json11::Json);
+    std::function<bool(void)> start_merge(json11::Json);
+    std::function<bool(void)> start_flatten(json11::Json);
+    std::function<bool(void)> start_snap_rm(json11::Json);
+    std::function<bool(void)> start_alloc_osd(json11::Json cfg, uint64_t *out = NULL);
+    std::function<bool(void)> simple_offsets(json11::Json cfg);

    void etcd_txn(json11::Json txn);
 };
@@ -72,7 +69,7 @@ uint64_t parse_size(std::string size_str);

 std::string print_table(json11::Json items, json11::Json header, bool use_esc);

-std::string format_size(uint64_t size, bool nobytes = false);
+std::string format_size(uint64_t size);

 std::string format_lat(uint64_t lat);

--- a/src/cli_alloc_osd.cpp
+++ b/src/cli_alloc_osd.cpp
@@ -102,20 +102,20 @@ struct alloc_osd_t
    }
 };

-std::function<bool(cli_result_t &)> cli_tool_t::start_alloc_osd(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t *out)
 {
    json11::Json::array cmd = cfg["command"].array_items();
    auto alloc_osd = new alloc_osd_t();
    alloc_osd->parent = this;
-    return [alloc_osd](cli_result_t & result)
+    return [alloc_osd, out]()
    {
        alloc_osd->loop();
        if (alloc_osd->is_done())
        {
-            result = (cli_result_t){
-                .text = std::to_string(alloc_osd->new_id),
-                .data = json11::Json(alloc_osd->new_id),
-            };
+            if (out)
+                *out = alloc_osd->new_id;
+            else if (alloc_osd->new_id)
+                printf("%lu\n", alloc_osd->new_id);
            delete alloc_osd;
            return true;
        }
--- a/src/cli_common.cpp
+++ b/src/cli_common.cpp
@@ -1,118 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include "base64.h"
-#include "cluster_client.h"
-#include "cli.h"
-
-void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
-{
-    auto cur_cfg_it = cli->st_cli.inode_config.find(cur);
-    if (cur_cfg_it == cli->st_cli.inode_config.end())
-    {
-        fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
-        exit(1);
-    }
-    inode_config_t new_cfg = cur_cfg_it->second;
-    std::string cur_name = new_cfg.name;
-    std::string cur_cfg_key = base64_encode(cli->st_cli.etcd_prefix+
-        "/config/inode/"+std::to_string(INODE_POOL(cur))+
-        "/"+std::to_string(INODE_NO_POOL(cur)));
-    new_cfg.parent_id = new_parent;
-    json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
-    waiting++;
-    cli->st_cli.etcd_txn_slow(json11::Json::object {
-        { "compare", json11::Json::array {
-            json11::Json::object {
-                { "target", "MOD" },
-                { "key", cur_cfg_key },
-                { "result", "LESS" },
-                { "mod_revision", new_cfg.mod_revision+1 },
-            },
-        } },
-        { "success", json11::Json::array {
-            json11::Json::object {
-                { "request_put", json11::Json::object {
-                    { "key", cur_cfg_key },
-                    { "value", base64_encode(json11::Json(cur_cfg_json).dump()) },
-                } }
-            },
-        } },
-    }, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
-    {
-        if (err != "")
-        {
-            fprintf(stderr, "Error changing parent of %s: %s\n", cur_name.c_str(), err.c_str());
-            exit(1);
-        }
-        if (!res["succeeded"].bool_value())
-        {
-            fprintf(stderr, "Inode %s was modified during snapshot deletion\n", cur_name.c_str());
-            exit(1);
-        }
-        if (new_parent)
-        {
-            auto new_parent_it = cli->st_cli.inode_config.find(new_parent);
-            std::string new_parent_name = new_parent_it != cli->st_cli.inode_config.end()
-                ? new_parent_it->second.name : "<unknown>";
-            printf(
-                "Parent of layer %s (inode %lu in pool %u) changed to %s (inode %lu in pool %u)\n",
-                cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur),
-                new_parent_name.c_str(), INODE_NO_POOL(new_parent), INODE_POOL(new_parent)
-            );
-        }
-        else
-        {
-            printf(
-                "Parent of layer %s (inode %lu in pool %u) detached\n",
-                cur_name.c_str(), INODE_NO_POOL(cur), INODE_POOL(cur)
-            );
-        }
-        waiting--;
-        ringloop->wakeup();
-    });
-}
-
-void cli_tool_t::etcd_txn(json11::Json txn)
-{
-    waiting++;
-    cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
-    {
-        waiting--;
-        if (err != "")
-        {
-            fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
-            exit(1);
-        }
-        etcd_result = res;
-        ringloop->wakeup();
-    });
-}
-
-inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
-{
-    for (auto & ic: cli->st_cli.inode_config)
-    {
-        if (ic.second.name == name)
-        {
-            return &ic.second;
-        }
-    }
-    fprintf(stderr, "Layer %s not found\n", name.c_str());
-    exit(1);
-}
-
-void cli_tool_t::parse_config(json11::Json cfg)
-{
-    color = !cfg["no-color"].bool_value();
-    json_output = cfg["json"].bool_value();
-    iodepth = cfg["iodepth"].uint64_value();
-    if (!iodepth)
-        iodepth = 32;
-    parallel_osds = cfg["parallel_osds"].uint64_value();
-    if (!parallel_osds)
-        parallel_osds = 4;
-    log_level = cfg["log_level"].int64_value();
-    progress = cfg["progress"].uint64_value() ? true : false;
-    list_first = cfg["wait-list"].uint64_value() ? true : false;
-}
--- a/src/cli_create.cpp
+++ b/src/cli_create.cpp
@@ -33,7 +33,6 @@ struct image_creator_t
    uint64_t max_id_mod_rev = 0, cfg_mod_rev = 0, idx_mod_rev = 0;

    int state = 0;
-    cli_result_t result;

    bool is_done()
    {
@@ -44,27 +43,13 @@ struct image_creator_t
    {
        if (state >= 1)
            goto resume_1;
-        if (image_name == "")
-        {
-            // FIXME: EINVAL -> specific codes for every error
-            result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
-            state = 100;
-            return;
-        }
-        if (image_name.find('@') != std::string::npos)
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Image name can't contain @ character" };
-            state = 100;
-            return;
-        }
        if (new_pool_id)
        {
            auto & pools = parent->cli->st_cli.pool_config;
            if (pools.find(new_pool_id) == pools.end())
            {
-                result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
-                state = 100;
-                return;
+                fprintf(stderr, "Pool %u does not exist\n", new_pool_id);
+                exit(1);
            }
        }
        else if (new_pool_name != "")
@@ -79,9 +64,8 @@ struct image_creator_t
            }
            if (!new_pool_id)
            {
-                result = (cli_result_t){ .err = ENOENT, .text = "Pool "+new_pool_name+" does not exist" };
-                state = 100;
-                return;
+                fprintf(stderr, "Pool %s does not exist\n", new_pool_name.c_str());
+                exit(1);
            }
        }
        else if (parent->cli->st_cli.pool_config.size() == 1)
@@ -107,9 +91,8 @@ struct image_creator_t
        {
            if (ic.second.name == image_name)
            {
-                result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
-                state = 100;
-                return;
+                fprintf(stderr, "Image %s already exists\n", image_name.c_str());
+                exit(1);
            }
            if (ic.second.name == new_parent)
            {
@@ -126,21 +109,18 @@ struct image_creator_t
        }
        if (new_parent != "" && !new_parent_id)
        {
-            result = (cli_result_t){ .err = ENOENT, .text = "Parent image "+new_parent+" not found" };
-            state = 100;
-            return;
+            fprintf(stderr, "Parent image not found\n");
+            exit(1);
        }
        if (!new_pool_id)
        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Pool name or ID is missing" };
-            state = 100;
-            return;
+            fprintf(stderr, "Pool name or ID is missing\n");
+            exit(1);
        }
        if (!size)
        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Image size is missing" };
-            state = 100;
-            return;
+            fprintf(stderr, "Image size is missing\n");
+            exit(1);
        }
        do
        {
@@ -160,12 +140,14 @@ resume_3:
            if (!parent->etcd_result["succeeded"].bool_value() &&
                parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
            {
-                result = (cli_result_t){ .err = EEXIST, .text = "Image "+image_name+" already exists" };
-                state = 100;
-                return;
+                fprintf(stderr, "Image %s already exists\n", image_name.c_str());
+                exit(1);
            }
        } while (!parent->etcd_result["succeeded"].bool_value());
-        result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" created" };
+        if (parent->progress)
+        {
+            printf("Image %s created\n", image_name.c_str());
+        }
        state = 100;
    }

@@ -181,16 +163,14 @@ resume_3:
        {
            if (ic.second.name == image_name+"@"+new_snap)
            {
-                result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
-                state = 100;
-                return;
+                fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
+                exit(1);
            }
        }
        if (new_parent != "")
        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Parent can't be specified for snapshots" };
-            state = 100;
-            return;
+            fprintf(stderr, "--parent can't be used with snapshots\n");
+            exit(1);
        }
        do
        {
@@ -202,9 +182,8 @@ resume_3:
                return;
            if (!old_id)
            {
-                result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
-                state = 100;
-                return;
+                fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
+                exit(1);
            }
            if (!new_pool_id)
            {
@@ -219,12 +198,14 @@ resume_4:
            if (!parent->etcd_result["succeeded"].bool_value() &&
                parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
            {
-                result = (cli_result_t){ .err = EEXIST, .text = "Snapshot "+image_name+"@"+new_snap+" already exists" };
-                state = 100;
-                return;
+                fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
+                exit(1);
            }
        } while (!parent->etcd_result["succeeded"].bool_value());
-        result = (cli_result_t){ .err = 0, .text = "Snapshot "+image_name+"@"+new_snap+" created" };
+        if (parent->progress)
+        {
+            printf("Snapshot %s@%s created\n", image_name.c_str(), new_snap.c_str());
+        }
        state = 100;
    }

@@ -307,9 +288,8 @@ resume_2:
                idx_mod_rev = kv.mod_revision;
                if (!old_id || !old_pool_id || old_pool_id >= POOL_ID_MAX)
                {
-                    result = (cli_result_t){ .err = ENOENT, .text = "Invalid pool or inode ID in etcd key "+kv.key };
-                    state = 100;
-                    return;
+                    fprintf(stderr, "Invalid pool or inode ID in etcd key %s\n", kv.key.c_str());
+                    exit(1);
                }
            }
            parent->etcd_txn(json11::Json::object {
@@ -477,24 +457,25 @@ uint64_t parse_size(std::string size_str)
    if (type_char == 'k' || type_char == 'm' || type_char == 'g' || type_char == 't')
    {
        if (type_char == 'k')
-            mul = (uint64_t)1<<10;
+            mul = 1l<<10;
        else if (type_char == 'm')
-            mul = (uint64_t)1<<20;
+            mul = 1l<<20;
        else if (type_char == 'g')
-            mul = (uint64_t)1<<30;
+            mul = 1l<<30;
        else /*if (type_char == 't')*/
-            mul = (uint64_t)1<<40;
+            mul = 1l<<40;
        size_str = size_str.substr(0, size_str.length()-1);
    }
    uint64_t size = json11::Json(size_str).uint64_value() * mul;
    if (size == 0 && size_str != "0" && (size_str != "" || mul != 1))
    {
-        return UINT64_MAX;
+        fprintf(stderr, "Invalid syntax for size: %s\n", size_str.c_str());
+        exit(1);
    }
    return size;
 }

-std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_create(json11::Json cfg)
 {
    json11::Json::array cmd = cfg["command"].array_items();
    auto image_creator = new image_creator_t();
@@ -511,12 +492,8 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
        int p = image_creator->image_name.find('@');
        if (p == std::string::npos || p == image_creator->image_name.length()-1)
        {
-            delete image_creator;
-            return [](cli_result_t & result)
-            {
-                result = (cli_result_t){ .err = EINVAL, .text = "Please specify new snapshot name after @" };
-                return true;
-            };
+            fprintf(stderr, "Please specify new snapshot name after @\n");
+            exit(1);
        }
        image_creator->new_snap = image_creator->image_name.substr(p + 1);
        image_creator->image_name = image_creator->image_name.substr(0, p);
@@ -525,39 +502,32 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
    if (cfg["size"].string_value() != "")
    {
        image_creator->size = parse_size(cfg["size"].string_value());
-        if (image_creator->size == UINT64_MAX)
-        {
-            return [size = cfg["size"].string_value()](cli_result_t & result)
-            {
-                result = (cli_result_t){ .err = EINVAL, .text = "Invalid syntax for size: "+size };
-                return true;
-            };
-        }
        if (image_creator->size % 4096)
        {
-            delete image_creator;
-            return [](cli_result_t & result)
-            {
-                result = (cli_result_t){ .err = EINVAL, .text = "Size should be a multiple of 4096" };
-                return true;
-            };
+            fprintf(stderr, "Size should be a multiple of 4096\n");
+            exit(1);
        }
        if (image_creator->new_snap != "")
        {
-            delete image_creator;
-            return [](cli_result_t & result)
-            {
-                result = (cli_result_t){ .err = EINVAL, .text = "Size can't be specified for snapshots" };
-                return true;
-            };
+            fprintf(stderr, "--size can't be specified for snapshots\n");
+            exit(1);
        }
    }
-    return [image_creator](cli_result_t & result)
+    if (image_creator->image_name == "")
+    {
+        fprintf(stderr, "Image name is missing\n");
+        exit(1);
+    }
+    if (image_creator->image_name.find('@') != std::string::npos)
+    {
+        fprintf(stderr, "Image name can't contain @ character\n");
+        exit(1);
+    }
+    return [image_creator]()
    {
        image_creator->loop();
        if (image_creator->is_done())
        {
-            result = image_creator->result;
            delete image_creator;
            return true;
        }
--- a/src/cli_df.cpp
+++ b/src/cli_df.cpp
@@ -12,7 +12,6 @@ struct pool_lister_t

    int state = 0;
    json11::Json space_info;
-    cli_result_t result;
    std::map<pool_id_t, json11::Json::object> pool_stats;

    bool is_done()
@@ -125,8 +124,8 @@ resume_1:
                { "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
                    ? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
                    : "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
-                { "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * ((uint64_t)1<<40)) },
-                { "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * ((uint64_t)1<<40)) },
+                { "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * (1l<<40)) },
+                { "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * (1l<<40)) },
                { "max_available", pool_avail },
                { "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
                { "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
@@ -151,10 +150,10 @@ resume_1:
        get_stats();
        if (parent->waiting > 0)
            return;
-        result.data = to_list();
        if (parent->json_output)
        {
            // JSON output
+            printf("%s\n", json11::Json(to_list()).dump().c_str());
            state = 100;
            return;
        }
@@ -207,22 +206,21 @@ resume_1:
                : 100)+"%";
            kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
        }
-        result.text = print_table(result.data, cols, parent->color);
+        printf("%s", print_table(to_list(), cols, parent->color).c_str());
        state = 100;
    }
 };

-std::function<bool(cli_result_t &)> cli_tool_t::start_df(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_df(json11::Json cfg)
 {
    json11::Json::array cmd = cfg["command"].array_items();
    auto lister = new pool_lister_t();
    lister->parent = this;
-    return [lister](cli_result_t & result)
+    return [lister]()
    {
        lister->loop();
        if (lister->is_done())
        {
-            result = lister->result;
            delete lister;
            return true;
        }
--- a/src/cli_flatten.cpp
+++ b/src/cli_flatten.cpp
@@ -22,8 +22,7 @@ struct snap_flattener_t
    std::string top_parent_name;
    inode_t target_id = 0;
    int state = 0;
-    std::function<bool(cli_result_t &)> merger_cb;
-    cli_result_t result;
+    std::function<bool(void)> merger_cb;

    void get_merge_parents()
    {
@@ -38,34 +37,23 @@ struct snap_flattener_t
            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
            if (it == parent->cli->st_cli.inode_config.end())
            {
-                result = (cli_result_t){
-                    .err = ENOENT,
-                    .text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
-                    .data = json11::Json::object {
-                        { "error", "parent-not-found" },
-                        { "inode_id", cur->num },
-                        { "inode_name", cur->name },
-                        { "parent_id", cur->parent_id },
-                    },
-                };
-                state = 100;
-                return;
+                fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
+                exit(1);
            }
            cur = &it->second;
            chain_list.push_back(cur->num);
        }
        if (cur->parent_id != 0)
        {
-            result = (cli_result_t){ .err = EBADF, .text = "Layer "+target_name+" has a loop in parents" };
-            state = 100;
-            return;
+            fprintf(stderr, "Layer %s has a loop in parents\n", target_name.c_str());
+            exit(1);
        }
        top_parent_name = cur->name;
    }

    bool is_done()
    {
-        return state == 100;
+        return state == 3; // loop() ends in state 3 (resume_3); no visible path ever sets 5, so "== 5" would never report completion
    }

    void loop()
@@ -76,16 +64,8 @@ struct snap_flattener_t
            goto resume_2;
        else if (state == 3)
            goto resume_3;
-        if (target_name == "")
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Layer to flatten not specified" };
-            state = 100;
-            return;
-        }
        // Get parent layers
        get_merge_parents();
-        if (state == 100)
-            return;
        // Start merger
        merger_cb = parent->start_merge(json11::Json::object {
            { "command", json11::Json::array{ "merge-data", top_parent_name, target_name } },
@@ -96,17 +76,12 @@ struct snap_flattener_t
        });
        // Wait for it
 resume_1:
-        while (!merger_cb(result))
+        while (!merger_cb())
        {
            state = 1;
            return;
        }
        merger_cb = NULL;
-        if (result.err)
-        {
-            state = 100;
-            return;
-        }
        // Change parent
        parent->change_parent(target_id, 0);
        // Wait for it to complete
@@ -117,27 +92,31 @@ resume_2:
        state = 3;
 resume_3:
        // Done
-        state = 100;
+        return;
    }
 };

-std::function<bool(cli_result_t &)> cli_tool_t::start_flatten(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_flatten(json11::Json cfg)
 {
    json11::Json::array cmd = cfg["command"].array_items();
    auto flattener = new snap_flattener_t();
    flattener->parent = this;
    flattener->target_name = cmd.size() > 1 ? cmd[1].string_value() : "";
+    if (flattener->target_name == "")
+    {
+        fprintf(stderr, "Layer to flatten argument is missing\n");
+        exit(1);
+    }
    flattener->fsync_interval = cfg["fsync-interval"].uint64_value();
    if (!flattener->fsync_interval)
        flattener->fsync_interval = 128;
    if (!cfg["cas"].is_null())
        flattener->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
-    return [flattener](cli_result_t & result)
+    return [flattener]()
    {
        flattener->loop();
        if (flattener->is_done())
        {
-            result = flattener->result;
            delete flattener;
            return true;
        }
--- a/src/cli_ls.cpp
+++ b/src/cli_ls.cpp
@@ -24,7 +24,6 @@ struct image_lister_t
    int state = 0;
    std::map<inode_t, json11::Json::object> stats;
    json11::Json space_info;
-    cli_result_t result;

    bool is_done()
    {
@@ -45,9 +44,8 @@ struct image_lister_t
            }
            if (!list_pool_id)
            {
-                result = (cli_result_t){ .err = ENOENT, .text = "Pool "+list_pool_name+" does not exist" };
-                state = 100;
-                return;
+                fprintf(stderr, "Pool %s does not exist\n", list_pool_name.c_str());
+                exit(1);
            }
        }
        for (auto & ic: parent->cli->st_cli.inode_config)
@@ -156,7 +154,7 @@ resume_1:
            if (pool_it != parent->cli->st_cli.pool_config.end())
            {
                auto & pool_cfg = pool_it->second;
-                used_size = used_size / (pool_pg_real_size[pool_id] ? pool_pg_real_size[pool_id] : 1)
+                used_size = used_size / (pool_pg_real_size[pool_id] ? pool_pg_real_size[pool_id] : 1) // keep the zero guard: a 0 entry would be an integer division by zero
                    * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
            }
            auto stat_it = stats.find(inode_num);
@@ -248,10 +246,10 @@ resume_1:
            if (parent->waiting > 0)
                return;
        }
-        result.data = to_list();
        if (parent->json_output)
        {
            // JSON output
+            printf("%s\n", json11::Json(to_list()).dump().c_str());
            state = 100;
            return;
        }
@@ -361,7 +359,7 @@ resume_1:
            kv.second["size_fmt"] = format_size(kv.second["size"].uint64_value());
            kv.second["ro"] = kv.second["readonly"].bool_value() ? "RO" : "-";
        }
-        result.text = print_table(to_list(), cols, parent->color);
+        printf("%s", print_table(to_list(), cols, parent->color).c_str());
        state = 100;
    }
 };
@@ -438,26 +436,23 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
    return str;
 }

-static uint64_t size_thresh[] = { (uint64_t)1024*1024*1024*1024, (uint64_t)1024*1024*1024, (uint64_t)1024*1024, 1024, 0 };
-static uint64_t size_thresh_d[] = { (uint64_t)1000000000000, (uint64_t)1000000000, (uint64_t)1000000, (uint64_t)1000, 0 };
-static const int size_thresh_n = sizeof(size_thresh)/sizeof(size_thresh[0]);
+static uint64_t size_thresh[] = { 1024ull*1024*1024*1024, 1024ull*1024*1024, 1024ull*1024, 1024, 0 }; // ull: the products must not overflow where long is 32 bits
 static const char *size_unit = "TGMKB";

-std::string format_size(uint64_t size, bool nobytes)
+std::string format_size(uint64_t size)
 {
-    uint64_t *thr = nobytes ? size_thresh_d : size_thresh;
    char buf[256];
-    for (int i = 0; i < size_thresh_n; i++)
+    for (int i = 0; i < sizeof(size_thresh)/sizeof(size_thresh[0]); i++)
    {
-        if (size >= thr[i] || i >= size_thresh_n-1)
+        if (size >= size_thresh[i] || i >= sizeof(size_thresh)/sizeof(size_thresh[0])-1)
        {
-            double value = thr[i] ? (double)size/thr[i] : size;
+            double value = size_thresh[i] ? (double)size/size_thresh[i] : size;
            int l = snprintf(buf, sizeof(buf), "%.1f", value);
            assert(l < sizeof(buf)-2);
            if (buf[l-1] == '0')
                l -= 2;
-            buf[l] = i == size_thresh_n-1 && nobytes ? 0 : ' ';
-            buf[l+1] = i == size_thresh_n-1 && nobytes ? 0 : size_unit[i];
+            buf[l] = ' ';
+            buf[l+1] = size_unit[i];
            buf[l+2] = 0;
            break;
        }
@@ -548,7 +543,7 @@ back:
    return true;
 }

-std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
 {
    json11::Json::array cmd = cfg["command"].array_items();
    auto lister = new image_lister_t();
@@ -564,12 +559,11 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_ls(json11::Json cfg)
    {
        lister->only_names.insert(cmd[i].string_value());
    }
-    return [lister](cli_result_t & result)
+    return [lister]()
    {
        lister->loop();
        if (lister->is_done())
        {
-            result = lister->result;
            delete lister;
            return true;
        }
--- a/src/cli_merge.cpp
+++ b/src/cli_merge.cpp
@@ -12,9 +12,6 @@ struct snap_rw_op_t
    cluster_op_t op;
    int todo = 0;
    uint32_t start = 0, end = 0;
-    int error_code = 0;
-    uint64_t error_offset = 0;
-    bool error_read = false;
 };

 // Layer merge is the base for multiple operations:
@@ -57,27 +54,17 @@ struct snap_merger_t
    uint64_t last_written_offset = 0;
    int deleted_unsynced = 0;
    uint64_t processed = 0, to_process = 0;
-    std::string rwo_error;
-
-    cli_result_t result;

    void start_merge()
    {
-        if (from_name == "" || to_name == "")
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Beginning or end of the merge sequence is missing" };
-            state = 100;
-            return;
-        }
        check_delete_source = delete_source || check_delete_source;
        inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
        inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
        inode_config_t *target_cfg = target_name == "" ? from_cfg : parent->get_inode_cfg(target_name);
        if (to_cfg->num == from_cfg->num)
        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Only one layer specified, nothing to merge" };
-            state = 100;
-            return;
+            fprintf(stderr, "Only one layer specified, nothing to merge\n");
+            exit(1);
        }
        // Check that to_cfg is actually a child of from_cfg and target_cfg is somewhere between them
        std::vector<inode_t> chain_list;
@@ -91,18 +78,8 @@ struct snap_merger_t
            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
            if (it == parent->cli->st_cli.inode_config.end())
            {
-                result = (cli_result_t){
-                    .err = ENOENT,
-                    .text = "Parent inode of layer "+cur->name+" (id "+std::to_string(cur->parent_id)+") does not exist",
-                    .data = json11::Json::object {
-                        { "error", "parent-not-found" },
-                        { "inode_id", cur->num },
-                        { "inode_name", cur->name },
-                        { "parent_id", cur->parent_id },
-                    },
-                };
-                state = 100;
-                return;
+                fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
+                exit(1);
            }
            cur = &it->second;
            chain_list.push_back(cur->num);
@@ -110,9 +87,8 @@ struct snap_merger_t
        }
        if (cur->parent_id != from_cfg->num)
        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
-            state = 100;
-            return;
+            fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
+            exit(1);
        }
        chain_list.push_back(from_cfg->num);
        layer_block_size[from_cfg->num] = get_block_size(from_cfg->num);
@@ -123,9 +99,8 @@ struct snap_merger_t
        }
        if (sources.find(target_cfg->num) == sources.end())
        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Layer "+target_name+" is not between "+to_name+" and "+from_name };
-            state = 100;
-            return;
+            fprintf(stderr, "Layer %s is not between %s and %s\n", target_name.c_str(), to_name.c_str(), from_name.c_str());
+            exit(1);
        }
        target = target_cfg->num;
        target_rank = sources.at(target);
@@ -155,15 +130,14 @@ struct snap_merger_t
                    int parent_rank = it->second;
                    if (parent_rank < to_rank && (parent_rank >= target_rank || check_delete_source))
                    {
-                        result = (cli_result_t){
-                            .err = EINVAL,
-                            .text = "Layers at or above "+(check_delete_source ? from_name : target_name)+
-                                ", but below "+to_name+" are not allowed to have other children, but "+
-                                ic.second.name+" is a child of "+
-                                parent->cli->st_cli.inode_config.at(ic.second.parent_id).name,
-                        };
-                        state = 100;
-                        return;
+                        fprintf(
+                            stderr, "Layers at or above %s, but below %s are not allowed"
+                                " to have other children, but %s is a child of %s\n",
+                            (check_delete_source ? from_name.c_str() : target_name.c_str()),
+                            to_name.c_str(), ic.second.name.c_str(),
+                            parent->cli->st_cli.inode_config.at(ic.second.parent_id).name.c_str()
+                        );
+                        exit(1);
                    }
                    if (parent_rank >= to_rank)
                    {
@@ -178,14 +152,11 @@ struct snap_merger_t
            use_cas = 0;
        }
        sources.erase(target);
-        if (parent->progress)
-        {
-            printf(
-                "Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
-                sources.size(), target_cfg->name.c_str(),
-                use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
-            );
-        }
+        printf(
+            "Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
+            sources.size(), target_cfg->name.c_str(),
+            use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
+        );
        target_block_size = get_block_size(target);
    }

@@ -282,8 +253,7 @@ struct snap_merger_t
        oit = merge_offsets.begin();
    resume_5:
        // Now read, overwrite and optionally delete offsets one by one
-        while (in_flight < parent->iodepth*parent->parallel_osds &&
-            oit != merge_offsets.end() && !rwo_error.size())
+        while (in_flight < parent->iodepth*parent->parallel_osds && oit != merge_offsets.end())
        {
            in_flight++;
            read_and_write(*oit);
@@ -294,15 +264,6 @@ struct snap_merger_t
                printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
            }
        }
-        if (in_flight == 0 && rwo_error.size())
-        {
-            result = (cli_result_t){
-                .err = EIO,
-                .text = rwo_error,
-            };
-            state = 100;
-            return;
-        }
        if (in_flight > 0 || oit != merge_offsets.end())
        {
            // Wait until overwrites finish
@@ -435,9 +396,8 @@ struct snap_merger_t
        {
            if (op->retval != op->len)
            {
-                rwo->error_code = -op->retval;
-                rwo->error_offset = op->offset;
-                rwo->error_read = true;
+                fprintf(stderr, "error reading target at offset %lx: %s\n", op->offset, strerror(-op->retval));
+                exit(1);
            }
            next_write(rwo);
        };
@@ -450,7 +410,7 @@ struct snap_merger_t
        // FIXME: Allow to use single write with "holes" (OSDs don't allow it yet)
        uint32_t gran = parent->cli->get_bs_bitmap_granularity();
        uint64_t bitmap_size = target_block_size / gran;
-        while (rwo->end < bitmap_size && !rwo->error_code)
+        while (rwo->end < bitmap_size)
        {
            auto bit = ((*((uint8_t*)rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
            if (!bit)
@@ -474,7 +434,7 @@ struct snap_merger_t
                rwo->end++;
            }
        }
-        if (rwo->end > rwo->start && !rwo->error_code)
+        if (rwo->end > rwo->start)
        {
            // write start->end
            rwo->todo++;
@@ -513,9 +473,8 @@ struct snap_merger_t
                    delete subop;
                    return;
                }
-                rwo->error_code = -subop->retval;
-                rwo->error_offset = subop->offset;
-                rwo->error_read = false;
+                fprintf(stderr, "error writing target at offset %lx: %s\n", subop->offset, strerror(-subop->retval));
+                exit(1);
            }
            // Increment CAS version
            rwo->op.version++;
@@ -551,12 +510,11 @@ struct snap_merger_t
    {
        if (!rwo->todo)
        {
-            if (!rwo->error_code &&
-                last_written_offset < rwo->op.offset+target_block_size)
+            if (last_written_offset < rwo->op.offset+target_block_size)
            {
                last_written_offset = rwo->op.offset+target_block_size;
            }
-            if (!rwo->error_code && delete_source)
+            if (delete_source)
            {
                deleted_unsynced++;
                if (deleted_unsynced >= fsync_interval)
@@ -587,20 +545,13 @@ struct snap_merger_t
            }
            free(rwo->buf);
            delete rwo;
-            if (rwo->error_code)
-            {
-                char buf[1024];
-                snprintf(buf, 1024, "Error %s target at offset %lx: %s",
-                    rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
-                rwo_error = std::string(buf);
-            }
            in_flight--;
            continue_merge_reent();
        }
    }
 };

-std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_merge(json11::Json cfg)
 {
    json11::Json::array cmd = cfg["command"].array_items();
    auto merger = new snap_merger_t();
@@ -608,18 +559,22 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_merge(json11::Json cfg)
    merger->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
    merger->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
    merger->target_name = cfg["target"].string_value();
+    if (merger->from_name == "" || merger->to_name == "")
+    {
+        fprintf(stderr, "Beginning or end of the merge sequence is missing\n");
+        exit(1);
+    }
    merger->delete_source = cfg["delete-source"].string_value() != "";
    merger->fsync_interval = cfg["fsync-interval"].uint64_value();
    if (!merger->fsync_interval)
        merger->fsync_interval = 128;
    if (!cfg["cas"].is_null())
        merger->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
-    return [merger](cli_result_t & result)
+    return [merger]()
    {
        merger->continue_merge_reent();
        if (merger->is_done())
        {
-            result = merger->result;
            delete merger;
            return true;
        }
--- a/src/cli_modify.cpp
+++ b/src/cli_modify.cpp
@@ -23,8 +23,7 @@ struct image_changer_t
    bool has_children = false;

    int state = 0;
-    std::function<bool(cli_result_t &)> cb;
-    cli_result_t result;
+    std::function<bool(void)> cb;

    bool is_done()
    {
@@ -37,18 +36,6 @@ struct image_changer_t
            goto resume_1;
        else if (state == 2)
            goto resume_2;
-        if (image_name == "")
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Image name is missing" };
-            state = 100;
-            return;
-        }
-        if (new_size != 0 && (new_size % 4096))
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Image size should be a multiple of 4096" };
-            state = 100;
-            return;
-        }
        for (auto & ic: parent->cli->st_cli.inode_config)
        {
            if (ic.second.name == image_name)
@@ -59,16 +46,14 @@ struct image_changer_t
            }
            if (new_name != "" && ic.second.name == new_name)
            {
-                result = (cli_result_t){ .err = EEXIST, .text = "Image "+new_name+" already exists" };
-                state = 100;
-                return;
+                fprintf(stderr, "Image %s already exists\n", new_name.c_str());
+                exit(1);
            }
        }
        if (!inode_num)
        {
-            result = (cli_result_t){ .err = ENOENT, .text = "Image "+image_name+" does not exist" };
-            state = 100;
-            return;
+            fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
+            exit(1);
        }
        for (auto & ic: parent->cli->st_cli.inode_config)
        {
@@ -83,7 +68,7 @@ struct image_changer_t
            (!new_size || cfg.size == new_size) &&
            (new_name == "" || new_name == image_name))
        {
-            result = (cli_result_t){ .text = "No change" };
+            printf("No change\n");
            state = 100;
            return;
        }
@@ -94,9 +79,8 @@ struct image_changer_t
                // Check confirmation when trimming an image with children
                if (has_children && !force)
                {
-                    result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to shrink it without --force" };
-                    state = 100;
-                    return;
+                    fprintf(stderr, "Image %s has children. Refusing to shrink it without --force\n", image_name.c_str());
+                    exit(1);
                }
                // Shrink the image first
                cb = parent->start_rm(json11::Json::object {
@@ -106,17 +90,12 @@ struct image_changer_t
                    { "min-offset", new_size },
                });
 resume_1:
-                while (!cb(result))
+                while (!cb())
                {
                    state = 1;
                    return;
                }
                cb = NULL;
-                if (result.err)
-                {
-                    state = 100;
-                    return;
-                }
            }
            cfg.size = new_size;
        }
@@ -130,9 +109,8 @@ resume_1:
            // Check confirmation when making an image with children read-write
            if (has_children && !force)
            {
-                result = (cli_result_t){ .err = EINVAL, .text = "Image "+image_name+" has children. Refusing to make it read-write without --force" };
-                state = 100;
-                return;
+                fprintf(stderr, "Image %s has children. Refusing to make it read-write without --force\n", image_name.c_str());
+                exit(1);
            }
        }
        if (new_name != "")
@@ -202,23 +180,32 @@ resume_2:
            return;
        if (!parent->etcd_result["succeeded"].bool_value())
        {
-            result = (cli_result_t){ .err = EAGAIN, .text = "Image "+image_name+" was modified by someone else, please repeat your request" };
-            state = 100;
-            return;
+            fprintf(stderr, "Image %s was modified by someone else, please repeat your request\n", image_name.c_str());
+            exit(1);
        }
-        result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" modified" };
+        printf("Image %s modified\n", image_name.c_str());
        state = 100;
    }
 };

-std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
 {
    json11::Json::array cmd = cfg["command"].array_items();
    auto changer = new image_changer_t();
    changer->parent = this;
    changer->image_name = cmd.size() > 1 ? cmd[1].string_value() : "";
+    if (changer->image_name == "")
+    {
+        fprintf(stderr, "Image name is missing\n");
+        exit(1);
+    }
    changer->new_name = cfg["rename"].string_value();
    changer->new_size = parse_size(cfg["resize"].string_value());
+    if (changer->new_size != 0 && (changer->new_size % 4096))
+    {
+        fprintf(stderr, "Image size should be a multiple of 4096\n");
+        exit(1);
+    }
    changer->force = cfg["force"].bool_value();
    changer->set_readonly = cfg["readonly"].bool_value();
    changer->set_readwrite = cfg["readwrite"].bool_value();
@@ -226,12 +213,11 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
    if (!changer->fsync_interval)
        changer->fsync_interval = 128;
    // FIXME Check that the image doesn't have children when shrinking
-    return [changer](cli_result_t & result)
+    return [changer]()
    {
        changer->loop();
        if (changer->is_done())
        {
-            result = changer->result;
            delete changer;
            return true;
        }
--- a/src/cli_rm.cpp
+++ b/src/cli_rm.cpp
@@ -1,633 +1,211 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.1 (see README.md for details)

-#include <fcntl.h>
 #include "cli.h"
 #include "cluster_client.h"
-#include "base64.h"

-// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
-//
-// Exactly one child of the requested layers may be merged using the "inverted" workflow,
-// where we merge it "down" into one of the "to-be-removed" layers and then rename the
-// "to-be-removed" layer to the child. It may be done either if all writers are stopped
-// before trying to delete layers (which is signaled by --writers-stopped) or if that child
-// is a read-only layer (snapshot) itself.
-//
-// This "inverted" workflow trades copying data of one of the deleted layers for copying
-// data of one child of the chain which is also a child of the "traded" layer. So we
-// choose the (parent,child) pair which has the largest difference between "parent" and
-// "child" inode sizes.
-//
-// All other children of the chain are processed by iterating though them, merging removed
-// parents into them and rebasing them to the last layer which isn't a member of the removed
-// chain.
-//
-// Example:
-//
-// <parent> - <from> - <layer 2> - <to> - <child 1>
-//                 \           \       \- <child 2>
-//                  \           \- <child 3>
-//                   \-<child 4>
-//
-// 1) Find optimal pair for the "reverse" scenario
-//    Imagine that it's (<layer 2>, <child 1>) in this example
-// 2) Process all children except <child 1>:
-//    - Merge <from>..<to> to <child 2>
-//    - Set <child 2> parent to <parent>
-//    - Repeat for others
-// 3) Process <child 1>:
-//    - Merge <from>..<child 1> to <layer 2>
-//    - Set <layer 2> parent to <parent>
-//    - Rename <layer 2> to <child 1>
-// 4) Delete other layers of the chain (<from>, <to>)
-struct snap_remover_t
+#define RM_LISTING 1
+#define RM_REMOVING 2
+#define RM_END 3
+
+struct rm_pg_t
 {
-    cli_tool_t *parent;
-
-    // remove from..to
-    std::string from_name, to_name;
-    // writers are stopped, we can safely change writable layers
-    bool writers_stopped = false;
-    // use CAS writes (0 = never, 1 = auto, 2 = always)
-    int use_cas = 1;
-    // interval between fsyncs
-    int fsync_interval = 128;
-
-    std::map<inode_t,int> sources;
-    std::map<inode_t,uint64_t> inode_used;
-    std::vector<inode_t> merge_children;
-    std::vector<inode_t> chain_list;
-    std::map<inode_t,int> inverse_candidates;
-    inode_t inverse_parent = 0, inverse_child = 0;
-    inode_t new_parent = 0;
+    pg_num_t pg_num;
+    osd_num_t rm_osd_num;
+    std::set<object_id> objects;
+    std::set<object_id>::iterator obj_pos;
+    uint64_t obj_count = 0, obj_done = 0;
    int state = 0;
-    int current_child = 0;
-    std::function<bool(cli_result_t &)> cb;
+    int in_flight = 0;
+};

-    cli_result_t result;
+struct rm_inode_t
+{
+    uint64_t inode = 0;
+    pool_id_t pool_id = 0;
+    uint64_t min_offset = 0;

-    bool is_done()
+    cli_tool_t *parent = NULL;
+    inode_list_t *lister = NULL;
+    std::vector<rm_pg_t*> lists;
+    uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
+    uint64_t pgs_to_list = 0;
+    bool lists_done = false;
+    int state = 0;
+
+    void start_delete()
    {
-        return state == 9;
-    }
-
-    void loop()
-    {
-        if (state == 1)
-            goto resume_1;
-        else if (state == 2)
-            goto resume_2;
-        else if (state == 3)
-            goto resume_3;
-        else if (state == 4)
-            goto resume_4;
-        else if (state == 5)
-            goto resume_5;
-        else if (state == 6)
-            goto resume_6;
-        else if (state == 7)
-            goto resume_7;
-        else if (state == 8)
-            goto resume_8;
-        else if (state == 9)
-            goto resume_9;
-        if (from_name == "")
+        lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
+            std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Layer to remove argument is missing" };
-            state = 100;
-            return;
-        }
-        if (to_name == "")
-        {
-            to_name = from_name;
-        }
-        // Get children to merge
-        get_merge_children();
-        if (state == 100)
-            return;
-        // Try to select an inode for the "inverse" optimized scenario
-        // Read statistics from etcd to do it
-        read_stats();
-        if (state == 100)
-            return;
-        state = 1;
-resume_1:
-        if (parent->waiting > 0)
-            return;
-        choose_inverse_candidate();
-        // Merge children one by one, except our "inverse" child
-        for (current_child = 0; current_child < merge_children.size(); current_child++)
-        {
-            if (merge_children[current_child] == inverse_child)
-                continue;
-            start_merge_child(merge_children[current_child], merge_children[current_child]);
-            if (state == 100)
-                return;
-resume_2:
-            while (!cb(result))
-            {
-                state = 2;
-                return;
-            }
-            cb = NULL;
-            if (result.err)
-            {
-                state = 100;
-                return;
-            }
-            parent->change_parent(merge_children[current_child], new_parent);
-            state = 3;
-resume_3:
-            if (parent->waiting > 0)
-                return;
-        }
-        // Merge our "inverse" child into our "inverse" parent
-        if (inverse_child != 0)
-        {
-            start_merge_child(inverse_child, inverse_parent);
-            if (state == 100)
-                return;
-resume_4:
-            while (!cb(result))
-            {
-                state = 4;
-                return;
-            }
-            cb = NULL;
-            if (result.err)
-            {
-                state = 100;
-                return;
-            }
-            // Delete "inverse" child data
-            start_delete_source(inverse_child);
-            if (state == 100)
-                return;
-resume_5:
-            while (!cb(result))
-            {
-                state = 5;
-                return;
-            }
-            cb = NULL;
-            if (result.err)
-            {
-                state = 100;
-                return;
-            }
-            // Delete "inverse" child metadata, rename parent over it,
-            // and also change parent links of the previous "inverse" child
-            rename_inverse_parent();
-            if (state == 100)
-                return;
-            state = 6;
-resume_6:
-            if (parent->waiting > 0)
-                return;
-        }
-        // Delete parents, except the "inverse" one
-        for (current_child = 0; current_child < chain_list.size(); current_child++)
-        {
-            if (chain_list[current_child] == inverse_parent)
-                continue;
-            start_delete_source(chain_list[current_child]);
-resume_7:
-            while (!cb(result))
-            {
-                state = 7;
-                return;
-            }
-            cb = NULL;
-            if (result.err)
-            {
-                state = 100;
-                return;
-            }
-            delete_inode_config(chain_list[current_child]);
-            if (state == 100)
-                return;
-            state = 8;
-resume_8:
-            if (parent->waiting > 0)
-                return;
-        }
-        state = 9;
-resume_9:
-        // Done
-        return;
-    }
-
-    void get_merge_children()
-    {
-        // Get all children of from..to
-        inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
-        inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
-        // Check that to_cfg is actually a child of from_cfg
-        // FIXME de-copypaste the following piece of code with snap_merger_t
-        inode_config_t *cur = to_cfg;
-        chain_list.push_back(cur->num);
-        while (cur->num != from_cfg->num && cur->parent_id != 0)
-        {
-            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
-            if (it == parent->cli->st_cli.inode_config.end())
-            {
-                char buf[1024];
-                snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
-                state = 100;
-                return;
-            }
-            cur = &it->second;
-            chain_list.push_back(cur->num);
-        }
-        if (cur->num != from_cfg->num)
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Layer "+to_name+" is not a child of "+from_name };
-            state = 100;
-            return;
-        }
-        new_parent = from_cfg->parent_id;
-        // Calculate ranks
-        int i = chain_list.size()-1;
-        for (inode_t item: chain_list)
-        {
-            sources[item] = i--;
-        }
-        for (auto & ic: parent->cli->st_cli.inode_config)
-        {
-            if (!ic.second.parent_id)
-            {
-                continue;
-            }
-            auto it = sources.find(ic.second.parent_id);
-            if (it != sources.end() && sources.find(ic.second.num) == sources.end())
-            {
-                merge_children.push_back(ic.second.num);
-                if (ic.second.readonly || writers_stopped)
-                {
-                    inverse_candidates[ic.second.num] = it->second;
-                }
-            }
-        }
-    }
-
-    void read_stats()
-    {
-        if (inverse_candidates.size() == 0)
-        {
-            return;
-        }
-        json11::Json::array reads;
-        for (auto cp: inverse_candidates)
-        {
-            inode_t inode = cp.first;
-            reads.push_back(json11::Json::object {
-                { "request_range", json11::Json::object {
-                    { "key", base64_encode(
-                        parent->cli->st_cli.etcd_prefix+
-                        "/inode/stats/"+std::to_string(INODE_POOL(inode))+
-                        "/"+std::to_string(INODE_NO_POOL(inode))
-                    ) },
-                } }
+            rm_pg_t *rm = new rm_pg_t((rm_pg_t){
+                .pg_num = pg_num,
+                .rm_osd_num = primary_osd,
+                .objects = objects,
+                .obj_count = objects.size(),
+                .obj_done = 0,
            });
-        }
-        for (auto cp: sources)
-        {
-            inode_t inode = cp.first;
-            reads.push_back(json11::Json::object {
-                { "request_range", json11::Json::object {
-                    { "key", base64_encode(
-                        parent->cli->st_cli.etcd_prefix+
-                        "/inode/stats/"+std::to_string(INODE_POOL(inode))+
-                        "/"+std::to_string(INODE_NO_POOL(inode))
-                    ) },
-                } }
-            });
-        }
-        parent->waiting++;
-        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
-            { "success", reads },
-        }, [this](std::string err, json11::Json data)
-        {
-            parent->waiting--;
-            if (err != "")
+            if (min_offset == 0)
            {
-                result = (cli_result_t){ .err = EIO, .text = "Error reading layer statistics from etcd: "+err };
-                state = 100;
-                return;
+                total_count += objects.size();
            }
-            for (auto inode_result: data["responses"].array_items())
+            else
            {
-                auto kv = parent->cli->st_cli.parse_etcd_kv(inode_result["kvs"][0]);
-                pool_id_t pool_id = 0;
-                inode_t inode = 0;
-                char null_byte = 0;
-                sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
-                if (!inode || null_byte != 0)
+                for (object_id oid: objects)
                {
-                    result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
-                    state = 100;
-                    return;
+                    if (oid.stripe >= min_offset)
+                    {
+                        total_count++;
+                    }
                }
-                auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
-                if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
-                {
-                    result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(pool_id)+" does not exist" };
-                    state = 100;
-                    return;
-                }
-                inode = INODE_WITH_POOL(pool_id, inode);
-                auto & pool_cfg = pool_cfg_it->second;
-                uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
-                if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
-                {
-                    used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
-                }
-                inode_used[inode] = used_bytes;
            }
-            parent->ringloop->wakeup();
+            rm->obj_pos = rm->objects.begin();
+            lists.push_back(rm);
+            if (parent->list_first)
+            {
+                parent->cli->list_inode_next(lister, 1);
+            }
+            if (status & INODE_LIST_DONE)
+            {
+                lists_done = true;
+            }
+            pgs_to_list--;
+            continue_delete();
        });
+        if (!lister)
+        {
+            fprintf(stderr, "Failed to list inode %lu from pool %u objects\n", INODE_NO_POOL(inode), INODE_POOL(inode));
+            exit(1);
+        }
+        pgs_to_list = parent->cli->list_pg_count(lister);
+        parent->cli->list_inode_next(lister, parent->parallel_osds);
    }

-    void choose_inverse_candidate()
+    void send_ops(rm_pg_t *cur_list)
    {
-        uint64_t max_diff = 0;
-        for (auto cp: inverse_candidates)
+        if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
+            parent->cli->msgr.osd_peer_fds.end())
        {
-            inode_t child = cp.first;
-            uint64_t child_used = inode_used[child];
-            int rank = cp.second;
-            for (int i = chain_list.size()-rank; i < chain_list.size(); i++)
-            {
-                inode_t parent = chain_list[i];
-                uint64_t parent_used = inode_used[parent];
-                if (parent_used > child_used && (!max_diff || max_diff < (parent_used-child_used)))
-                {
-                    max_diff = (parent_used-child_used);
-                    inverse_parent = parent;
-                    inverse_child = child;
-                }
-            }
-        }
-    }
-
-    void rename_inverse_parent()
-    {
-        auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
-        if (child_it == parent->cli->st_cli.inode_config.end())
-        {
-            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
-            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
-            state = 100;
+            // Initiate connection
+            parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
            return;
        }
-        auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
-        if (target_it == parent->cli->st_cli.inode_config.end())
+        while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
        {
-            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
-            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
-            state = 100;
-            return;
-        }
-        inode_config_t *child_cfg = &child_it->second;
-        inode_config_t *target_cfg = &target_it->second;
-        std::string child_name = child_cfg->name;
-        std::string target_name = target_cfg->name;
-        std::string child_cfg_key = base64_encode(
-            parent->cli->st_cli.etcd_prefix+
-            "/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
-            "/"+std::to_string(INODE_NO_POOL(inverse_child))
-        );
-        std::string target_cfg_key = base64_encode(
-            parent->cli->st_cli.etcd_prefix+
-            "/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
-            "/"+std::to_string(INODE_NO_POOL(inverse_parent))
-        );
-        // Fill new configuration
-        inode_config_t new_cfg = *child_cfg;
-        new_cfg.num = target_cfg->num;
-        new_cfg.parent_id = new_parent;
-        json11::Json::array cmp = json11::Json::array {
-            json11::Json::object {
-                { "target", "MOD" },
-                { "key", child_cfg_key },
-                { "result", "LESS" },
-                { "mod_revision", child_cfg->mod_revision+1 },
-            },
-            json11::Json::object {
-                { "target", "MOD" },
-                { "key", target_cfg_key },
-                { "result", "LESS" },
-                { "mod_revision", target_cfg->mod_revision+1 },
-            },
-        };
-        json11::Json::array txn = json11::Json::array {
-            json11::Json::object {
-                { "request_delete_range", json11::Json::object {
-                    { "key", child_cfg_key },
-                } },
-            },
-            json11::Json::object {
-                { "request_put", json11::Json::object {
-                    { "key", target_cfg_key },
-                    { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
-                } },
-            },
-            json11::Json::object {
-                { "request_put", json11::Json::object {
-                    { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
-                    { "value", base64_encode(json11::Json({
-                        { "id", INODE_NO_POOL(inverse_parent) },
-                        { "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
-                    }).dump()) },
-                } },
-            },
-        };
-        // Reparent children of inverse_child
-        for (auto & cp: parent->cli->st_cli.inode_config)
-        {
-            if (cp.second.parent_id == child_cfg->num)
+            if (cur_list->obj_pos->stripe >= min_offset)
            {
-                auto cp_cfg = cp.second;
-                cp_cfg.parent_id = inverse_parent;
-                auto cp_key = base64_encode(
-                    parent->cli->st_cli.etcd_prefix+
-                    "/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
-                    "/"+std::to_string(INODE_NO_POOL(cp.second.num))
-                );
-                cmp.push_back(json11::Json::object {
-                    { "target", "MOD" },
-                    { "key", cp_key },
-                    { "result", "LESS" },
-                    { "mod_revision", cp.second.mod_revision+1 },
-                });
-                txn.push_back(json11::Json::object {
-                    { "request_put", json11::Json::object {
-                        { "key", cp_key },
-                        { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
-                    } },
-                });
-            }
-        }
-        parent->waiting++;
-        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
-            { "compare", cmp },
-            { "success", txn },
-        }, [this, target_name, child_name](std::string err, json11::Json res)
-        {
-            parent->waiting--;
-            if (err != "")
-            {
-                result = (cli_result_t){ .err = EIO, .text = "Error renaming "+target_name+" to "+child_name+": "+err };
-                state = 100;
-                return;
-            }
-            if (!res["succeeded"].bool_value())
-            {
-                result = (cli_result_t){
-                    .err = EIO,
-                    .text = "Parent ("+target_name+"), child ("+child_name+"), or one of its children"
-                        " configuration was modified during rename",
+                osd_op_t *op = new osd_op_t();
+                op->op_type = OSD_OP_OUT;
+                op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
+                op->req = (osd_any_op_t){
+                    .rw = {
+                        .header = {
+                            .magic = SECONDARY_OSD_OP_MAGIC,
+                            .id = parent->cli->next_op_id(),
+                            .opcode = OSD_OP_DELETE,
+                        },
+                        .inode = cur_list->obj_pos->inode,
+                        .offset = cur_list->obj_pos->stripe,
+                        .len = 0,
+                    },
                };
-                state = 100;
-                return;
+                op->callback = [this, cur_list](osd_op_t *op)
+                {
+                    cur_list->in_flight--;
+                    if (op->reply.hdr.retval < 0)
+                    {
+                        fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
+                            op->req.rw.inode, op->req.rw.offset,
+                            cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
+                    }
+                    delete op;
+                    cur_list->obj_done++;
+                    total_done++;
+                    continue_delete();
+                };
+                cur_list->in_flight++;
+                parent->cli->msgr.outbox_push(op);
            }
-            printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
-            parent->ringloop->wakeup();
-        });
+            cur_list->obj_pos++;
+        }
    }

-    void delete_inode_config(inode_t cur)
+    void continue_delete()
    {
-        auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
-        if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
+        if (parent->list_first && !lists_done)
        {
-            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
-            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
-            state = 100;
            return;
        }
-        inode_config_t *cur_cfg = &cur_cfg_it->second;
-        std::string cur_name = cur_cfg->name;
-        std::string cur_cfg_key = base64_encode(
-            parent->cli->st_cli.etcd_prefix+
-            "/config/inode/"+std::to_string(INODE_POOL(cur))+
-            "/"+std::to_string(INODE_NO_POOL(cur))
-        );
-        parent->waiting++;
-        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
-            { "compare", json11::Json::array {
-                json11::Json::object {
-                    { "target", "MOD" },
-                    { "key", cur_cfg_key },
-                    { "result", "LESS" },
-                    { "mod_revision", cur_cfg->mod_revision+1 },
-                },
-            } },
-            { "success", json11::Json::array {
-                json11::Json::object {
-                    { "request_delete_range", json11::Json::object {
-                        { "key", cur_cfg_key },
-                    } },
-                },
-                json11::Json::object {
-                    { "request_delete_range", json11::Json::object {
-                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
-                    } },
-                },
-            } },
-        }, [this, cur_name](std::string err, json11::Json res)
+        for (int i = 0; i < lists.size(); i++)
        {
-            parent->waiting--;
-            if (err != "")
+            if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
            {
-                result = (cli_result_t){ .err = EIO, .text = "Error deleting "+cur_name+": "+err };
-                state = 100;
-                return;
+                delete lists[i];
+                lists.erase(lists.begin()+i, lists.begin()+i+1);
+                i--;
+                if (!lists_done)
+                {
+                    parent->cli->list_inode_next(lister, 1);
+                }
            }
-            if (!res["succeeded"].bool_value())
+            else
            {
-                result = (cli_result_t){ .err = EIO, .text = "Layer "+cur_name+" was modified during deletion" };
-                state = 100;
-                return;
+                send_ops(lists[i]);
            }
-            printf("Layer %s deleted\n", cur_name.c_str());
-            parent->ringloop->wakeup();
-        });
+        }
+        if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
+        {
+            printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
+            total_prev_pct = total_done*1000/total_count;
+        }
+        if (lists_done && !lists.size())
+        {
+            printf("Done, inode %lu in pool %u data removed\n", INODE_NO_POOL(inode), pool_id);
+            state = 2;
+        }
    }

-    void start_merge_child(inode_t child_inode, inode_t target_inode)
+    bool loop()
    {
-        auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
-        if (child_it == parent->cli->st_cli.inode_config.end())
+        if (state == 0)
        {
-            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
-            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
-            state = 100;
-            return;
+            start_delete();
+            state = 1;
        }
-        auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
-        if (target_it == parent->cli->st_cli.inode_config.end())
+        else if (state == 1)
        {
-            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
-            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
-            state = 100;
-            return;
+            continue_delete();
        }
-        cb = parent->start_merge(json11::Json::object {
-            { "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
-            { "target", target_it->second.name },
-            { "delete-source", false },
-            { "cas", use_cas },
-            { "fsync-interval", fsync_interval },
-        });
-    }
-
-    void start_delete_source(inode_t inode)
-    {
-        auto source = parent->cli->st_cli.inode_config.find(inode);
-        if (source == parent->cli->st_cli.inode_config.end())
+        else if (state == 2)
        {
-            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
-            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
-            state = 100;
-            return;
+            return true;
        }
-        cb = parent->start_rm(json11::Json::object {
-            { "inode", inode },
-            { "pool", (uint64_t)INODE_POOL(inode) },
-            { "fsync-interval", fsync_interval },
-        });
+        return false;
    }
 };

-std::function<bool(cli_result_t &)> cli_tool_t::start_snap_rm(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::start_rm(json11::Json cfg)
 {
-    json11::Json::array cmd = cfg["command"].array_items();
-    auto snap_remover = new snap_remover_t();
-    snap_remover->parent = this;
-    snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
-    snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
-    snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
-    if (!snap_remover->fsync_interval)
-        snap_remover->fsync_interval = 128;
-    if (!cfg["cas"].is_null())
-        snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
-    if (!cfg["writers_stopped"].is_null())
-        snap_remover->writers_stopped = true;
-    return [snap_remover](cli_result_t & result)
+    auto remover = new rm_inode_t();
+    remover->parent = this;
+    remover->inode = cfg["inode"].uint64_value();
+    remover->pool_id = cfg["pool"].uint64_value();
+    if (remover->pool_id)
    {
-        snap_remover->loop();
-        if (snap_remover->is_done())
+        remover->inode = (remover->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
+    }
+    remover->pool_id = INODE_POOL(remover->inode);
+    if (!remover->pool_id)
+    {
+        fprintf(stderr, "pool is missing\n");
+        exit(1);
+    }
+    remover->min_offset = cfg["min-offset"].uint64_value();
+    return [remover]()
+    {
+        if (remover->loop())
        {
-            result = snap_remover->result;
-            delete snap_remover;
+            delete remover;
            return true;
        }
        return false;
--- a/src/cli_rm_data.cpp
+++ b/src/cli_rm_data.cpp
@@ -1,230 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include "cli.h"
-#include "cluster_client.h"
-
-#define RM_LISTING 1
-#define RM_REMOVING 2
-#define RM_END 3
-
-struct rm_pg_t
-{
-    pg_num_t pg_num;
-    osd_num_t rm_osd_num;
-    std::set<object_id> objects;
-    std::set<object_id>::iterator obj_pos;
-    uint64_t obj_count = 0, obj_done = 0;
-    int state = 0;
-    int in_flight = 0;
-};
-
-struct rm_inode_t
-{
-    uint64_t inode = 0;
-    pool_id_t pool_id = 0;
-    uint64_t min_offset = 0;
-
-    cli_tool_t *parent = NULL;
-    inode_list_t *lister = NULL;
-    std::vector<rm_pg_t*> lists;
-    uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
-    uint64_t pgs_to_list = 0;
-    bool lists_done = false;
-    int state = 0;
-    int error_count = 0;
-
-    cli_result_t result;
-
-    void start_delete()
-    {
-        lister = parent->cli->list_inode_start(inode, [this](inode_list_t *lst,
-            std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)
-        {
-            rm_pg_t *rm = new rm_pg_t((rm_pg_t){
-                .pg_num = pg_num,
-                .rm_osd_num = primary_osd,
-                .objects = objects,
-                .obj_count = objects.size(),
-                .obj_done = 0,
-            });
-            if (min_offset == 0)
-            {
-                total_count += objects.size();
-            }
-            else
-            {
-                for (object_id oid: objects)
-                {
-                    if (oid.stripe >= min_offset)
-                    {
-                        total_count++;
-                    }
-                }
-            }
-            rm->obj_pos = rm->objects.begin();
-            lists.push_back(rm);
-            if (parent->list_first)
-            {
-                parent->cli->list_inode_next(lister, 1);
-            }
-            if (status & INODE_LIST_DONE)
-            {
-                lists_done = true;
-            }
-            pgs_to_list--;
-            continue_delete();
-        });
-        if (!lister)
-        {
-            result = (cli_result_t){
-                .err = EIO,
-                .text = "Failed to list objects of inode "+std::to_string(INODE_NO_POOL(inode))+
-                    " from pool "+std::to_string(INODE_POOL(inode)),
-            };
-            state = 100;
-            return;
-        }
-        pgs_to_list = parent->cli->list_pg_count(lister);
-        parent->cli->list_inode_next(lister, parent->parallel_osds);
-    }
-
-    void send_ops(rm_pg_t *cur_list)
-    {
-        if (parent->cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
-            parent->cli->msgr.osd_peer_fds.end())
-        {
-            // Initiate connection
-            parent->cli->msgr.connect_peer(cur_list->rm_osd_num, parent->cli->st_cli.peer_states[cur_list->rm_osd_num]);
-            return;
-        }
-        while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
-        {
-            if (cur_list->obj_pos->stripe >= min_offset)
-            {
-                osd_op_t *op = new osd_op_t();
-                op->op_type = OSD_OP_OUT;
-                // Already checked that it exists above, but anyway
-                op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num);
-                op->req = (osd_any_op_t){
-                    .rw = {
-                        .header = {
-                            .magic = SECONDARY_OSD_OP_MAGIC,
-                            .id = parent->cli->next_op_id(),
-                            .opcode = OSD_OP_DELETE,
-                        },
-                        .inode = cur_list->obj_pos->inode,
-                        .offset = cur_list->obj_pos->stripe,
-                        .len = 0,
-                    },
-                };
-                op->callback = [this, cur_list](osd_op_t *op)
-                {
-                    cur_list->in_flight--;
-                    if (op->reply.hdr.retval < 0)
-                    {
-                        fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
-                            op->req.rw.inode, op->req.rw.offset,
-                            cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
-                        error_count++;
-                    }
-                    delete op;
-                    cur_list->obj_done++;
-                    total_done++;
-                    continue_delete();
-                };
-                cur_list->in_flight++;
-                parent->cli->msgr.outbox_push(op);
-            }
-            cur_list->obj_pos++;
-        }
-    }
-
-    void continue_delete()
-    {
-        if (parent->list_first && !lists_done)
-        {
-            return;
-        }
-        for (int i = 0; i < lists.size(); i++)
-        {
-            if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end())
-            {
-                delete lists[i];
-                lists.erase(lists.begin()+i, lists.begin()+i+1);
-                i--;
-                if (!lists_done)
-                {
-                    parent->cli->list_inode_next(lister, 1);
-                }
-            }
-            else
-            {
-                send_ops(lists[i]);
-            }
-        }
-        if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
-        {
-            printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
-            total_prev_pct = total_done*1000/total_count;
-        }
-        if (lists_done && !lists.size())
-        {
-            result = (cli_result_t){
-                .err = error_count > 0 ? EIO : 0,
-                .text = error_count > 0 ? "Some blocks were not removed" : (
-                    "Done, inode "+std::to_string(INODE_NO_POOL(inode))+" from pool "+
-                    std::to_string(pool_id)+" removed"),
-            };
-            state = 100;
-        }
-    }
-
-    bool is_done()
-    {
-        return state == 100;
-    }
-
-    void loop()
-    {
-        if (state == 1)
-            goto resume_1;
-        if (!pool_id)
-        {
-            result = (cli_result_t){ .err = EINVAL, .text = "Pool is not specified" };
-            state = 100;
-            return;
-        }
-        start_delete();
-        if (state == 100)
-            return;
-        state = 1;
-    resume_1:
-        continue_delete();
-    }
-};
-
-std::function<bool(cli_result_t &)> cli_tool_t::start_rm(json11::Json cfg)
-{
-    auto remover = new rm_inode_t();
-    remover->parent = this;
-    remover->inode = cfg["inode"].uint64_value();
-    remover->pool_id = cfg["pool"].uint64_value();
-    if (remover->pool_id)
-    {
-        remover->inode = (remover->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (((uint64_t)remover->pool_id) << (64-POOL_ID_BITS));
-    }
-    remover->pool_id = INODE_POOL(remover->inode);
-    remover->min_offset = cfg["min-offset"].uint64_value();
-    return [remover](cli_result_t & result)
-    {
-        remover->loop();
-        if (remover->is_done())
-        {
-            result = remover->result;
-            delete remover;
-            return true;
-        }
-        return false;
-    };
-}
--- a/src/cli_simple_offsets.cpp
+++ b/src/cli_simple_offsets.cpp
@@ -11,7 +11,7 @@
 #include <sys/stat.h>

 // Calculate offsets for a block device and print OSD command line parameters
-std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
+std::function<bool(void)> cli_tool_t::simple_offsets(json11::Json cfg)
 {
    std::string device = cfg["command"][1].string_value();
    uint64_t object_size = parse_size(cfg["object_size"].string_value());
--- a/src/cli_snap_rm.cpp
+++ b/src/cli_snap_rm.cpp
@@ -0,0 +1,568 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+#include <fcntl.h>
+#include "cli.h"
+#include "cluster_client.h"
+#include "base64.h"
+
+// Remove layer(s): similar to merge, but alters metadata and processes multiple merge targets
+//
+// Exactly one child of the requested layers may be merged using the "inverted" workflow,
+// where we merge it "down" into one of the "to-be-removed" layers and then rename the
+// "to-be-removed" layer to the child. It may be done either if all writers are stopped
+// before trying to delete layers (which is signaled by --writers-stopped) or if that child
+// is a read-only layer (snapshot) itself.
+//
+// This "inverted" workflow trades copying data of one of the deleted layers for copying
+// data of one child of the chain which is also a child of the "traded" layer. So we
+// choose the (parent,child) pair which has the largest difference between "parent" and
+// "child" inode sizes.
+//
+// All other children of the chain are processed by iterating through them, merging removed
+// parents into them and rebasing them to the last layer which isn't a member of the removed
+// chain.
+//
+// Example:
+//
+// <parent> - <from> - <layer 2> - <to> - <child 1>
+//                 \           \       \- <child 2>
+//                  \           \- <child 3>
+//                   \-<child 4>
+//
+// 1) Find optimal pair for the "reverse" scenario
+//    Imagine that it's (<layer 2>, <child 1>) in this example
+// 2) Process all children except <child 1>:
+//    - Merge <from>..<to> to <child 2>
+//    - Set <child 2> parent to <parent>
+//    - Repeat for others
+// 3) Process <child 1>:
+//    - Merge <from>..<child 1> to <layer 2>
+//    - Set <layer 2> parent to <parent>
+//    - Rename <layer 2> to <child 1>
+// 4) Delete other layers of the chain (<from>, <to>)
+struct snap_remover_t
+{
+    cli_tool_t *parent;
+
+    // remove from..to
+    std::string from_name, to_name;
+    // writers are stopped, we can safely change writable layers
+    bool writers_stopped = false;
+    // use CAS writes (0 = never, 1 = auto, 2 = always)
+    int use_cas = 1;
+    // interval between fsyncs
+    int fsync_interval = 128;
+
+    std::map<inode_t,int> sources;
+    std::map<inode_t,uint64_t> inode_used;
+    std::vector<inode_t> merge_children;
+    std::vector<inode_t> chain_list;
+    std::map<inode_t,int> inverse_candidates;
+    inode_t inverse_parent = 0, inverse_child = 0;
+    inode_t new_parent = 0;
+    int state = 0;
+    int current_child = 0;
+    std::function<bool(void)> cb;
+
+    bool is_done()
+    {
+        return state == 9;
+    }
+
+    // Resumable driver of the whole removal process.
+    //
+    // Each call continues from the point stored in <state>: the goto chain below
+    // jumps to the matching resume_N label. The method returns early whenever an
+    // asynchronous step is still pending, which is detected either by
+    // parent->waiting > 0 (etcd requests issued by this object) or by the
+    // sub-operation callback <cb> returning false. State 9 means "finished"
+    // (see is_done()).
+    //
+    // <current_child> is a member rather than a local variable because both for
+    // loops are re-entered in the middle of their bodies through the labels.
+    void loop()
+    {
+        if (state == 1)
+            goto resume_1;
+        else if (state == 2)
+            goto resume_2;
+        else if (state == 3)
+            goto resume_3;
+        else if (state == 4)
+            goto resume_4;
+        else if (state == 5)
+            goto resume_5;
+        else if (state == 6)
+            goto resume_6;
+        else if (state == 7)
+            goto resume_7;
+        else if (state == 8)
+            goto resume_8;
+        else if (state == 9)
+            goto resume_9;
+        // Get children to merge
+        get_merge_children();
+        // Try to select an inode for the "inverse" optimized scenario
+        // Read statistics from etcd to do it
+        read_stats();
+        state = 1;
+resume_1:
+        // Wait for the statistics request (if any was sent) to complete
+        if (parent->waiting > 0)
+            return;
+        choose_inverse_candidate();
+        // Merge children one by one, except our "inverse" child
+        for (current_child = 0; current_child < merge_children.size(); current_child++)
+        {
+            if (merge_children[current_child] == inverse_child)
+                continue;
+            // Source and target are the same inode: the removed layers are merged into the child itself
+            start_merge_child(merge_children[current_child], merge_children[current_child]);
+resume_2:
+            // Poll the merge operation once per call until it reports completion
+            while (!cb())
+            {
+                state = 2;
+                return;
+            }
+            cb = NULL;
+            // Rebase the merged child onto the parent of the removed chain
+            parent->change_parent(merge_children[current_child], new_parent);
+            state = 3;
+resume_3:
+            if (parent->waiting > 0)
+                return;
+        }
+        // Merge our "inverse" child into our "inverse" parent
+        if (inverse_child != 0)
+        {
+            start_merge_child(inverse_child, inverse_parent);
+resume_4:
+            while (!cb())
+            {
+                state = 4;
+                return;
+            }
+            cb = NULL;
+            // Delete "inverse" child data
+            start_delete_source(inverse_child);
+resume_5:
+            while (!cb())
+            {
+                state = 5;
+                return;
+            }
+            cb = NULL;
+            // Delete "inverse" child metadata, rename parent over it,
+            // and also change parent links of the previous "inverse" child
+            rename_inverse_parent();
+            state = 6;
+resume_6:
+            if (parent->waiting > 0)
+                return;
+        }
+        // Delete parents, except the "inverse" one
+        for (current_child = 0; current_child < chain_list.size(); current_child++)
+        {
+            if (chain_list[current_child] == inverse_parent)
+                continue;
+            // First remove the layer data...
+            start_delete_source(chain_list[current_child]);
+resume_7:
+            while (!cb())
+            {
+                state = 7;
+                return;
+            }
+            cb = NULL;
+            // ...then remove its configuration and name index from etcd
+            delete_inode_config(chain_list[current_child]);
+            state = 8;
+resume_8:
+            if (parent->waiting > 0)
+                return;
+        }
+        state = 9;
+resume_9:
+        // Done
+        return;
+    }
+
+    // Build the list of layers to remove and the list of their outside children.
+    //
+    // Fills:
+    // - chain_list: inodes from <to_name> (index 0) up to <from_name> (last index),
+    //   following parent_id links
+    // - new_parent: parent of <from_name>, i.e. the layer the children get rebased to
+    // - sources: rank of every chain layer (<from> gets 0, <to> gets chain size - 1)
+    // - merge_children: inodes whose parent is in the chain but which aren't in it
+    // - inverse_candidates: those merge children that are read-only (or all of them
+    //   when writers_stopped is set), mapped to the rank of their direct parent
+    //
+    // Terminates the process if a parent inode is missing or if <to_name> is not
+    // a descendant of <from_name>.
+    void get_merge_children()
+    {
+        // Get all children of from..to
+        // NOTE(review): both pointers are dereferenced without a null check, so
+        // get_inode_cfg() presumably never returns NULL for unknown names - confirm
+        inode_config_t *from_cfg = parent->get_inode_cfg(from_name);
+        inode_config_t *to_cfg = parent->get_inode_cfg(to_name);
+        // Check that to_cfg is actually a child of from_cfg
+        // FIXME de-copypaste the following piece of code with snap_merger_t
+        inode_config_t *cur = to_cfg;
+        chain_list.push_back(cur->num);
+        // Walk up the parent links until <from> or the root of the chain is reached
+        while (cur->num != from_cfg->num && cur->parent_id != 0)
+        {
+            auto it = parent->cli->st_cli.inode_config.find(cur->parent_id);
+            if (it == parent->cli->st_cli.inode_config.end())
+            {
+                fprintf(stderr, "Parent inode of layer %s (id %ld) not found\n", cur->name.c_str(), cur->parent_id);
+                exit(1);
+            }
+            cur = &it->second;
+            chain_list.push_back(cur->num);
+        }
+        // The walk stopped at a root layer without meeting <from>
+        if (cur->num != from_cfg->num)
+        {
+            fprintf(stderr, "Layer %s is not a child of %s\n", to_name.c_str(), from_name.c_str());
+            exit(1);
+        }
+        new_parent = from_cfg->parent_id;
+        // Calculate ranks
+        int i = chain_list.size()-1;
+        for (inode_t item: chain_list)
+        {
+            sources[item] = i--;
+        }
+        for (auto & ic: parent->cli->st_cli.inode_config)
+        {
+            if (!ic.second.parent_id)
+            {
+                continue;
+            }
+            auto it = sources.find(ic.second.parent_id);
+            // Parent is a removed layer, but the inode itself is not one of them
+            if (it != sources.end() && sources.find(ic.second.num) == sources.end())
+            {
+                merge_children.push_back(ic.second.num);
+                if (ic.second.readonly || writers_stopped)
+                {
+                    inverse_candidates[ic.second.num] = it->second;
+                }
+            }
+        }
+    }
+
+    // Request usage statistics from etcd for all "inverse" candidates and for all
+    // layers of the removed chain, and store the estimated amount of user data of
+    // each of them in inode_used[].
+    //
+    // Does nothing when there are no inverse candidates. Otherwise increments
+    // parent->waiting, which is decremented again in the etcd callback. The callback
+    // terminates the process on etcd errors, malformed keys or unknown pools.
+    // Inodes without a statistics key are skipped, so they count as 0 bytes used
+    // when inode_used[] is read later.
+    void read_stats()
+    {
+        if (inverse_candidates.size() == 0)
+        {
+            return;
+        }
+        const std::string stats_prefix = parent->cli->st_cli.etcd_prefix+"/inode/stats/";
+        json11::Json::array reads;
+        // Both maps are keyed by inode number, so the same request is built for each entry
+        auto add_read = [&](inode_t inode)
+        {
+            reads.push_back(json11::Json::object {
+                { "request_range", json11::Json::object {
+                    { "key", base64_encode(
+                        stats_prefix+std::to_string(INODE_POOL(inode))+
+                        "/"+std::to_string(INODE_NO_POOL(inode))
+                    ) },
+                } }
+            });
+        };
+        for (auto & cp: inverse_candidates)
+        {
+            add_read(cp.first);
+        }
+        for (auto & cp: sources)
+        {
+            add_read(cp.first);
+        }
+        parent->waiting++;
+        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
+            { "success", reads },
+        }, [this, stats_prefix](std::string err, json11::Json data)
+        {
+            parent->waiting--;
+            if (err != "")
+            {
+                fprintf(stderr, "Error reading layer statistics from etcd: %s\n", err.c_str());
+                exit(1);
+            }
+            for (auto & inode_result: data["responses"].array_items())
+            {
+                // Range results of an etcd transaction are wrapped in "response_range"
+                const auto & kvs = inode_result["response_range"]["kvs"].array_items();
+                if (kvs.empty())
+                {
+                    // No statistics are stored for this inode
+                    continue;
+                }
+                auto kv = parent->cli->st_cli.parse_etcd_kv(kvs[0]);
+                pool_id_t pool_id = 0;
+                inode_t inode = 0;
+                char extra = 0;
+                // The key must be <prefix>/inode/stats/<pool>/<inode>. Exactly 2 fields
+                // have to be converted: a 3rd one means trailing garbage after the number.
+                // The prefix is verified first so that sscanf() never reads past the key.
+                if (kv.key.compare(0, stats_prefix.size(), stats_prefix) != 0 ||
+                    sscanf(kv.key.c_str() + stats_prefix.size(), "%u/%lu%c", &pool_id, &inode, &extra) != 2 ||
+                    !inode)
+                {
+                    fprintf(stderr, "Bad key returned from etcd: %s\n", kv.key.c_str());
+                    exit(1);
+                }
+                auto pool_cfg_it = parent->cli->st_cli.pool_config.find(pool_id);
+                if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
+                {
+                    fprintf(stderr, "Pool %u does not exist\n", pool_id);
+                    exit(1);
+                }
+                inode = INODE_WITH_POOL(pool_id, inode);
+                auto & pool_cfg = pool_cfg_it->second;
+                // Convert raw usage to user data size: divide by the PG size and,
+                // for non-replicated pools, multiply by the number of data chunks
+                uint64_t used_bytes = kv.value["raw_used"].uint64_value() / pool_cfg.pg_size;
+                if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
+                {
+                    used_bytes *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
+                }
+                inode_used[inode] = used_bytes;
+            }
+            parent->ringloop->wakeup();
+        });
+    }
+
+    // Select the (inverse_parent, inverse_child) pair for the "inverse" merge.
+    //
+    // For every candidate child, the chain layers at positions
+    // [chain size - rank, chain size) of chain_list are examined, where rank is
+    // the value stored for the child in inverse_candidates. The pair with the
+    // largest positive difference between layer usage and child usage wins; if
+    // no layer uses more space than a candidate, both fields are left untouched.
+    void choose_inverse_candidate()
+    {
+        uint64_t best_gain = 0;
+        const int chain_len = chain_list.size();
+        for (auto & candidate: inverse_candidates)
+        {
+            const inode_t child = candidate.first;
+            const uint64_t child_used = inode_used[child];
+            for (int pos = chain_len - candidate.second; pos < chain_len; pos++)
+            {
+                const inode_t layer = chain_list[pos];
+                const uint64_t layer_used = inode_used[layer];
+                if (layer_used <= child_used)
+                {
+                    // Inverting would not save any copying for this pair
+                    continue;
+                }
+                const uint64_t gain = layer_used - child_used;
+                if (gain > best_gain)
+                {
+                    best_gain = gain;
+                    inverse_parent = layer;
+                    inverse_child = child;
+                }
+            }
+        }
+    }
+
+    // Replace the "inverse" child with the "inverse" parent in a single etcd transaction:
+    // - delete the configuration key of inverse_child
+    // - overwrite the configuration of inverse_parent with a copy of the child's
+    //   configuration (keeping the parent's inode number, parent set to new_parent)
+    // - point the image name index of the child's name to inverse_parent
+    // - change the parent of every inode that had inverse_child as its parent
+    //
+    // The transaction only succeeds if none of the touched configuration keys was
+    // modified after the revision known locally. Increments parent->waiting until
+    // the callback runs; any failure terminates the process.
+    void rename_inverse_parent()
+    {
+        auto child_it = parent->cli->st_cli.inode_config.find(inverse_child);
+        if (child_it == parent->cli->st_cli.inode_config.end())
+        {
+            fprintf(stderr, "Inode %ld disappeared\n", inverse_child);
+            exit(1);
+        }
+        auto target_it = parent->cli->st_cli.inode_config.find(inverse_parent);
+        if (target_it == parent->cli->st_cli.inode_config.end())
+        {
+            fprintf(stderr, "Inode %ld disappeared\n", inverse_parent);
+            exit(1);
+        }
+        inode_config_t *child_cfg = &child_it->second;
+        inode_config_t *target_cfg = &target_it->second;
+        // Names are copied because they are captured by the asynchronous callback below
+        std::string child_name = child_cfg->name;
+        std::string target_name = target_cfg->name;
+        std::string child_cfg_key = base64_encode(
+            parent->cli->st_cli.etcd_prefix+
+            "/config/inode/"+std::to_string(INODE_POOL(inverse_child))+
+            "/"+std::to_string(INODE_NO_POOL(inverse_child))
+        );
+        std::string target_cfg_key = base64_encode(
+            parent->cli->st_cli.etcd_prefix+
+            "/config/inode/"+std::to_string(INODE_POOL(inverse_parent))+
+            "/"+std::to_string(INODE_NO_POOL(inverse_parent))
+        );
+        // Fill new configuration
+        inode_config_t new_cfg = *child_cfg;
+        new_cfg.num = target_cfg->num;
+        new_cfg.parent_id = new_parent;
+        // Guards: "MOD < mod_revision+1" means the key wasn't modified after the known revision
+        json11::Json::array cmp = json11::Json::array {
+            json11::Json::object {
+                { "target", "MOD" },
+                { "key", child_cfg_key },
+                { "result", "LESS" },
+                { "mod_revision", child_cfg->mod_revision+1 },
+            },
+            json11::Json::object {
+                { "target", "MOD" },
+                { "key", target_cfg_key },
+                { "result", "LESS" },
+                { "mod_revision", target_cfg->mod_revision+1 },
+            },
+        };
+        json11::Json::array txn = json11::Json::array {
+            json11::Json::object {
+                { "request_delete_range", json11::Json::object {
+                    { "key", child_cfg_key },
+                } },
+            },
+            json11::Json::object {
+                { "request_put", json11::Json::object {
+                    { "key", target_cfg_key },
+                    { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&new_cfg)).dump()) },
+                } },
+            },
+            json11::Json::object {
+                { "request_put", json11::Json::object {
+                    { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+child_cfg->name) },
+                    { "value", base64_encode(json11::Json({
+                        { "id", INODE_NO_POOL(inverse_parent) },
+                        { "pool_id", (uint64_t)INODE_POOL(inverse_parent) },
+                    }).dump()) },
+                } },
+            },
+        };
+        // Reparent children of inverse_child
+        for (auto & cp: parent->cli->st_cli.inode_config)
+        {
+            if (cp.second.parent_id == child_cfg->num)
+            {
+                // Modify a copy: the local configuration cache itself is not changed here
+                auto cp_cfg = cp.second;
+                cp_cfg.parent_id = inverse_parent;
+                auto cp_key = base64_encode(
+                    parent->cli->st_cli.etcd_prefix+
+                    "/config/inode/"+std::to_string(INODE_POOL(cp.second.num))+
+                    "/"+std::to_string(INODE_NO_POOL(cp.second.num))
+                );
+                cmp.push_back(json11::Json::object {
+                    { "target", "MOD" },
+                    { "key", cp_key },
+                    { "result", "LESS" },
+                    { "mod_revision", cp.second.mod_revision+1 },
+                });
+                txn.push_back(json11::Json::object {
+                    { "request_put", json11::Json::object {
+                        { "key", cp_key },
+                        { "value", base64_encode(json11::Json(parent->cli->st_cli.serialize_inode_cfg(&cp_cfg)).dump()) },
+                    } },
+                });
+            }
+        }
+        parent->waiting++;
+        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
+            { "compare", cmp },
+            { "success", txn },
+        }, [this, target_name, child_name](std::string err, json11::Json res)
+        {
+            parent->waiting--;
+            if (err != "")
+            {
+                fprintf(stderr, "Error renaming %s to %s: %s\n", target_name.c_str(), child_name.c_str(), err.c_str());
+                exit(1);
+            }
+            // "succeeded" is false when at least one of the compare guards failed
+            if (!res["succeeded"].bool_value())
+            {
+                fprintf(
+                    stderr, "Parent (%s), child (%s), or one of its children"
+                    " configuration was modified during rename\n", target_name.c_str(), child_name.c_str()
+                );
+                exit(1);
+            }
+            printf("Layer %s renamed to %s\n", target_name.c_str(), child_name.c_str());
+            parent->ringloop->wakeup();
+        });
+    }
+
+    // Delete the configuration key and the image name index key of inode <cur>
+    // from etcd in one transaction.
+    //
+    // The transaction is guarded by the locally known mod_revision of the
+    // configuration key. Increments parent->waiting until the callback runs.
+    // Terminates the process if the inode is unknown, on etcd errors, or if the
+    // configuration was modified concurrently.
+    void delete_inode_config(inode_t cur)
+    {
+        auto cur_cfg_it = parent->cli->st_cli.inode_config.find(cur);
+        if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
+        {
+            fprintf(stderr, "Inode 0x%lx disappeared\n", cur);
+            exit(1);
+        }
+        inode_config_t *cur_cfg = &cur_cfg_it->second;
+        // Copied because the name is captured by the asynchronous callback below
+        std::string cur_name = cur_cfg->name;
+        std::string cur_cfg_key = base64_encode(
+            parent->cli->st_cli.etcd_prefix+
+            "/config/inode/"+std::to_string(INODE_POOL(cur))+
+            "/"+std::to_string(INODE_NO_POOL(cur))
+        );
+        parent->waiting++;
+        parent->cli->st_cli.etcd_txn_slow(json11::Json::object {
+            { "compare", json11::Json::array {
+                // Only proceed if the key wasn't modified after the known revision
+                json11::Json::object {
+                    { "target", "MOD" },
+                    { "key", cur_cfg_key },
+                    { "result", "LESS" },
+                    { "mod_revision", cur_cfg->mod_revision+1 },
+                },
+            } },
+            { "success", json11::Json::array {
+                json11::Json::object {
+                    { "request_delete_range", json11::Json::object {
+                        { "key", cur_cfg_key },
+                    } },
+                },
+                json11::Json::object {
+                    { "request_delete_range", json11::Json::object {
+                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
+                    } },
+                },
+            } },
+        }, [this, cur_name](std::string err, json11::Json res)
+        {
+            parent->waiting--;
+            if (err != "")
+            {
+                fprintf(stderr, "Error deleting %s: %s\n", cur_name.c_str(), err.c_str());
+                exit(1);
+            }
+            // "succeeded" is false when the compare guard failed
+            if (!res["succeeded"].bool_value())
+            {
+                fprintf(stderr, "Layer %s configuration was modified during deletion\n", cur_name.c_str());
+                exit(1);
+            }
+            printf("Layer %s deleted\n", cur_name.c_str());
+            parent->ringloop->wakeup();
+        });
+    }
+
+    // Start merging the layers <from_name>..<child_inode> into <target_inode>
+    // and store the resulting operation callback in <cb>.
+    //
+    // Both inodes are looked up by number to obtain their names; the process is
+    // terminated if one of them is not found. The merge is started with
+    // "delete-source" disabled and with the CAS and fsync interval settings of
+    // this remover.
+    void start_merge_child(inode_t child_inode, inode_t target_inode)
+    {
+        auto child_it = parent->cli->st_cli.inode_config.find(child_inode);
+        if (child_it == parent->cli->st_cli.inode_config.end())
+        {
+            fprintf(stderr, "Inode %ld disappeared\n", child_inode);
+            exit(1);
+        }
+        auto target_it = parent->cli->st_cli.inode_config.find(target_inode);
+        if (target_it == parent->cli->st_cli.inode_config.end())
+        {
+            fprintf(stderr, "Inode %ld disappeared\n", target_inode);
+            exit(1);
+        }
+        cb = parent->start_merge(json11::Json::object {
+            { "command", json11::Json::array{ "merge-data", from_name, child_it->second.name } },
+            { "target", target_it->second.name },
+            { "delete-source", false },
+            { "cas", use_cas },
+            { "fsync-interval", fsync_interval },
+        });
+    }
+
+    // Start removing the data of <inode> and store the resulting operation
+    // callback in <cb>. Terminates the process if the inode is not present in
+    // the local configuration.
+    void start_delete_source(inode_t inode)
+    {
+        // The lookup result is only used as an existence check
+        auto source = parent->cli->st_cli.inode_config.find(inode);
+        if (source == parent->cli->st_cli.inode_config.end())
+        {
+            fprintf(stderr, "Inode %ld disappeared\n", inode);
+            exit(1);
+        }
+        cb = parent->start_rm(json11::Json::object {
+            { "inode", inode },
+            { "pool", (uint64_t)INODE_POOL(inode) },
+            { "fsync-interval", fsync_interval },
+        });
+    }
+};
+
+// Create a layer removal operation from the parsed command line.
+//
+// cfg["command"][1] is the first layer to remove and cfg["command"][2] is the
+// optional last one (defaults to the first, i.e. a single layer is removed).
+// Recognized options: "fsync-interval" (0 or absent means 128), "cas"
+// (present: forces CAS on for non-zero values and off for zero; absent: auto)
+// and "writers-stopped" (mere presence enables it).
+//
+// Returns a function that advances the operation by one step per call and
+// returns true once it's finished. The operation object is destroyed at that
+// point, so the function must not be called again after it has returned true.
+// Terminates the process if the layer name is missing.
+std::function<bool(void)> cli_tool_t::start_snap_rm(json11::Json cfg)
+{
+    json11::Json::array cmd = cfg["command"].array_items();
+    auto snap_remover = new snap_remover_t();
+    snap_remover->parent = this;
+    snap_remover->from_name = cmd.size() > 1 ? cmd[1].string_value() : "";
+    snap_remover->to_name = cmd.size() > 2 ? cmd[2].string_value() : "";
+    if (snap_remover->from_name == "")
+    {
+        fprintf(stderr, "Layer to remove argument is missing\n");
+        exit(1);
+    }
+    if (snap_remover->to_name == "")
+    {
+        snap_remover->to_name = snap_remover->from_name;
+    }
+    snap_remover->fsync_interval = cfg["fsync-interval"].uint64_value();
+    if (!snap_remover->fsync_interval)
+        snap_remover->fsync_interval = 128;
+    if (!cfg["cas"].is_null())
+        snap_remover->use_cas = cfg["cas"].uint64_value() ? 2 : 0;
+    // The option is documented as --writers-stopped and all other options are
+    // looked up by their dashed names, so the dashed key has to be accepted.
+    // The underscore spelling is kept for backward compatibility.
+    if (!cfg["writers-stopped"].is_null() || !cfg["writers_stopped"].is_null())
+        snap_remover->writers_stopped = true;
+    return [snap_remover]()
+    {
+        snap_remover->loop();
+        if (snap_remover->is_done())
+        {
+            delete snap_remover;
+            return true;
+        }
+        return false;
+    };
+}
--- a/src/cli_status.cpp
+++ b/src/cli_status.cpp
@@ -1,296 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#include "cli.h"
-#include "cluster_client.h"
-#include "base64.h"
-#include "pg_states.h"
-
-// Print cluster status:
-// etcd, mon, osd states
-// raw/used space, object states, pool states, pg states
-// client io, recovery io, rebalance io
-struct status_printer_t
-{
-    cli_tool_t *parent;
-
-    int state = 0;
-    json11::Json::array mon_members, osd_stats;
-    json11::Json agg_stats;
-    std::map<pool_id_t, json11::Json::object> pool_stats;
-    json11::Json::array etcd_states;
-
-    bool is_done()
-    {
-        return state == 100;
-    }
-
-    void loop()
-    {
-        if (state == 1)
-            goto resume_1;
-        else if (state == 2)
-            goto resume_2;
-        // etcd states
-        {
-            auto addrs = parent->cli->st_cli.get_addresses();
-            etcd_states.resize(addrs.size());
-            for (int i = 0; i < etcd_states.size(); i++)
-            {
-                parent->waiting++;
-                parent->cli->st_cli.etcd_call_oneshot(
-                    addrs[i], "/maintenance/status", json11::Json::object(),
-                    parent->cli->st_cli.etcd_quick_timeout, [this, i](std::string err, json11::Json res)
-                    {
-                        parent->waiting--;
-                        etcd_states[i] = err != "" ? json11::Json::object{ { "error", err } } : res;
-                        parent->ringloop->wakeup();
-                    }
-                );
-            }
-        }
-        state = 1;
-resume_1:
-        if (parent->waiting > 0)
-            return;
-        // Monitors, OSD states
-        parent->etcd_txn(json11::Json::object {
-            { "success", json11::Json::array {
-                json11::Json::object {
-                    { "request_range", json11::Json::object {
-                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon/") },
-                        { "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/mon0") },
-                    } },
-                },
-                json11::Json::object {
-                    { "request_range", json11::Json::object {
-                        { "key", base64_encode(
-                            parent->cli->st_cli.etcd_prefix+"/osd/stats/"
-                        ) },
-                        { "range_end", base64_encode(
-                            parent->cli->st_cli.etcd_prefix+"/osd/stats0"
-                        ) },
-                    } },
-                },
-                json11::Json::object {
-                    { "request_range", json11::Json::object {
-                        { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/stats") },
-                    } },
-                },
-            } },
-        });
-        state = 2;
-resume_2:
-        if (parent->waiting > 0)
-            return;
-        mon_members = parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items();
-        osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items();
-        if (parent->etcd_result["responses"][2]["response_range"]["kvs"].array_items().size() > 0)
-        {
-            agg_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][2]["response_range"]["kvs"][0]).value;
-        }
-        int etcd_alive = 0;
-        uint64_t etcd_db_size = 0;
-        std::string etcd_detail;
-        for (int i = 0; i < etcd_states.size(); i++)
-        {
-            if (etcd_states[i]["error"].is_null())
-            {
-                etcd_alive++;
-                etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
-            }
-        }
-        int mon_count = 0;
-        std::string mon_master;
-        for (int i = 0; i < mon_members.size(); i++)
-        {
-            auto kv = parent->cli->st_cli.parse_etcd_kv(mon_members[i]);
-            kv.key = kv.key.substr(parent->cli->st_cli.etcd_prefix.size());
-            if (kv.key.substr(0, 12) == "/mon/member/")
-                mon_count++;
-            else if (kv.key == "/mon/master")
-            {
-                if (kv.value["hostname"].is_string())
-                    mon_master = kv.value["hostname"].string_value();
-                else
-                    mon_master = kv.value["ip"][0].string_value();
-            }
-        }
-        int osd_count = 0, osd_up = 0;
-        uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
-        for (int i = 0; i < osd_stats.size(); i++)
-        {
-            auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
-            osd_num_t stat_osd_num = 0;
-            char null_byte = 0;
-            sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
-            if (!stat_osd_num || null_byte != 0)
-            {
-                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
-                continue;
-            }
-            osd_count++;
-            total_raw += kv.value["size"].uint64_value();
-            free_raw += kv.value["free"].uint64_value();
-            auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num);
-            if (peer_it != parent->cli->st_cli.peer_states.end())
-            {
-                osd_up++;
-            }
-            else
-            {
-                down_raw += kv.value["size"].uint64_value();
-                free_down_raw += kv.value["size"].uint64_value();
-            }
-        }
-        int pool_count = 0, pools_active = 0;
-        std::map<std::string, int> pgs_by_state;
-        std::string pgs_by_state_str;
-        for (auto & pool_pair: parent->cli->st_cli.pool_config)
-        {
-            auto & pool_cfg = pool_pair.second;
-            bool active = true;
-            if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
-            {
-                active = false;
-                pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
-            }
-            pool_count++;
-            for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
-            {
-                if (!(pg_it->second.cur_state & PG_ACTIVE))
-                {
-                    active = false;
-                }
-                std::string pg_state_str;
-                for (int i = 0; i < pg_state_bit_count; i++)
-                {
-                    if (pg_it->second.cur_state & pg_state_bits[i])
-                    {
-                        pg_state_str += "+";
-                        pg_state_str += pg_state_names[i];
-                    }
-                }
-                if (pg_state_str.size())
-                    pgs_by_state[pg_state_str.substr(1)]++;
-                else
-                    pgs_by_state["offline"]++;
-            }
-            if (active)
-            {
-                pools_active++;
-            }
-        }
-        for (auto & kv: pgs_by_state)
-        {
-            if (pgs_by_state_str.size())
-            {
-                pgs_by_state_str += "\n           ";
-            }
-            pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
-        }
-        uint64_t object_size = parent->cli->get_bs_block_size();
-        std::string more_states;
-        uint64_t obj_n;
-        obj_n = agg_stats["object_counts"]["misplaced"].uint64_value();
-        if (obj_n > 0)
-            more_states += ", "+format_size(obj_n*object_size)+" misplaced";
-        obj_n = agg_stats["object_counts"]["degraded"].uint64_value();
-        if (obj_n > 0)
-            more_states += ", "+format_size(obj_n*object_size)+" degraded";
-        obj_n = agg_stats["object_counts"]["incomplete"].uint64_value();
-        if (obj_n > 0)
-            more_states += ", "+format_size(obj_n*object_size)+" incomplete";
-        std::string recovery_io;
-        {
-            uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value();
-            uint64_t deg_iops = agg_stats["recovery_stats"]["degraded"]["iops"].uint64_value();
-            uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value();
-            uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value();
-            if (deg_iops > 0 || deg_bps > 0)
-                recovery_io += "    recovery:  "+format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
-            if (misp_iops > 0 || misp_bps > 0)
-                recovery_io += "    rebalance: "+format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n";
-        }
-        if (parent->json_output)
-        {
-            // JSON output
-            printf("%s\n", json11::Json(json11::Json::object {
-                { "etcd_alive", etcd_alive },
-                { "etcd_count", (uint64_t)etcd_states.size() },
-                { "etcd_db_size", etcd_db_size },
-                { "mon_count", mon_count },
-                { "mon_master", mon_master },
-                { "osd_up", osd_up },
-                { "osd_count", osd_count },
-                { "total_raw", total_raw },
-                { "free_raw", free_raw },
-                { "down_raw", down_raw },
-                { "free_down_raw", free_down_raw },
-                { "clean_data", agg_stats["object_counts"]["clean"].uint64_value() * object_size },
-                { "misplaced_data", agg_stats["object_counts"]["misplaced"].uint64_value() * object_size },
-                { "degraded_data", agg_stats["object_counts"]["degraded"].uint64_value() * object_size },
-                { "incomplete_data", agg_stats["object_counts"]["incomplete"].uint64_value() * object_size },
-                { "pool_count", pool_count },
-                { "active_pool_count", pools_active },
-                { "pg_states", pgs_by_state },
-                { "op_stats", agg_stats["op_stats"] },
-                { "recovery_stats", agg_stats["recovery_stats"] },
-                { "object_counts", agg_stats["object_counts"] },
-            }).dump().c_str());
-            state = 100;
-            return;
-        }
-        printf(
-            "  cluster:\n"
-            "    etcd: %d / %ld up, %s database size\n"
-            "    mon:  %d up%s\n"
-            "    osd:  %d / %d up\n"
-            "  \n"
-            "  data:\n"
-            "    raw:   %s used, %s / %s available%s\n"
-            "    state: %s clean%s\n"
-            "    pools: %d / %d active\n"
-            "    pgs:   %s\n"
-            "  \n"
-            "  io:\n"
-            "    client:%s %s/s rd, %s op/s rd, %s/s wr, %s op/s wr\n"
-            "%s",
-            etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
-            mon_count, mon_master == "" ? "" : (", master "+mon_master).c_str(),
-            osd_up, osd_count,
-            format_size(total_raw-free_raw).c_str(),
-            format_size(free_raw-free_down_raw).c_str(),
-            format_size(total_raw-down_raw).c_str(),
-            (down_raw > 0 ? (", "+format_size(down_raw)+" down").c_str() : ""),
-            format_size(agg_stats["object_counts"]["clean"].uint64_value() * object_size).c_str(), more_states.c_str(),
-            pools_active, pool_count,
-            pgs_by_state_str.c_str(),
-            recovery_io.size() > 0 ? "   " : "",
-            format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(),
-            format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(),
-            format_size(agg_stats["op_stats"]["primary_write"]["bps"].uint64_value()).c_str(),
-            format_size(agg_stats["op_stats"]["primary_write"]["iops"].uint64_value(), true).c_str(),
-            recovery_io.c_str()
-        );
-        state = 100;
-    }
-};
-
-std::function<bool(cli_result_t &)> cli_tool_t::start_status(json11::Json cfg)
-{
-    json11::Json::array cmd = cfg["command"].array_items();
-    auto printer = new status_printer_t();
-    printer->parent = this;
-    return [printer](cli_result_t & result)
-    {
-        printer->loop();
-        if (printer->is_done())
-        {
-            result = { .err = 0 };
-            delete printer;
-            return true;
-        }
-        return false;
-    };
-}
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@@ -143,7 +143,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
    }
    else /* if (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP) */
    {
-        for (auto prev = op_queue_head; prev && prev != op; prev = prev->next)
+        for (auto prev = op->prev; prev; prev = prev->prev)
        {
            if (prev->opcode == OSD_OP_WRITE && prev->flags & OP_FLUSH_BUFFER)
            {
@@ -151,7 +151,7 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
            }
            else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ || prev->opcode == OSD_OP_READ_BITMAP)
            {
-                // Flushes are always in the beginning (we're scanning from the beginning of the queue)
+                // Flushes are always in the beginning
                break;
            }
        }
@@ -172,7 +172,6 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
                (next->opcode == OSD_OP_READ || next->opcode == OSD_OP_READ_BITMAP) && (flags & OP_FLUSH_BUFFER))
            {
                next->prev_wait += inc;
-                assert(next->prev_wait >= 0);
                if (!next->prev_wait)
                {
                    if (next->opcode == OSD_OP_SYNC)
@@ -192,7 +191,6 @@ void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *n
            if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
            {
                next->prev_wait += inc;
-                assert(next->prev_wait >= 0);
                if (!next->prev_wait)
                {
                    if (next->opcode == OSD_OP_SYNC)
--- a/src/cluster_client_list.cpp
+++ b/src/cluster_client_list.cpp
@@ -200,8 +200,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
    auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id];
    osd_op_t *op = new osd_op_t();
    op->op_type = OSD_OP_OUT;
-    // Already checked that it exists above, but anyway
-    op->peer_fd = msgr.osd_peer_fds.at(cur_list->osd_num);
+    op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num];
    op->req = (osd_any_op_t){
        .sec_list = {
            .header = {
--- a/src/etcd_state_client.cpp
+++ b/src/etcd_state_client.cpp
@@ -64,42 +64,6 @@ void etcd_state_client_t::etcd_txn_slow(json11::Json txn, std::function<void(std
    etcd_call("/kv/txn", txn, etcd_slow_timeout, max_etcd_attempts, 0, callback);
 }

-std::vector<std::string> etcd_state_client_t::get_addresses()
-{
-    auto addrs = etcd_local;
-    addrs.insert(addrs.end(), etcd_addresses.begin(), etcd_addresses.end());
-    return addrs;
-}
-
-void etcd_state_client_t::etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload,
-    int timeout, std::function<void(std::string, json11::Json)> callback)
-{
-    std::string etcd_api_path;
-    int pos = etcd_address.find('/');
-    if (pos >= 0)
-    {
-        etcd_api_path = etcd_address.substr(pos);
-        etcd_address = etcd_address.substr(0, pos);
-    }
-    std::string req = payload.dump();
-    req = "POST "+etcd_api_path+api+" HTTP/1.1\r\n"
-        "Host: "+etcd_address+"\r\n"
-        "Content-Type: application/json\r\n"
-        "Content-Length: "+std::to_string(req.size())+"\r\n"
-        "Connection: close\r\n"
-        "\r\n"+req;
-    auto http_cli = http_init(tfd);
-    auto cb = [this, http_cli, callback](const http_response_t *response)
-    {
-        std::string err;
-        json11::Json data;
-        response->parse_json_response(err, data);
-        callback(err, data);
-        http_close(http_cli);
-    };
-    http_request(http_cli, etcd_address, req, { .timeout = timeout }, cb);
-}
-
 void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout,
    int retries, int interval, std::function<void(std::string, json11::Json)> callback)
 {
@@ -954,10 +918,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
            }
            if (!value.is_object())
            {
-                if (on_inode_change_hook != NULL)
-                {
-                    on_inode_change_hook(inode_num, true);
-                }
                this->inode_config.erase(inode_num);
            }
            else
@@ -972,7 +932,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                    {
                        fprintf(
                            stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
-                            inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
+                            inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
                        );
                        parent_inode_num = 0;
                    }
@@ -999,10 +959,6 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                        }
                    }
                }
-                if (on_inode_change_hook != NULL)
-                {
-                    on_inode_change_hook(inode_num, false);
-                }
            }
        }
    }
--- a/src/etcd_state_client.h
+++ b/src/etcd_state_client.h
@@ -109,12 +109,9 @@ public:
    std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
    std::function<void(osd_num_t)> on_change_osd_state_hook;
    std::function<void()> on_reload_hook;
-    std::function<void(inode_t, bool)> on_inode_change_hook;

    json11::Json::object serialize_inode_cfg(inode_config_t *cfg);
    etcd_kv_t parse_etcd_kv(const json11::Json & kv_json);
-    std::vector<std::string> get_addresses();
-    void etcd_call_oneshot(std::string etcd_address, std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
    void etcd_call(std::string api, json11::Json payload, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
    void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
    void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
--- a/src/fio_cluster.cpp
+++ b/src/fio_cluster.cpp
@@ -214,14 +214,14 @@ static int sec_setup(struct thread_data *td)

    if (!o->image)
    {
-        if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)))
+        if (!(o->inode & ((1l << (64-POOL_ID_BITS)) - 1)))
        {
            td_verror(td, EINVAL, "inode number is missing");
            return 1;
        }
        if (o->pool)
        {
-            o->inode = (o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
+            o->inode = (o->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
        }
        if (!(o->inode >> (64-POOL_ID_BITS)))
        {
@@ -351,9 +351,9 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
        }
        else
        {
-            printf("+++ %s 0x%lx 0x%llx+%lx\n",
+            printf("+++ %s 0x%lx 0x%llx+%llx\n",
                io->ddir == DDIR_READ ? "READ" : "WRITE",
-                (uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
+                (uint64_t)io, io->offset, io->xfer_buflen);
        }
    }

--- a/src/fio_sec_osd.cpp
+++ b/src/fio_sec_osd.cpp
@@ -170,14 +170,14 @@ static int sec_init(struct thread_data *td)
    bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
    bsd->block_size = 1 << o->block_order;

-    sockaddr_storage addr;
+    sockaddr addr;
    if (!string_to_addr(std::string(o->host ? o->host : "127.0.0.1"), false, o->port > 0 ? o->port : 11203, &addr))
    {
        fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1");
        return 1;
    }

-    bsd->connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    bsd->connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
    if (bsd->connect_fd < 0)
    {
        perror("socket");
@@ -355,7 +355,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
        {
            if (reply.hdr.retval != io->xfer_buflen)
            {
-                fprintf(stderr, "Short read: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
+                fprintf(stderr, "Short read: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
                exit(1);
            }
            // Support bitmap
@@ -380,7 +380,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
        {
            if (reply.hdr.retval != io->xfer_buflen)
            {
-                fprintf(stderr, "Short write: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
+                fprintf(stderr, "Short write: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
                exit(1);
            }
        }
--- a/src/http_client.cpp
+++ b/src/http_client.cpp
@@ -62,10 +62,9 @@ struct http_co_t
    void run_cb_and_clear();
    void start_connection();
    void close_connection();
-    void next_request();
    void handle_events();
    void handle_connect_result();
-    void submit_read(bool check_timeout);
+    void submit_read();
    void submit_send();
    bool handle_read();
    void post_message(int type, const std::string & msg);
@@ -129,7 +128,6 @@ void http_co_t::run_cb_and_clear()
    // Call callback after clearing it because otherwise we may hit reenterability problems
    if (cb != NULL)
        cb(&parsed);
-    next_request();
 }

 void http_co_t::send_request(const std::string & host, const std::string & request,
@@ -163,6 +161,17 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
    this->sent = 0;
    this->response_callback = response_callback;
    this->parsed = {};
+    if (request_timeout > 0)
+    {
+        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
+        {
+            stackin();
+            close_connection();
+            parsed = { .error = "HTTP request timed out" };
+            run_cb_and_clear();
+            stackout();
+        });
+    }
    if (state == HTTP_CO_KEEPALIVE)
    {
        state = HTTP_CO_SENDING_REQUEST;
@@ -172,28 +181,6 @@ void http_co_t::send_request(const std::string & host, const std::string & reque
    {
        start_connection();
    }
-    // Do it _after_ state assignment because set_timer() can actually trigger
-    // other timers and requests (reenterability is our friend)
-    if (request_timeout > 0)
-    {
-        timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
-        {
-            stackin();
-            if (state == HTTP_CO_REQUEST_SENT)
-            {
-                // In case of high CPU load, we may not handle etcd responses in time
-                // For this case, first check the socket and only then terminate request with the timeout
-                submit_read(true);
-            }
-            else
-            {
-                close_connection();
-                parsed = { .error = "HTTP request timed out" };
-                run_cb_and_clear();
-            }
-            stackout();
-        });
-    }
    stackout();
 }

@@ -284,19 +271,17 @@ void http_co_t::close_connection()
 void http_co_t::start_connection()
 {
    stackin();
-    struct sockaddr_storage addr;
+    struct sockaddr addr;
    if (!string_to_addr(host.c_str(), 1, 80, &addr))
    {
-        close_connection();
        parsed = { .error = "Invalid address: "+host };
        run_cb_and_clear();
        stackout();
        return;
    }
-    peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
    if (peer_fd < 0)
    {
-        close_connection();
        parsed = { .error = std::string("socket: ")+strerror(errno) };
        run_cb_and_clear();
        stackout();
@@ -338,12 +323,10 @@ void http_co_t::handle_events()
            epoll_events &= ~EPOLLOUT;
            if (epoll_events & EPOLLIN)
            {
-                submit_read(false);
+                submit_read();
            }
            else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
            {
-                if (state == HTTP_CO_HEADERS_RECEIVED)
-                    std::swap(parsed.body, response);
                close_connection();
                run_cb_and_clear();
                break;
@@ -427,11 +410,10 @@ again:
    stackout();
 }

-void http_co_t::submit_read(bool check_timeout)
+void http_co_t::submit_read()
 {
    stackin();
    int res;
-again:
    if (rbuf.size() != READ_BUFFER_SIZE)
    {
        rbuf.resize(READ_BUFFER_SIZE);
@@ -446,29 +428,12 @@ again:
    }
    if (res == -EAGAIN || res == -EINTR)
    {
-        if (check_timeout)
-        {
-            if (res == -EINTR)
-                goto again;
-            else
-            {
-                // Timeout happened and there is no data to read
-                close_connection();
-                parsed = { .error = "HTTP request timed out" };
-                run_cb_and_clear();
-            }
-        }
-        else
-        {
-            epoll_events = epoll_events & ~EPOLLIN;
-        }
+        epoll_events = epoll_events & ~EPOLLIN;
    }
    else if (res <= 0)
    {
        // < 0 means error, 0 means EOF
        epoll_events = epoll_events & ~EPOLLIN;
-        if (state == HTTP_CO_HEADERS_RECEIVED)
-            std::swap(parsed.body, response);
        close_connection();
        if (res < 0)
            parsed = { .error = std::string("recvmsg: ")+strerror(-res) };
@@ -536,11 +501,8 @@ bool http_co_t::handle_read()
    if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
    {
        std::swap(parsed.body, response);
-        if (!keepalive)
-            close_connection();
-        else
-            state = HTTP_CO_KEEPALIVE;
-        run_cb_and_clear();
+        response_callback(&parsed);
+        parsed.eof = true;
    }
    else if (state == HTTP_CO_CHUNKED && response.size() > 0)
    {
@@ -571,14 +533,10 @@ bool http_co_t::handle_read()
            response_callback(&parsed);
            parsed.body = "";
        }
-        else if (parsed.eof)
+        if (parsed.eof && !want_streaming)
        {
            // Normal response
-            if (!keepalive)
-                close_connection();
-            else
-                state = HTTP_CO_KEEPALIVE;
-            run_cb_and_clear();
+            response_callback(&parsed);
        }
    }
    else if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
@@ -589,20 +547,29 @@ bool http_co_t::handle_read()
            parsed.body = "";
        }
    }
+    if (parsed.eof)
+    {
+        response_callback = NULL;
+        parsed = {};
+        if (!keepalive)
+        {
+            close_connection();
+        }
+        else
+        {
+            state = HTTP_CO_KEEPALIVE;
+            if (keepalive_queue.size() > 0)
+            {
+                auto next = keepalive_queue[0];
+                keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
+                next();
+            }
+        }
+    }
    stackout();
    return true;
 }

-void http_co_t::next_request()
-{
-    if (keepalive_queue.size() > 0)
-    {
-        auto next = keepalive_queue[0];
-        keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
-        next();
-    }
-}
-
 uint64_t stoull_full(const std::string & str, int base)
 {
    if (isspace(str[0]))
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -222,13 +222,13 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
 void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
 {
    assert(peer_osd != this->osd_num);
-    struct sockaddr_storage addr;
+    struct sockaddr addr;
    if (!string_to_addr(peer_host, 0, peer_port, &addr))
    {
        on_connect_peer(peer_osd, -EINVAL);
        return;
    }
-    int peer_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    int peer_fd = socket(addr.sa_family, SOCK_STREAM, 0);
    if (peer_fd < 0)
    {
        on_connect_peer(peer_osd, -errno);
@@ -484,10 +484,10 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
 void osd_messenger_t::accept_connections(int listen_fd)
 {
    // Accept new connections
-    sockaddr_storage addr;
+    sockaddr addr;
    socklen_t peer_addr_size = sizeof(addr);
    int peer_fd;
-    while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
+    while ((peer_fd = accept(listen_fd, &addr, &peer_addr_size)) >= 0)
    {
        assert(peer_fd != 0);
        fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
--- a/src/messenger.h
+++ b/src/messenger.h
@@ -49,7 +49,7 @@ struct osd_client_t
 {
    int refs = 0;

-    sockaddr_storage peer_addr;
+    sockaddr peer_addr;
    int peer_port;
    int peer_fd;
    int peer_state;
--- a/src/msgr_rdma.cpp
+++ b/src/msgr_rdma.cpp
@@ -3,6 +3,7 @@

 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/mman.h>
 #include "msgr_rdma.h"
 #include "messenger.h"

@@ -54,6 +55,7 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
 msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
 {
    int res;
+    bool odp = true;
    ibv_device **dev_list = NULL;
    msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
    ctx->mtu = mtu;
@@ -117,9 +119,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
        fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
        goto cleanup;
    }
-    if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
+    if ((res = ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid)) != 0)
    {
-        fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
+        fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(ctx->dev), gid_index, strerror(res));
        goto cleanup;
    }

@@ -131,9 +133,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
    }

    {
-        if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
+        if ((res = ibv_query_device_ex(ctx->context, NULL, &ctx->attrx)) != 0)
        {
-            fprintf(stderr, "Couldn't query RDMA device for its features\n");
+            fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(ctx->dev), strerror(res));
            goto cleanup;
        }
        if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
@@ -141,15 +143,20 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
            !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
            !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
        {
-            fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
-            goto cleanup;
+            fprintf(stderr, "Warning: RDMA device isn't implicit ODP (On-Demand Paging) capable, trying to lock all application memory\n");
+            if (mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT) != 0)
+            {
+                fprintf(stderr, "mlockall() failed: %s\n", strerror(errno));
+                goto cleanup;
+            }
+            odp = false;
        }
    }

-    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
+    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | (odp ? IBV_ACCESS_ON_DEMAND : 0));
    if (!ctx->mr)
    {
-        fprintf(stderr, "Couldn't register RDMA memory region\n");
+        fprintf(stderr, "Couldn't register RDMA memory region: %s\n", strerror(errno));
        goto cleanup;
    }

--- a/src/nbd_proxy.cpp
+++ b/src/nbd_proxy.cpp
@@ -54,8 +54,6 @@ protected:
    msghdr read_msg = { 0 }, send_msg = { 0 };
    iovec read_iov = { 0 };

-    std::string logfile = "/dev/null";
-
 public:
    ~nbd_proxy()
    {
@@ -189,7 +187,7 @@ public:
            uint64_t pool = cfg["pool"].uint64_value();
            if (pool)
            {
-                inode = (inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
+                inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
            }
            if (!(inode >> (64-POOL_ID_BITS)))
            {
@@ -280,10 +278,6 @@ public:
                }
            }
        }
-        if (cfg["logfile"].is_string())
-        {
-            logfile = cfg["logfile"].string_value();
-        }
        if (bg)
        {
            daemonize();
@@ -369,14 +363,13 @@ public:
        setsid();
        if (fork())
            exit(0);
+        chdir("/");
        close(0);
        close(1);
        close(2);
        open("/dev/null", O_RDONLY);
-        open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
-        open(logfile.c_str(), O_WRONLY|O_APPEND|O_CREAT, 0666);
-        if (chdir("/") != 0)
-            fprintf(stderr, "Warning: Failed to chdir into /\n");
+        open("/dev/null", O_WRONLY);
+        open("/dev/null", O_WRONLY);
    }

    json11::Json::object list_mapped()
@@ -532,11 +525,7 @@ protected:
        {
            goto end_unmap;
        }
-        r = write(qd_fd, "32768", 5);
-        if (r != 5)
-        {
-            fprintf(stderr, "Warning: Failed to configure max_sectors_kb\n");
-        }
+        write(qd_fd, "32768", 5);
        close(qd_fd);
        if (!fork())
        {
@@ -690,7 +679,6 @@ protected:
            {
                assert(result <= cur_left);
                cur_left -= result;
-                cur_buf = (uint8_t*)cur_buf + result;
                result = 0;
            }
            if (cur_left <= 0)
@@ -705,12 +693,6 @@ protected:
        if (read_state == CL_READ_HDR)
        {
            int req_type = be32toh(cur_req.type);
-            if (be32toh(cur_req.magic) == NBD_REQUEST_MAGIC && req_type == NBD_CMD_DISC)
-            {
-                // Disconnect
-                close(nbd_fd);
-                exit(0);
-            }
            if (be32toh(cur_req.magic) != NBD_REQUEST_MAGIC ||
                req_type != NBD_CMD_READ && req_type != NBD_CMD_WRITE && req_type != NBD_CMD_FLUSH)
            {
--- a/src/nfs_conn.cpp
+++ b/src/nfs_conn.cpp
@@ -1,748 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-//
-// NFS connection handler for NFS proxy
-
-#include <sys/time.h>
-
-#include "libnfs-raw-mount.h"
-#include "libnfs-raw-nfs.h"
-
-#include "base64.h"
-
-#include "nfs_proxy.h"
-
-static unsigned len_pad4(unsigned len)
-{
-    return len + (len&3 ? 4-(len&3) : 0);
-}
-
-static int nfs3_null_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    rpc_send_reply(rpc, call, NULL, (zdrproc_t)zdr_void, 0);
-    return 0;
-}
-
-static fattr3 get_dir_attributes(nfs_client_t *self, std::string dir)
-{
-    return (fattr3){
-        .type = NF3DIR,
-        .mode = 0755,
-        .nlink = 1,
-        .uid = 0,
-        .gid = 0,
-        .size = 4096,
-        .used = 4096,
-        .rdev = (specdata3){ 0 },
-        .fsid = self->parent->fsid,
-        .fileid = dir == "" ? 1 : self->parent->dir_ids.at(dir),
-        //.atime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-        //.mtime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-        //.ctime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-    };
-}
-
-static int nfs3_getattr_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    GETATTR3args *args = (GETATTR3args*)call->body.cbody.args;
-    GETATTR3res reply;
-    std::string dirhash = std::string(args->object.data.data_val, args->object.data.data_len);
-    bool is_dir = false;
-    std::string dir;
-    if (dirhash == "roothandle")
-        is_dir = true;
-    else
-    {
-        auto dir_it = self->parent->dir_by_hash.find(dirhash);
-        if (dir_it != self->parent->dir_by_hash.end())
-        {
-            is_dir = true;
-            dir = dir_it->second;
-        }
-    }
-    if (is_dir)
-    {
-        // Directory info
-        reply.status = NFS3_OK;
-        reply.GETATTR3res_u.resok.obj_attributes = get_dir_attributes(self, dir);
-    }
-    else
-    {
-        uint64_t inode_num;
-        auto inode_num_it = self->parent->inode_by_hash.find(dirhash);
-        if (inode_num_it != self->parent->inode_by_hash.end())
-            inode_num = inode_num_it->second;
-        auto inode_it = self->parent->cli->st_cli.inode_config.find(inode_num);
-        if (inode_it != self->parent->cli->st_cli.inode_config.end())
-        {
-            // File info
-            auto & inode_cfg = inode_it->second;
-            reply.status = NFS3_OK;
-            reply.GETATTR3res_u.resok.obj_attributes = {
-                .type = NF3REG,
-                .mode = 0644,
-                .nlink = 1,
-                .uid = 0,
-                .gid = 0,
-                .size = inode_cfg.size,
-                .used = inode_cfg.size,
-                .rdev = (specdata3){ 0 },
-                .fsid = self->parent->fsid,
-                .fileid = inode_it->first,
-                //.atime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-                //.mtime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-                //.ctime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-            };
-        }
-        else
-        {
-            // File not exists
-            reply.status = NFS3ERR_NOENT;
-        }
-    }
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_GETATTR3res, sizeof(GETATTR3res));
-    return 0;
-}
-
-static int nfs3_setattr_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    SETATTR3args *args = (SETATTR3args*)call->body.cbody.args;
-    SETATTR3res reply;
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_SETATTR3res, sizeof(SETATTR3res));
-    return 0;
-}
-
-static int nfs3_lookup_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    LOOKUP3args *args = (LOOKUP3args*)call->body.cbody.args;
-    LOOKUP3res reply;
-    std::string dirhash = std::string(args->what.dir.data.data_val, args->what.dir.data.data_len);
-    std::string dir;
-    if (dirhash != "roothandle")
-    {
-        auto dir_it = self->parent->dir_by_hash.find(dirhash);
-        if (dir_it != self->parent->dir_by_hash.end())
-            dir = dir_it->second;
-    }
-    std::string full_name = self->parent->name_prefix;
-    if (dir != "")
-    {
-        full_name += dir+"/";
-    }
-    full_name += std::string(args->what.name);
-    for (auto & ic: self->parent->cli->st_cli.inode_config)
-    {
-        if (ic.second.name == full_name)
-        {
-            std::string fh = "S"+base64_encode(sha256(full_name.substr(self->parent->name_prefix.size())));
-            reply.status = NFS3_OK;
-            reply.LOOKUP3res_u.resok.object.data.data_len = fh.size();
-            reply.LOOKUP3res_u.resok.object.data.data_val = (char*)fh.c_str();
-            rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_LOOKUP3res, sizeof(LOOKUP3res));
-            return 0;
-        }
-    }
-    reply.status = NFS3ERR_NOENT;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_LOOKUP3res, sizeof(LOOKUP3res));
-    return 0;
-}
-
-static int nfs3_access_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    ACCESS3args *args = (ACCESS3args*)call->body.cbody.args;
-    ACCESS3res reply = {
-        .status = NFS3_OK,
-        .ACCESS3res_u = { .resok = {
-            .access = args->access,
-        } },
-    };
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_ACCESS3res, sizeof(ACCESS3res));
-    return 0;
-}
-
-static int nfs3_readlink_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    READLINK3args *args = (READLINK3args*)call->body.cbody.args;
-    READLINK3res reply = {};
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READLINK3res, sizeof(READLINK3res));
-    return 0;
-}
-
-#define MAX_REQUEST_SIZE 128*1024*1024
-
-static int nfs3_read_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    READ3args *args = (READ3args*)call->body.cbody.args;
-    std::string handle = std::string(args->file.data.data_val, args->file.data.data_len);
-    auto ino_it = self->parent->inode_by_hash.find(handle);
-    if (ino_it == self->parent->inode_by_hash.end())
-    {
-        READ3res reply = { .status = NFS3ERR_NOENT };
-        rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READ3res, sizeof(READ3res));
-        return 0;
-    }
-    if (args->count > MAX_REQUEST_SIZE)
-    {
-        READ3res reply = { .status = NFS3ERR_INVAL };
-        rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READ3res, sizeof(READ3res));
-        return 0;
-    }
-    void *buf = malloc_or_die(args->count);
-    cluster_op_t *op = new cluster_op_t;
-    op->opcode = OSD_OP_READ;
-    op->inode = ino_it->second;
-    op->offset = args->offset;
-    op->len = args->count;
-    op->iov.push_back(buf, args->count);
-    op->callback = [rpc, call](cluster_op_t *op)
-    {
-        void *buf = op->iov.buf[0].iov_base;
-        READ3res reply = {};
-        if (op->retval != op->len)
-        {
-            if (op->retval == -EINVAL)
-                reply.status = NFS3ERR_INVAL;
-            else if (op->retval == -ENOSPC)
-                reply.status = NFS3ERR_NOSPC;
-            else
-                reply.status = NFS3ERR_IO;
-        }
-        else
-        {
-            reply.status = NFS3_OK;
-            auto & reply_ok = reply.READ3res_u.resok;
-            reply_ok.count = op->retval;
-            reply_ok.eof = FALSE;
-            reply_ok.data.data_len = reply_ok.count;
-            reply_ok.data.data_val = (char*)buf;
-        }
-        rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READ3res, sizeof(READ3res));
-        delete op;
-        free(buf);
-    };
-    self->parent->cli->execute(op);
-    return 0;
-}
-
-static int nfs3_write_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    WRITE3args *args = (WRITE3args*)call->body.cbody.args;
-    WRITE3res reply;
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_WRITE3res, sizeof(WRITE3res));
-    return 0;
-}
-
-static int nfs3_create_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    CREATE3args *args = (CREATE3args*)call->body.cbody.args;
-    CREATE3res reply;
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_CREATE3res, sizeof(CREATE3res));
-    return 0;
-}
-
-// NFS3 MKDIR handler. Directory creation is not implemented yet: every request is
-// answered with NFS3ERR_NOTSUPP. Always returns 0.
-static int nfs3_mkdir_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Zero-initialize the whole reply: only `status` is assigned below, and the
-    // encoder may also read the remaining members of the result
-    MKDIR3res reply = {};
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_MKDIR3res, sizeof(MKDIR3res));
-    return 0;
-}
-
-// NFS3 SYMLINK handler. Symlink creation is not implemented yet: every request is
-// answered with NFS3ERR_NOTSUPP. Always returns 0.
-static int nfs3_symlink_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Zero-initialize the whole reply: only `status` is assigned below, and the
-    // encoder may also read the remaining members of the result
-    SYMLINK3res reply = {};
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_SYMLINK3res, sizeof(SYMLINK3res));
-    return 0;
-}
-
-// NFS3 MKNOD handler. Special file creation is not implemented yet: every request
-// is answered with NFS3ERR_NOTSUPP. Always returns 0.
-static int nfs3_mknod_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Zero-initialize the whole reply: only `status` is assigned below, and the
-    // encoder may also read the remaining members of the result
-    MKNOD3res reply = {};
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_MKNOD3res, sizeof(MKNOD3res));
-    return 0;
-}
-
-// NFS3 REMOVE handler. File removal is not implemented yet: every request is
-// answered with NFS3ERR_NOTSUPP. Always returns 0.
-static int nfs3_remove_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Zero-initialize the whole reply: only `status` is assigned below, and the
-    // encoder may also read the remaining members of the result
-    REMOVE3res reply = {};
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_REMOVE3res, sizeof(REMOVE3res));
-    return 0;
-}
-
-// NFS3 RMDIR handler. Directory removal is not implemented yet: every request is
-// answered with NFS3ERR_NOTSUPP. Always returns 0.
-static int nfs3_rmdir_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Zero-initialize the whole reply: only `status` is assigned below, and the
-    // encoder may also read the remaining members of the result
-    RMDIR3res reply = {};
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_RMDIR3res, sizeof(RMDIR3res));
-    return 0;
-}
-
-// NFS3 RENAME handler. Renaming is not implemented yet: every request is
-// answered with NFS3ERR_NOTSUPP. Always returns 0.
-static int nfs3_rename_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Zero-initialize the whole reply: only `status` is assigned below, and the
-    // encoder may also read the remaining members of the result
-    RENAME3res reply = {};
-    // Not supported yet
-    reply.status = NFS3ERR_NOTSUPP;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_RENAME3res, sizeof(RENAME3res));
-    return 0;
-}
-
-// NFS3 LINK handler. Hard links are not supported, so every request
-// is rejected with NFS3ERR_NOTSUPP. Always returns 0.
-static int nfs3_link_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Brace initialization sets the status and zero-fills the rest of the result
-    LINK3res link_res = { NFS3ERR_NOTSUPP };
-    rpc_send_reply(rpc, call, &link_res, (zdrproc_t)zdr_LINK3res, sizeof(LINK3res));
-    return 0;
-}
-
-// Shared implementation of NFS3 READDIR and READDIRPLUS.
-//
-// Builds the listing of the requested directory from the in-memory inode
-// configuration, skips the entries already returned to the client (the
-// continuation cookie is the 1-based index of an entry in the sorted listing),
-// truncates the listing so that the estimated reply size does not exceed the
-// client's limit and sends the reply.
-//
-// rpc, call - RPC context and incoming call, passed through to rpc_send_reply()
-// opaque    - the nfs_client_t* registered in the procedure table
-// is_plus   - true for READDIRPLUS (entries carry attributes and file handles),
-//             false for READDIR
-static void nfs3_readdir_common(struct rpc_context *rpc, struct rpc_msg *call, void *opaque, bool is_plus)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    // READDIR arguments are converted to the READDIRPLUS form so that the rest
-    // of the function can be shared between the two procedures
-    READDIRPLUS3args plus_args = {};
-    READDIRPLUS3args *args = NULL;
-    if (is_plus)
-        args = ((READDIRPLUS3args*)call->body.cbody.args);
-    else
-    {
-        args = &plus_args;
-        READDIR3args *in_args = ((READDIR3args*)call->body.cbody.args);
-        args->dir = in_args->dir;
-        args->cookie = in_args->cookie;
-        *((uint64_t*)args->cookieverf) = *((uint64_t*)in_args->cookieverf);
-        args->dircount = 512;
-        args->maxcount = in_args->count;
-    }
-    // Resolve the directory handle into a directory name ("" is the root)
-    std::string dirhash = std::string(args->dir.data.data_val, args->dir.data.data_len);
-    std::string dir;
-    if (dirhash != "roothandle")
-    {
-        // NOTE(review): an unknown handle silently falls back to the root directory - confirm this is intended
-        auto dir_it = self->parent->dir_by_hash.find(dirhash);
-        if (dir_it != self->parent->dir_by_hash.end())
-            dir = dir_it->second;
-    }
-    std::string prefix = self->parent->name_prefix;
-    if (dir != "")
-    {
-        prefix += dir+"/";
-    }
-    //struct timespec now;
-    //clock_gettime(CLOCK_REALTIME, &now);
-    // Listing sorted by name; std::map keeps entry addresses stable, which is
-    // required because entries are chained through raw `nextentry` pointers
-    std::map<std::string, struct entryplus3> entries;
-    // Backing storage for the file handles referenced by the entries
-    // NOTE(review): entries keep c_str() pointers into these strings while the vector
-    // may still grow - this relies on the strings being heap-allocated; verify
-    std::vector<std::string> handles;
-    for (auto & ic: self->parent->cli->st_cli.inode_config)
-    {
-        auto & inode_cfg = ic.second;
-        if (prefix != "" && inode_cfg.name.substr(0, prefix.size()) != prefix)
-            continue;
-        std::string subname = inode_cfg.name.substr(prefix.size());
-        // Strip leading slashes
-        size_t p = 0;
-        while (p < subname.size() && subname[p] == '/')
-            p++;
-        if (p > 0)
-            subname = subname.substr(p);
-        if (subname.size() == 0)
-            continue;
-        // Find the end of the first path component
-        p = 0;
-        while (p < subname.size() && subname[p] != '/')
-            p++;
-        if (p >= subname.size())
-        {
-            // No more slashes: the inode is a file directly in this directory
-            entries[subname] = (struct entryplus3){
-                // fileid will change when the user creates snapshots
-                // however, we hope that clients tolerate it well
-                // Linux does, even though it complains about "fileid changed" in dmesg
-                .fileid = ic.first,
-            };
-            if (is_plus)
-            {
-                handles.push_back("S"+base64_encode(sha256(inode_cfg.name)));
-                entries[subname].name_attributes = {
-                    .attributes_follow = TRUE,
-                    .post_op_attr_u = { .attributes = {
-                        .type = NF3REG,
-                        .mode = 0644,
-                        .nlink = 1,
-                        .uid = 0,
-                        .gid = 0,
-                        .size = inode_cfg.size,
-                        .used = inode_cfg.size, // FIXME take from statistics
-                        .rdev = (specdata3){ 0 },
-                        .fsid = self->parent->fsid,
-                        .fileid = ic.first,
-                        //.atime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-                        //.mtime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-                        //.ctime = (nfstime3){ .seconds = now.tv_sec, .nseconds = now.tv_nsec },
-                    } },
-                };
-                entries[subname].name_handle = {
-                    .handle_follows = TRUE,
-                    .post_op_fh3_u = { .handle = {
-                        .data = {
-                            // FIXME: I really want ZDR with std::string
-                            // Explicit cast: size_t -> unsigned is a narrowing conversion in a braced initializer
-                            .data_len = (unsigned)handles[handles.size()-1].size(),
-                            .data_val = (char*)handles[handles.size()-1].c_str(),
-                        },
-                    } },
-                };
-            }
-        }
-        else
-        {
-            // The inode is located deeper: list its first path component as a subdirectory
-            // NOTE(review): the entry is keyed (and therefore named) by the full directory
-            // path rather than by the last path component - confirm this is intended
-            auto subdir = dir == "" ? subname.substr(0, p) : dir+"/"+subname.substr(0, p);
-            entries[subdir] = (struct entryplus3){
-                // for directories, fileid will change when the user restarts proxy
-                .fileid = self->parent->dir_ids.at(subdir),
-            };
-            if (is_plus)
-            {
-                handles.push_back("S"+base64_encode(sha256(subdir)));
-                entries[subdir].name_attributes = {
-                    .attributes_follow = TRUE,
-                    .post_op_attr_u = { .attributes = get_dir_attributes(self, subdir) },
-                };
-                entries[subdir].name_handle = {
-                    .handle_follows = TRUE,
-                    .post_op_fh3_u = { .handle = {
-                        .data = {
-                            // FIXME: I really want ZDR with std::string
-                            .data_len = (unsigned)handles[handles.size()-1].size(),
-                            .data_val = (char*)handles[handles.size()-1].c_str(),
-                        },
-                    } },
-                };
-            }
-        }
-    }
-    // Assign names and cookies (cookie = 1-based index in the listing) and chain the entries
-    uint64_t idx = 1;
-    void *prev = NULL;
-    for (auto it = entries.begin(); it != entries.end(); it++)
-    {
-        entryplus3 *entry = &it->second;
-        // First fields of entry3 and entryplus3 are the same: fileid, name, cookie
-        entry->name = (char*)it->first.c_str();
-        entry->cookie = idx++;
-        if (prev)
-        {
-            if (is_plus)
-                ((entryplus3*)prev)->nextentry = entry;
-            else
-                ((entry3*)prev)->nextentry = (entry3*)entry;
-        }
-        prev = entry;
-    }
-    // Offset results by the continuation cookie: drop the first <cookie> entries.
-    // A cookie at or past the end of the listing leaves an empty listing
-    // instead of sending the whole directory again
-    if (args->cookie > 0)
-    {
-        auto first_kept = entries.begin();
-        for (uint64_t skipped = 0; skipped < args->cookie && first_kept != entries.end(); skipped++)
-            first_kept++;
-        entries.erase(entries.begin(), first_kept);
-    }
-    // Now limit results based on maximum reply size
-    // Sadly we have to calculate reply size by hand
-    // reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes
-    int reply_size = 20;
-    if (reply_size > args->maxcount)
-    {
-        // Error, too small max reply size
-        if (is_plus)
-        {
-            READDIRPLUS3res reply = { .status = NFS3ERR_TOOSMALL };
-            rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIRPLUS3res, sizeof(READDIRPLUS3res));
-        }
-        else
-        {
-            READDIR3res reply = { .status = NFS3ERR_TOOSMALL };
-            rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIR3res, sizeof(READDIR3res));
-        }
-        return;
-    }
-    // 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
-    // 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
-    //   + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0)
-    //   + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes
-    bool eof = true;
-    for (auto it = entries.begin(); it != entries.end(); it++)
-    {
-        reply_size += 20+len_pad4(it->first.size())+(is_plus
-            ? 8+84+len_pad4(it->second.name_handle.post_op_fh3_u.handle.data.data_len) : 0);
-        if (reply_size > args->maxcount)
-        {
-            // Stop
-            entries.erase(it, entries.end());
-            eof = false;
-            break;
-        }
-    }
-    // Terminate the chain at the last entry which is actually returned
-    if (entries.end() != entries.begin())
-    {
-        auto last_it = entries.end();
-        last_it--;
-        if (is_plus)
-            ((entryplus3*)&last_it->second)->nextentry = NULL;
-        else
-            ((entry3*)&last_it->second)->nextentry = NULL;
-    }
-    // Send reply. The entry list must be NULL for an empty listing:
-    // entries.begin() equals entries.end() in that case and must not be dereferenced
-    entryplus3 *first_entry = entries.empty() ? NULL : &entries.begin()->second;
-    if (is_plus)
-    {
-        READDIRPLUS3res reply = { .status = NFS3_OK };
-        *(uint64_t*)(reply.READDIRPLUS3res_u.resok.cookieverf) = self->parent->dir_mod_rev.at(dir);
-        reply.READDIRPLUS3res_u.resok.reply.entries = first_entry;
-        reply.READDIRPLUS3res_u.resok.reply.eof = eof;
-        rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIRPLUS3res, sizeof(READDIRPLUS3res));
-    }
-    else
-    {
-        READDIR3res reply = { .status = NFS3_OK };
-        *(uint64_t*)(reply.READDIR3res_u.resok.cookieverf) = self->parent->dir_mod_rev.at(dir);
-        reply.READDIR3res_u.resok.reply.entries = (entry3*)first_entry;
-        reply.READDIR3res_u.resok.reply.eof = eof;
-        rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_READDIR3res, sizeof(READDIR3res));
-    }
-}
-
-// NFS3 READDIR handler: runs the shared listing code in plain (non-"plus") mode.
-// Always returns 0.
-static int nfs3_readdir_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    const bool with_attrs_and_handles = false;
-    nfs3_readdir_common(rpc, call, opaque, with_attrs_and_handles);
-    return 0;
-}
-
-// NFS3 READDIRPLUS handler: runs the shared listing code in "plus" mode,
-// where each entry also carries attributes and a file handle. Always returns 0.
-static int nfs3_readdirplus_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    const bool with_attrs_and_handles = true;
-    nfs3_readdir_common(rpc, call, opaque, with_attrs_and_handles);
-    return 0;
-}
-
-// Get file system statistics (NFS3 FSSTAT).
-// Byte and file counters are hard-coded constants; the attributes are those of
-// the root directory. Always returns 0.
-static int nfs3_fsstat_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    // `1 << 31` overflows a signed int before being widened to the 64-bit counters,
-    // so the shift is done in 64 bits
-    const uint64_t file_count = ((uint64_t)1) << 31;
-    FSSTAT3res reply = {};
-    reply.status = NFS3_OK;
-    reply.FSSTAT3res_u.resok.obj_attributes.attributes_follow = TRUE;
-    reply.FSSTAT3res_u.resok.obj_attributes.post_op_attr_u.attributes = get_dir_attributes(self, "");
-    reply.FSSTAT3res_u.resok.tbytes = 4096; // total bytes
-    reply.FSSTAT3res_u.resok.fbytes = 4096; // free bytes
-    reply.FSSTAT3res_u.resok.abytes = 4096; // available bytes
-    reply.FSSTAT3res_u.resok.tfiles = file_count; // total files
-    reply.FSSTAT3res_u.resok.ffiles = file_count; // free files
-    reply.FSSTAT3res_u.resok.afiles = file_count; // available files
-    reply.FSSTAT3res_u.resok.invarsec = 0;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_FSSTAT3res, sizeof(FSSTAT3res));
-    return 0;
-}
-
-// NFS3 FSINFO handler: reports static file system limits and properties.
-// Only the length of the root handle is validated (10 is the length of "roothandle");
-// any other length is rejected with NFS3ERR_INVAL. Always returns 0.
-static int nfs3_fsinfo_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    FSINFO3args *args = (FSINFO3args*)call->body.cbody.args;
-    // Zero-initialize: the error branch assigns only `status`, but the whole
-    // structure is handed to the encoder
-    FSINFO3res reply = {};
-    if (args->fsroot.data.data_len != 10)
-    {
-        // Example error
-        reply.status = NFS3ERR_INVAL;
-    }
-    else
-    {
-        // Fill info
-        reply.status = NFS3_OK;
-        reply.FSINFO3res_u.resok.obj_attributes.attributes_follow = TRUE;
-        reply.FSINFO3res_u.resok.obj_attributes.post_op_attr_u.attributes = get_dir_attributes(self, "");
-        // Read sizes: maximum, preferred, multiple
-        reply.FSINFO3res_u.resok.rtmax = 128*1024*1024;
-        reply.FSINFO3res_u.resok.rtpref = 128*1024*1024;
-        reply.FSINFO3res_u.resok.rtmult = 4096;
-        // Write sizes: maximum, preferred, multiple
-        reply.FSINFO3res_u.resok.wtmax = 128*1024*1024;
-        reply.FSINFO3res_u.resok.wtpref = 128*1024*1024;
-        reply.FSINFO3res_u.resok.wtmult = 4096;
-        reply.FSINFO3res_u.resok.dtpref = 128;
-        reply.FSINFO3res_u.resok.maxfilesize = 0x7fffffffffffffff;
-        reply.FSINFO3res_u.resok.time_delta.seconds = 1;
-        reply.FSINFO3res_u.resok.time_delta.nseconds = 0;
-        reply.FSINFO3res_u.resok.properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS;
-    }
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_FSINFO3res, sizeof(FSINFO3res));
-    return 0;
-}
-
-// NFS3 PATHCONF handler: reports static path configuration values.
-// Only the length of the object handle is validated (10 is the length of "roothandle");
-// any other length is rejected with NFS3ERR_INVAL. Always returns 0.
-static int nfs3_pathconf_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    PATHCONF3args *args = (PATHCONF3args*)call->body.cbody.args;
-    // Zero-initialize: the error branch assigns only `status`, but the whole
-    // structure is handed to the encoder
-    PATHCONF3res reply = {};
-    if (args->object.data.data_len != 10)
-    {
-        // Example error
-        reply.status = NFS3ERR_INVAL;
-    }
-    else
-    {
-        // Fill info
-        reply.status = NFS3_OK;
-        reply.PATHCONF3res_u.resok.obj_attributes.attributes_follow = FALSE;
-        reply.PATHCONF3res_u.resok.linkmax = 0;
-        reply.PATHCONF3res_u.resok.name_max = 255;
-        reply.PATHCONF3res_u.resok.no_trunc = TRUE;
-        reply.PATHCONF3res_u.resok.chown_restricted = FALSE;
-        reply.PATHCONF3res_u.resok.case_insensitive = FALSE;
-        reply.PATHCONF3res_u.resok.case_preserving = TRUE;
-    }
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PATHCONF3res, sizeof(PATHCONF3res));
-    return 0;
-}
-
-// NFS3 COMMIT handler. Nothing is flushed: a zero-filled result is sent back
-// for every request. Always returns 0.
-static int nfs3_commit_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Just pretend we did fsync :-)
-    COMMIT3res commit_res = {};
-    rpc_send_reply(rpc, call, &commit_res, (zdrproc_t)zdr_COMMIT3res, sizeof(COMMIT3res));
-    return 0;
-}
-
-// MOUNT3 MNT handler. The requested path is not inspected: every mount request
-// succeeds and receives the fixed root handle "roothandle" with a single
-// AUTH_NONE authentication flavor. Always returns 0.
-static int mount3_mnt_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    int flavor = AUTH_NONE;
-    mountres3 reply = {};
-    reply.fhs_status = MNT3_OK;
-    reply.mountres3_u.mountinfo.fhandle.fhandle3_len = 10;
-    // Explicit cast: a string literal does not convert to a non-const char* implicitly
-    reply.mountres3_u.mountinfo.fhandle.fhandle3_val = (char*)"roothandle";
-    reply.mountres3_u.mountinfo.auth_flavors.auth_flavors_len = 1;
-    reply.mountres3_u.mountinfo.auth_flavors.auth_flavors_val = &flavor;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_mountres3, sizeof(mountres3));
-    return 0;
-}
-
-// MOUNT3 DUMP handler: returns the list of mounts.
-// The list is a single hard-coded entry (host 127.0.0.1, directory /test).
-// Always returns 0.
-static int mount3_dump_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // The entry lives on the stack, so there is no allocation that can fail.
-    // Assumes rpc_send_reply() serializes the reply before returning - TODO confirm
-    struct mountbody entry = {};
-    entry.ml_hostname = (dirpath)"127.0.0.1";
-    entry.ml_directory = (dirpath)"/test";
-    entry.ml_next = NULL;
-    mountlist reply = &entry;
-    // Pass the list itself: with a NULL reply pointer the mountlist encoder has nothing to read
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_mountlist, sizeof(mountlist));
-    return 0;
-}
-
-// MOUNT3 UMNT handler. No mount state is tracked here, so the request is only
-// acknowledged with an empty (void) reply. Always returns 0.
-static int mount3_umnt_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // do nothing
-    void *no_payload = NULL;
-    rpc_send_reply(rpc, call, no_payload, (zdrproc_t)zdr_void, 0);
-    return 0;
-}
-
-// MOUNT3 UMNTALL handler. The request is only acknowledged with an empty (void)
-// reply. Always returns 0.
-static int mount3_umntall_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // do nothing
-    void *no_payload = NULL;
-    rpc_send_reply(rpc, call, no_payload, (zdrproc_t)zdr_void, 0);
-    return 0;
-}
-
-// MOUNT3 EXPORT handler: returns the list of exports.
-// The list is a single hard-coded export (/test) with a single group (127.0.0.1).
-// Always returns 0.
-static int mount3_export_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    // Both nodes live on the stack, so there is no allocation that can fail.
-    // Assumes rpc_send_reply() serializes the reply before returning - TODO confirm
-    struct groupnode group = {};
-    group.gr_name = (dirpath)"127.0.0.1";
-    group.gr_next = NULL;
-    struct exportnode node = {};
-    node.ex_dir = (dirpath)"/test";
-    node.ex_groups = &group;
-    node.ex_next = NULL;
-    exports reply = &node;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_exports, sizeof(exports));
-    return 0;
-}
-
-// Fills the NFS3 and MOUNT3 procedure tables of this client.
-// Each row lists the procedure number, its handler, the argument decoder with
-// the argument size, and `this`, which the handlers receive as `opaque`.
-nfs_client_t::nfs_client_t()
-{
-    // NFS v3 procedures
-    struct service_proc nfs_procs[] = {
-        {NFS3_NULL,        nfs3_null_proc,        (zdrproc_t)zdr_void,             0,                        this},
-        {NFS3_GETATTR,     nfs3_getattr_proc,     (zdrproc_t)zdr_GETATTR3args,     sizeof(GETATTR3args),     this},
-        {NFS3_SETATTR,     nfs3_setattr_proc,     (zdrproc_t)zdr_SETATTR3args,     sizeof(SETATTR3args),     this},
-        {NFS3_LOOKUP,      nfs3_lookup_proc,      (zdrproc_t)zdr_LOOKUP3args,      sizeof(LOOKUP3args),      this},
-        {NFS3_ACCESS,      nfs3_access_proc,      (zdrproc_t)zdr_ACCESS3args,      sizeof(ACCESS3args),      this},
-        {NFS3_READLINK,    nfs3_readlink_proc,    (zdrproc_t)zdr_READLINK3args,    sizeof(READLINK3args),    this},
-        {NFS3_READ,        nfs3_read_proc,        (zdrproc_t)zdr_READ3args,        sizeof(READ3args),        this},
-        {NFS3_WRITE,       nfs3_write_proc,       (zdrproc_t)zdr_WRITE3args,       sizeof(WRITE3args),       this},
-        {NFS3_CREATE,      nfs3_create_proc,      (zdrproc_t)zdr_CREATE3args,      sizeof(CREATE3args),      this},
-        {NFS3_MKDIR,       nfs3_mkdir_proc,       (zdrproc_t)zdr_MKDIR3args,       sizeof(MKDIR3args),       this},
-        {NFS3_SYMLINK,     nfs3_symlink_proc,     (zdrproc_t)zdr_SYMLINK3args,     sizeof(SYMLINK3args),     this},
-        {NFS3_MKNOD,       nfs3_mknod_proc,       (zdrproc_t)zdr_MKNOD3args,       sizeof(MKNOD3args),       this},
-        {NFS3_REMOVE,      nfs3_remove_proc,      (zdrproc_t)zdr_REMOVE3args,      sizeof(REMOVE3args),      this},
-        {NFS3_RMDIR,       nfs3_rmdir_proc,       (zdrproc_t)zdr_RMDIR3args,       sizeof(RMDIR3args),       this},
-        {NFS3_RENAME,      nfs3_rename_proc,      (zdrproc_t)zdr_RENAME3args,      sizeof(RENAME3args),      this},
-        {NFS3_LINK,        nfs3_link_proc,        (zdrproc_t)zdr_LINK3args,        sizeof(LINK3args),        this},
-        {NFS3_READDIR,     nfs3_readdir_proc,     (zdrproc_t)zdr_READDIR3args,     sizeof(READDIR3args),     this},
-        {NFS3_READDIRPLUS, nfs3_readdirplus_proc, (zdrproc_t)zdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), this},
-        {NFS3_FSSTAT,      nfs3_fsstat_proc,      (zdrproc_t)zdr_FSSTAT3args,      sizeof(FSSTAT3args),      this},
-        {NFS3_FSINFO,      nfs3_fsinfo_proc,      (zdrproc_t)zdr_FSINFO3args,      sizeof(FSINFO3args),      this},
-        {NFS3_PATHCONF,    nfs3_pathconf_proc,    (zdrproc_t)zdr_PATHCONF3args,    sizeof(PATHCONF3args),    this},
-        {NFS3_COMMIT,      nfs3_commit_proc,      (zdrproc_t)zdr_COMMIT3args,      sizeof(COMMIT3args),      this},
-    };
-    for (auto & proc: nfs_procs)
-        nfs3_pt.push_back(proc);
-    // MOUNT v3 procedures
-    struct service_proc mount_procs[] = {
-        {MOUNT3_NULL,    nfs3_null_proc,      (zdrproc_t)zdr_void,    0,               this},
-        {MOUNT3_MNT,     mount3_mnt_proc,     (zdrproc_t)zdr_dirpath, sizeof(dirpath), this},
-        {MOUNT3_DUMP,    mount3_dump_proc,    (zdrproc_t)zdr_void,    0,               this},
-        {MOUNT3_UMNT,    mount3_umnt_proc,    (zdrproc_t)zdr_dirpath, sizeof(dirpath), this},
-        {MOUNT3_UMNTALL, mount3_umntall_proc, (zdrproc_t)zdr_void,    0,               this},
-        {MOUNT3_EXPORT,  mount3_export_proc,  (zdrproc_t)zdr_void,    0,               this},
-    };
-    for (auto & proc: mount_procs)
-        nfs3_mount_pt.push_back(proc);
-}
-
-// Disconnects and destroys the RPC context of this client, if there is one.
-nfs_client_t::~nfs_client_t()
-{
-    if (!rpc)
-        return;
-    rpc_disconnect(rpc, NULL);
-    rpc_destroy_context(rpc);
-}
--- a/src/nfs_portmap.cpp
+++ b/src/nfs_portmap.cpp
@@ -1,172 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-//
-// Portmap service for NFS proxy
-
-#include <netinet/in.h>
-#include <string.h>
-
-#include "nfs_portmap.h"
-
-#include "libnfs-raw-portmap.h"
-
-#include "sha256.h"
-#include "base64.h"
-
-/*
- * NULL procedure (index 0), mandatory for every protocol and version.
- * Clients and rpcinfo call it to "ping" a service: a reply confirms that
- * the service is up and supports the requested version.
- * The reply carries no payload. Always returns 0.
- */
-static int pmap2_null_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    void *no_payload = NULL;
-    rpc_send_reply(rpc, call, no_payload, (zdrproc_t)zdr_void, 0);
-    return 0;
-}
-
-/*
- * Portmapper v2 GETPORT.
- * The client passes a program, a version and a protocol (tcp or udp);
- * the reply is the port the service is registered on,
- * or 0 when no matching registration exists.
- * Always returns 0.
- */
-static int pmap2_getport_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    portmap_service_t *self = (portmap_service_t *)opaque;
-    PMAP2GETPORTargs *args = (PMAP2GETPORTargs *)call->body.cbody.args;
-    const bool want_udp = (args->prot == IPPROTO_UDP);
-    // Search key: ipv6 = false sorts before ipv6 = true, so lower_bound() lands on
-    // the first registration with the same program, version and protocol, if any
-    portmap_id_t key = (portmap_id_t){
-        .prog = args->prog,
-        .vers = args->vers,
-        .udp = want_udp,
-        .ipv6 = false,
-    };
-    auto found = self->reg_ports.lower_bound(key);
-    uint32_t port = 0;
-    if (found != self->reg_ports.end())
-    {
-        const bool same_service = found->prog == args->prog && found->vers == args->vers;
-        if (same_service && found->udp == want_udp)
-            port = found->port;
-    }
-    rpc_send_reply(rpc, call, &port, (zdrproc_t)zdr_uint32_t, sizeof(uint32_t));
-    return 0;
-}
-
-/*
- * v2 DUMP.
- * This RPC returns a list of all IPv4 endpoints that are registered with
- * portmapper (IPv6 registrations are skipped).
- * Always returns 0.
- */
-static int pmap2_dump_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    portmap_service_t *self = (portmap_service_t *)opaque;
-    // Array sized for the worst case (no IPv6 registrations); elements are chained via `next`
-    pmap2_mapping_list *list = new pmap2_mapping_list[self->reg_ports.size()];
-    int i = 0;
-    for (auto it = self->reg_ports.begin(); it != self->reg_ports.end(); it++)
-    {
-        if (it->ipv6)
-            continue;
-        list[i] = {
-            .map = {
-                .prog = it->prog,
-                .vers = it->vers,
-                .prot = it->udp ? IPPROTO_UDP : IPPROTO_TCP,
-                .port = it->port,
-            },
-            .next = list+i+1,
-        };
-        i++;
-    }
-    // Terminate the chain; with nothing to report there is no last element to patch
-    if (i > 0)
-        list[i-1].next = NULL;
-    // Send reply
-    PMAP2DUMPres reply;
-    reply.list = i > 0 ? list : NULL;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PMAP2DUMPres, sizeof(PMAP2DUMPres));
-    reply.list = NULL;
-    // Allocated with new[], so it must be released with delete[]
-    delete[] list;
-    return 0;
-}
-
-/*
- * v3 GETADDR.
- * This is the lookup function for portmapper version 3.
- * The registration is matched by program, version and netid
- * ("tcp", "udp", "tcp6" or "udp6"); the reply is its address string,
- * or an empty string when there is no matching registration.
- * Always returns 0.
- */
-static int pmap3_getaddr_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    portmap_service_t *self = (portmap_service_t *)opaque;
-    PMAP3GETADDRargs *args = (PMAP3GETADDRargs *)call->body.cbody.args;
-    portmap_id_t ref = (portmap_id_t){
-        .prog = args->prog,
-        .vers = args->vers,
-        .udp = !strcmp(args->netid, "udp") || !strcmp(args->netid, "udp6"),
-        .ipv6 = !strcmp(args->netid, "tcp6") || !strcmp(args->netid, "udp6"),
-    };
-    auto it = self->reg_ports.lower_bound(ref);
-    PMAP3GETADDRres reply = {};
-    // Default to an empty string so the encoder never sees an uninitialized pointer
-    // NOTE(review): an empty address is presumably how "not registered" is reported - confirm against RFC 1833
-    reply.addr = (char*)"";
-    if (it != self->reg_ports.end() &&
-        it->prog == ref.prog && it->vers == ref.vers &&
-        it->udp == ref.udp && it->ipv6 == ref.ipv6)
-    {
-        reply.addr = (char*)it->addr.c_str();
-    }
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PMAP3GETADDRres, sizeof(PMAP3GETADDRres));
-    return 0;
-}
-
-/*
- * v3 DUMP.
- * This RPC returns a list of all endpoints that are registered with
- * portmapper.
- * Always returns 0.
- */
-static int pmap3_dump_proc(struct rpc_context *rpc, struct rpc_msg *call, void *opaque)
-{
-    portmap_service_t *self = (portmap_service_t *)opaque;
-    // One array element per registration; elements are chained via `next`
-    pmap3_mapping_list *list = new pmap3_mapping_list[self->reg_ports.size()];
-    int i = 0;
-    for (auto it = self->reg_ports.begin(); it != self->reg_ports.end(); it++)
-    {
-        list[i] = (pmap3_mapping_list){
-            .map = {
-                .prog  = it->prog,
-                .vers  = it->vers,
-                .netid = (char*)(it->ipv6
-                    ? (it->udp ? "udp6" : "tcp6")
-                    : (it->udp ? "udp" : "tcp")),
-                .addr  = (char*)it->addr.c_str(), // 0.0.0.0.port
-                .owner = (char*)it->owner.c_str(),
-            },
-            .next = list+i+1,
-        };
-        i++;
-    }
-    // Terminate the chain; with nothing registered there is no last element to patch
-    if (i > 0)
-        list[i-1].next = NULL;
-    // Send reply
-    PMAP3DUMPres reply;
-    reply.list = i > 0 ? list : NULL;
-    rpc_send_reply(rpc, call, &reply, (zdrproc_t)zdr_PMAP3DUMPres, sizeof(PMAP3DUMPres));
-    reply.list = NULL;
-    // Allocated with new[], so it must be released with delete[]
-    delete[] list;
-    return 0;
-}
-
-// Fills the portmapper v2 and v3 procedure tables.
-// Every row carries `this`, which the handlers receive as `opaque`.
-portmap_service_t::portmap_service_t()
-{
-    // Portmapper v2: NULL, GETPORT, DUMP
-    service_proc v2_procs[] = {
-        {PMAP2_NULL, pmap2_null_proc, (zdrproc_t)zdr_void, 0, this},
-        {PMAP2_GETPORT, pmap2_getport_proc, (zdrproc_t)zdr_PMAP2GETPORTargs, sizeof(PMAP2GETPORTargs), this},
-        {PMAP2_DUMP, pmap2_dump_proc, (zdrproc_t)zdr_void, 0, this},
-    };
-    for (auto & proc: v2_procs)
-        pmap2_pt.push_back(proc);
-    // Portmapper v3: NULL (shared with v2), GETADDR, DUMP
-    service_proc v3_procs[] = {
-        {PMAP3_NULL, pmap2_null_proc, (zdrproc_t)zdr_void, 0, this},
-        {PMAP3_GETADDR, pmap3_getaddr_proc, (zdrproc_t)zdr_PMAP3GETADDRargs, sizeof(PMAP3GETADDRargs), this},
-        {PMAP3_DUMP, pmap3_dump_proc, (zdrproc_t)zdr_void, 0, this},
-    };
-    for (auto & proc: v3_procs)
-        pmap3_pt.push_back(proc);
-}
-
-// Returns the SHA-256 digest of `str` as a 32-byte binary (not hex-encoded) string.
-std::string sha256(const std::string & str)
-{
-    // Output buffer for the 32 digest bytes
-    std::string digest(32, '\0');
-    SHA256_CTX state;
-    sha256_init(&state);
-    sha256_update(&state, (uint8_t*)str.data(), str.size());
-    sha256_final(&state, (uint8_t*)digest.data());
-    return digest;
-}
--- a/src/nfs_portmap.h
+++ b/src/nfs_portmap.h
@@ -1,41 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-//
-// Portmap service for NFS proxy
-
-#pragma once
-
-#include <string>
-#include <set>
-#include <vector>
-
-#include "nfsc/libnfs.h"
-#include "nfsc/libnfs-raw.h"
-
-// One registration in the portmapper.
-// Ordering (see operator <) uses prog, vers, udp and ipv6 only;
-// port, owner and addr do not take part in it.
-struct portmap_id_t
-{
-    // RPC program number and version
-    unsigned prog, vers;
-    // true for UDP, false for TCP
-    bool udp;
-    // true for IPv6 ("tcp6"/"udp6" netids), false for IPv4
-    bool ipv6;
-    // Port returned by v2 GETPORT
-    unsigned port;
-    // Owner string returned by v3 DUMP
-    std::string owner;
-    // Address string returned by v3 GETADDR and DUMP
-    std::string addr;
-};
-
-// Portmap service state: the set of registered endpoints plus
-// the procedure tables for portmapper versions 2 and 3.
-class portmap_service_t
-{
-public:
-    // Registered endpoints, ordered by operator < of portmap_id_t
-    std::set<portmap_id_t> reg_ports;
-    // Procedure table for portmapper v2
-    std::vector<service_proc> pmap2_pt;
-    // Procedure table for portmapper v3
-    std::vector<service_proc> pmap3_pt;
-    // Fills both procedure tables
-    portmap_service_t();
-};
-
-// Lexicographic ordering of registrations by (prog, vers, udp, ipv6).
-// The remaining fields are ignored, so two registrations which differ only
-// in port, owner or addr compare as equivalent.
-inline bool operator < (const portmap_id_t &a, const portmap_id_t &b)
-{
-    if (a.prog != b.prog)
-        return a.prog < b.prog;
-    if (a.vers != b.vers)
-        return a.vers < b.vers;
-    if (a.udp != b.udp)
-        return a.udp < b.udp;
-    return a.ipv6 < b.ipv6;
-}
-
-std::string sha256(const std::string & str);
--- a/src/nfs_proxy.cpp
+++ b/src/nfs_proxy.cpp
@@ -1,301 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-//
-// Simplified NFS proxy
-// Presents all images as files, stores small files directly in etcd
-// Keeps image list in memory and thus is unsuitable for a lot of files
-
-#include <netinet/tcp.h>
-#include <sys/epoll.h>
-#include <sys/poll.h>
-#include <unistd.h>
-#include <fcntl.h>
-//#include <signal.h>
-
-#include "libnfs-raw-mount.h"
-#include "libnfs-raw-nfs.h"
-#include "libnfs-raw-portmap.h"
-
-#include "addr_util.h"
-#include "base64.h"
-#include "nfs_proxy.h"
-
-const char *exe_name = NULL;
-
-nfs_proxy_t::~nfs_proxy_t()
-{
-    if (cli)
-        delete cli;
-    if (epmgr)
-        delete epmgr;
-    if (ringloop)
-        delete ringloop;
-}
-
-json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
-{
-    json11::Json::object cfg;
-    for (int i = 1; i < narg; i++)
-    {
-        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
-        {
-            printf(
-                "Vitastor NFS 3.0 proxy\n"
-                "(c) Vitaliy Filippov, 2021-2022 (VNPL-1.1)\n\n"
-                "USAGE:\n"
-                "  %s [--etcd_address ADDR] [OTHER OPTIONS]\n",
-                exe_name
-            );
-            exit(0);
-        }
-        else if (args[i][0] == '-' && args[i][1] == '-')
-        {
-            const char *opt = args[i]+2;
-            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
-        }
-    }
-    return cfg;
-}
-
-void nfs_proxy_t::run(json11::Json cfg)
-{
-    bind_address = cfg["bind_address"].string_value();
-    if (bind_address == "")
-        bind_address = "0.0.0.0";
-    // Create client
-    ringloop = new ring_loop_t(512);
-    epmgr = new epoll_manager_t(ringloop);
-    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
-    // We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
-    dir_mod_rev[""] = 0;
-    dir_ids[""] = 1;
-    assert(cli->st_cli.on_inode_change_hook == NULL);
-    cli->st_cli.on_inode_change_hook = [this](inode_t changed_inode, bool removed)
-    {
-        if (removed)
-        {
-            auto ino_it = hash_by_inode.find(changed_inode);
-            if (ino_it != hash_by_inode.end())
-            {
-                inode_by_hash.erase(ino_it->second);
-                hash_by_inode.erase(ino_it);
-            }
-            // FIXME also calculate dir_mod_rev
-        }
-        else
-        {
-            auto & inode_cfg = cli->st_cli.inode_config.at(changed_inode);
-            std::string name = inode_cfg.name;
-            if (name_prefix != "")
-            {
-                if (name.substr(0, name_prefix.size()) != name_prefix)
-                    return;
-                name = name.substr(name_prefix.size());
-            }
-            dir_mod_rev[""] = dir_mod_rev[""] < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_mod_rev[""];
-            std::string hash = "S"+base64_encode(sha256(name));
-            int pos = name.find('/');
-            while (pos >= 0)
-            {
-                std::string dir = name.substr(0, pos);
-                if (dir_ids.find(dir) == dir_ids.end())
-                    dir_ids[dir] = next_dir_id++;
-                dir_mod_rev[dir] = dir_mod_rev[dir] < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_mod_rev[dir];
-                dir_by_hash["S"+base64_encode(sha256(dir))] = dir;
-                int next = name.substr(pos+1).find('/');
-                pos = next < 0 ? -1 : pos+1+next;
-            }
-            auto hbi_it = hash_by_inode.find(changed_inode);
-            if (hbi_it != hash_by_inode.end() && hbi_it->second != hash)
-            {
-                // inode had a different name, remove old hash=>inode pointer
-                inode_by_hash.erase(hbi_it->second);
-            }
-            inode_by_hash[hash] = changed_inode;
-            hash_by_inode[changed_inode] = hash;
-        }
-    };
-    // Load image metadata
-    while (!cli->is_ready())
-    {
-        ringloop->loop();
-        if (cli->is_ready())
-            break;
-        ringloop->wait();
-    }
-    // Create portmap socket
-    int portmap_socket = create_and_bind_socket(bind_address, 111, 128, NULL);
-    fcntl(portmap_socket, F_SETFL, fcntl(portmap_socket, F_GETFL, 0) | O_NONBLOCK);
-    // Create NFS socket
-    int nfs_socket = create_and_bind_socket(bind_address, 2049, 128, NULL);
-    fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
-    // Self-register portmap and NFS
-    pmap.reg_ports.insert((portmap_id_t){
-        .prog = PMAP_PROGRAM,
-        .vers = PMAP_V2,
-        .port = 111,
-        .owner = "portmapper-service",
-        .addr = "0.0.0.0.0.111",
-    });
-    pmap.reg_ports.insert((portmap_id_t){
-        .prog = PMAP_PROGRAM,
-        .vers = PMAP_V3,
-        .port = 111,
-        .owner = "portmapper-service",
-        .addr = "0.0.0.0.0.111",
-    });
-    pmap.reg_ports.insert((portmap_id_t){
-        .prog = NFS_PROGRAM,
-        .vers = NFS_V3,
-        .port = 2049,
-        .owner = "nfs-server",
-        .addr = "0.0.0.0.0.2049",
-    });
-    pmap.reg_ports.insert((portmap_id_t){
-        .prog = MOUNT_PROGRAM,
-        .vers = MOUNT_V3,
-        .port = 2049,
-        .owner = "rpc.mountd",
-        .addr = "0.0.0.0.0.2049",
-    });
-    // Add FDs to epoll
-    epmgr->tfd->set_fd_handler(portmap_socket, false, [this](int portmap_socket, int epoll_events)
-    {
-        if (epoll_events & EPOLLRDHUP)
-        {
-            fprintf(stderr, "Listening portmap socket disconnected, exiting\n");
-            exit(1);
-        }
-        else
-        {
-            do_accept(portmap_socket);
-        }
-    });
-    epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
-    {
-        if (epoll_events & EPOLLRDHUP)
-        {
-            fprintf(stderr, "Listening portmap socket disconnected, exiting\n");
-            exit(1);
-        }
-        else
-        {
-            do_accept(nfs_socket);
-        }
-    });
-    if (cfg["foreground"].is_null())
-    {
-        daemonize();
-    }
-    while (true)
-    {
-        ringloop->loop();
-        ringloop->wait();
-    }
-    /*// Sync at the end
-    cluster_op_t *close_sync = new cluster_op_t;
-    close_sync->opcode = OSD_OP_SYNC;
-    close_sync->callback = [&stop](cluster_op_t *op)
-    {
-        stop = true;
-        delete op;
-    };
-    cli->execute(close_sync);*/
-    // Destroy the client
-    delete cli;
-    delete epmgr;
-    delete ringloop;
-    cli = NULL;
-    epmgr = NULL;
-    ringloop = NULL;
-}
-
-void nfs_proxy_t::do_accept(int listen_fd)
-{
-    struct sockaddr_storage addr;
-    socklen_t addr_size = sizeof(addr);
-    int nfs_fd = 0;
-    while ((nfs_fd = accept(listen_fd, (struct sockaddr *)&addr, &addr_size)) >= 0)
-    {
-        fprintf(stderr, "New client %d: connection from %s\n", nfs_fd, addr_to_string(addr).c_str());
-        fcntl(nfs_fd, F_SETFL, fcntl(nfs_fd, F_GETFL, 0) | O_NONBLOCK);
-        int one = 1;
-        setsockopt(nfs_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-        auto cli = new nfs_client_t();
-        cli->parent = this;
-        cli->nfs_fd = nfs_fd;
-        cli->rpc = rpc_init_server_context(nfs_fd);
-        if (!cli->rpc)
-        {
-            delete cli;
-            close(nfs_fd);
-            fprintf(stderr, "Failed to init libnfs server context\n");
-            exit(1);
-        }
-        // Use both portmap and NFS everywhere
-        rpc_register_service(cli->rpc, PMAP_PROGRAM, PMAP_V2, pmap.pmap2_pt.data(), pmap.pmap2_pt.size());
-        rpc_register_service(cli->rpc, PMAP_PROGRAM, PMAP_V3, pmap.pmap3_pt.data(), pmap.pmap3_pt.size());
-        rpc_register_service(cli->rpc, NFS_PROGRAM, NFS_V3, cli->nfs3_pt.data(), cli->nfs3_pt.size());
-        rpc_register_service(cli->rpc, MOUNT_PROGRAM, MOUNT_V3, cli->nfs3_mount_pt.data(), cli->nfs3_mount_pt.size());
-        epmgr->tfd->set_fd_handler(nfs_fd, true, [this, cli](int nfs_fd, int epoll_events)
-        {
-            // Handle incoming event
-            if (epoll_events & EPOLLRDHUP)
-            {
-                fprintf(stderr, "Client %d disconnected\n", nfs_fd);
-                epmgr->tfd->set_fd_handler(cli->nfs_fd, true, NULL);
-                delete cli;
-                close(nfs_fd);
-                return;
-            }
-            int revents = 0;
-            if (epoll_events & EPOLLIN)
-                revents |= POLLIN;
-            if (epoll_events & EPOLLOUT)
-                revents |= POLLOUT;
-            // Let libnfs process the event
-            if (rpc_service(cli->rpc, revents) < 0)
-            {
-                fprintf(stderr, "libnfs error: %s, disconnecting client %d\n", rpc_get_error(cli->rpc), nfs_fd);
-                epmgr->tfd->set_fd_handler(cli->nfs_fd, true, NULL);
-                delete cli;
-                close(nfs_fd);
-                return;
-            }
-            // FIXME Add/remove events based on rpc_which_events(rpc) ?
-        });
-    }
-    if (nfs_fd < 0 && errno != EAGAIN)
-    {
-        fprintf(stderr, "Failed to accept connection: %s\n", strerror(errno));
-        exit(1);
-    }
-}
-
-void nfs_proxy_t::daemonize()
-{
-    if (fork())
-        exit(0);
-    setsid();
-    if (fork())
-        exit(0);
-    if (chdir("/") != 0)
-        fprintf(stderr, "Warning: Failed to chdir into /\n");
-    close(0);
-    close(1);
-    close(2);
-    open("/dev/null", O_RDONLY);
-    open("/dev/null", O_WRONLY);
-    open("/dev/null", O_WRONLY);
-}
-
-int main(int narg, const char *args[])
-{
-    setvbuf(stdout, NULL, _IONBF, 0);
-    setvbuf(stderr, NULL, _IONBF, 0);
-    exe_name = args[0];
-    nfs_proxy_t *p = new nfs_proxy_t();
-    p->run(nfs_proxy_t::parse_args(narg, args));
-    delete p;
-    return 0;
-}
--- a/src/nfs_proxy.h
+++ b/src/nfs_proxy.h
@@ -1,47 +0,0 @@
-#pragma once
-
-#include "cluster_client.h"
-#include "epoll_manager.h"
-#include "nfs_portmap.h"
-
-#include "nfsc/libnfs-raw.h"
-
-class nfs_proxy_t
-{
-public:
-    std::string bind_address;
-    std::string name_prefix;
-    int fsid = 1;
-
-    portmap_service_t pmap;
-    ring_loop_t *ringloop = NULL;
-    epoll_manager_t *epmgr = NULL;
-    cluster_client_t *cli = NULL;
-
-    uint64_t next_dir_id = 2;
-    std::map<std::string, std::string> dir_by_hash;
-    std::map<std::string, uint64_t> dir_ids;
-    std::map<std::string, uint64_t> dir_mod_rev;
-    std::map<inode_t, std::string> hash_by_inode;
-    std::map<std::string, inode_t> inode_by_hash;
-
-    ~nfs_proxy_t();
-
-    static json11::Json::object parse_args(int narg, const char *args[]);
-    void run(json11::Json cfg);
-    void do_accept(int listen_fd);
-    void daemonize();
-};
-
-class nfs_client_t
-{
-public:
-    nfs_proxy_t *parent = NULL;
-    int nfs_fd;
-    struct rpc_context *rpc = NULL;
-    std::vector<service_proc> nfs3_pt;
-    std::vector<service_proc> nfs3_mount_pt;
-
-    nfs_client_t();
-    ~nfs_client_t();
-};
--- a/src/osd.cpp
+++ b/src/osd.cpp
@@ -57,11 +57,7 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
    if (this->config["osd_memlock"] == "true" || this->config["osd_memlock"] == "1" || this->config["osd_memlock"] == "yes")
    {
        // Lock all OSD memory if requested
-        if (mlockall(MCL_CURRENT|MCL_FUTURE
-#ifdef MCL_ONFAULT
-            | MCL_ONFAULT
-#endif
-            ) != 0)
+        if (mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT) != 0)
        {
            fprintf(stderr, "osd_memlock is set to true, but mlockall() failed: %s\n", strerror(errno));
            exit(-1);
@@ -200,7 +196,46 @@ void osd_t::bind_socket()

    // FIXME Support multiple listening sockets

-    listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
+    sockaddr addr;
+    if (!string_to_addr(bind_address, 0, bind_port, &addr))
+    {
+        throw std::runtime_error("bind address "+bind_address+" is not valid");
+    }
+
+    listen_fd = socket(addr.sa_family, SOCK_STREAM, 0);
+    if (listen_fd < 0)
+    {
+        throw std::runtime_error(std::string("socket: ") + strerror(errno));
+    }
+    int enable = 1;
+    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
+
+    if (bind(listen_fd, &addr, sizeof(addr)) < 0)
+    {
+        close(listen_fd);
+        throw std::runtime_error(std::string("bind: ") + strerror(errno));
+    }
+    if (bind_port == 0)
+    {
+        socklen_t len = sizeof(addr);
+        if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
+        {
+            close(listen_fd);
+            throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
+        }
+        listening_port = ntohs(((sockaddr_in*)&addr)->sin_port);
+    }
+    else
+    {
+        listening_port = bind_port;
+    }
+
+    if (listen(listen_fd, listen_backlog) < 0)
+    {
+        close(listen_fd);
+        throw std::runtime_error(std::string("listen: ") + strerror(errno));
+    }
+
    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);

    epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
--- a/src/osd.h
+++ b/src/osd.h
@@ -211,7 +211,7 @@ class osd_t
    // flushing, recovery and backfill
    void submit_pg_flush_ops(pg_t & pg);
    void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval);
-    bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
+    void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data);
    bool pick_next_recovery(osd_recovery_op_t &op);
    void submit_recovery_op(osd_recovery_op_t *op);
    bool continue_recovery();
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@@ -189,7 +189,7 @@ void osd_t::report_statistics()
    for (auto kv: bs->get_inode_space_stats())
    {
        pool_id_t pool_id = INODE_POOL(kv.first);
-        uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1));
+        uint64_t only_inode_num = (kv.first & ((1l << (64-POOL_ID_BITS)) - 1));
        if (!last_pool || pool_id != last_pool)
        {
            if (last_pool)
@@ -207,7 +207,7 @@ void osd_t::report_statistics()
    for (auto kv: inode_stats)
    {
        pool_id_t pool_id = INODE_POOL(kv.first);
-        uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1));
+        uint64_t only_inode_num = (kv.first & ((1l << (64-POOL_ID_BITS)) - 1));
        if (!last_pool || pool_id != last_pool)
        {
            if (last_pool)
@@ -457,8 +457,7 @@ void osd_t::renew_lease()
        if (err == "" && data["result"]["TTL"].string_value() == "")
        {
            // Die
-            fprintf(stderr, "Error refreshing etcd lease\n");
-            force_stop(1);
+            throw std::runtime_error("etcd lease has expired");
        }
        if (err != "")
        {
@@ -467,8 +466,7 @@ void osd_t::renew_lease()
            if (etcd_failed_attempts > st_cli.max_etcd_attempts)
            {
                // Die
-                fprintf(stderr, "Cluster connection failed\n");
-                force_stop(1);
+                throw std::runtime_error("Cluster connection failed");
            }
            // Retry
            tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@@ -47,8 +47,7 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
        if (l.second.size() > 0)
        {
            fb->flush_ops++;
-            if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data()))
-                return;
+            submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data());
        }
    }
    for (auto & l: fb->stable_lists)
@@ -56,8 +55,7 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
        if (l.second.size() > 0)
        {
            fb->flush_ops++;
-            if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data()))
-                return;
+            submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data());
        }
    }
 }
@@ -162,7 +160,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
    }
 }

-bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
+void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data)
 {
    osd_op_t *op = new osd_op_t();
    // Copy buffer so it gets freed along with the operation
@@ -190,8 +188,10 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
    else
    {
        // Peer
+        int peer_fd = msgr.osd_peer_fds[peer_osd];
        op->op_type = OSD_OP_OUT;
        op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
+        op->peer_fd = peer_fd;
        op->req = (osd_any_op_t){
            .sec_stab = {
                .header = {
@@ -207,21 +207,8 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
            handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
            delete op;
        };
-        auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd);
-        if (peer_fd_it != msgr.osd_peer_fds.end())
-        {
-            op->peer_fd = peer_fd_it->second;
-            msgr.outbox_push(op);
-        }
-        else
-        {
-            // Fail it immediately
-            op->reply.hdr.retval = -EPIPE;
-            op->callback(op);
-            return false;
-        }
+        msgr.outbox_push(op);
    }
-    return true;
 }

 bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
--- a/src/osd_id.h
+++ b/src/osd_id.h
@@ -9,7 +9,7 @@
 #define POOL_ID_MAX 0x10000
 #define POOL_ID_BITS 16
 #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
-#define INODE_NO_POOL(inode) (inode_t)(inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
+#define INODE_NO_POOL(inode) (inode_t)(inode & ((1l << (64-POOL_ID_BITS)) - 1))
 #define INODE_WITH_POOL(pool_id, inode) (((inode_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))

 // Pool ID is 16 bits long
--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@@ -29,10 +29,8 @@ void osd_t::handle_peers()
                    degraded_objects += p.second.degraded_objects.size();
                    if (p.second.state & PG_HAS_UNCLEAN)
                        peering_state = peering_state | OSD_FLUSHING_PGS;
-                    else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
+                    else if (p.second.state & PG_HAS_DEGRADED)
                        peering_state = peering_state | OSD_RECOVERING;
-                    ringloop->wakeup();
-                    return;
                }
                else
                {
@@ -342,7 +340,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
    else
    {
        // Peer
-        auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd));
+        auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
        op->peer_fd = cl->peer_fd;
@@ -396,9 +394,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        {
            if (op->bs_op->retval < 0)
            {
-                printf("Local OP_LIST failed: retval=%d\n", op->bs_op->retval);
-                force_stop(1);
-                return;
+                throw std::runtime_error("local OP_LIST failed");
            }
            add_bs_subop_stats(op);
            printf(
@@ -423,7 +419,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
        // Peer
        osd_op_t *op = new osd_op_t();
        op->op_type = OSD_OP_OUT;
-        op->peer_fd = msgr.osd_peer_fds.at(role_osd);
+        op->peer_fd = msgr.osd_peer_fds[role_osd];
        op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
--- a/src/osd_peering_pg.cpp
+++ b/src/osd_peering_pg.cpp
@@ -437,7 +437,7 @@ void pg_t::calc_object_states(int log_level)
    st.walk();
    if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
    {
-        assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
+        assert(epoch != ((1ul << PG_EPOCH_BITS)-1));
        epoch++;
    }
 }
--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@@ -194,22 +194,18 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
-        op_data->prev_set = pg.cur_set.data();
-        if (pg.state != PG_ACTIVE)
-        {
-            // PG may be degraded or have misplaced objects
-            op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
-        }
        if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
        {
            // Fast happy-path
            cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
-            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
+            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, pg.cur_set.data(), cur_op);
            op_data->st = 1;
        }
        else
        {
-            if (extend_missing_stripes(op_data->stripes, op_data->prev_set, op_data->pg_data_size, pg.pg_size) < 0)
+            // PG may be degraded or have misplaced objects
+            uint64_t* cur_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
+            if (extend_missing_stripes(op_data->stripes, cur_set, op_data->pg_data_size, pg.pg_size) < 0)
            {
                finish_op(cur_op, -EIO);
                return;
@@ -219,7 +215,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
            op_data->scheme = pg.scheme;
            op_data->degraded = 1;
            cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0);
-            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
+            submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, cur_set, cur_op);
            op_data->st = 1;
        }
    }
--- a/src/osd_primary_chain.cpp
+++ b/src/osd_primary_chain.cpp
@@ -246,6 +246,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                // Send to a remote OSD
                osd_op_t *subop = op_data->subops+subop_idx;
                subop->op_type = OSD_OP_OUT;
+                subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
                // FIXME: Use the pre-allocated buffer
                subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
                subop->req = (osd_any_op_t){
@@ -286,18 +287,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
                    }
                    handle_primary_subop(subop, cur_op);
                };
-                auto peer_fd_it = msgr.osd_peer_fds.find(subop_osd_num);
-                if (peer_fd_it != msgr.osd_peer_fds.end())
-                {
-                    subop->peer_fd = peer_fd_it->second;
-                    msgr.outbox_push(subop);
-                }
-                else
-                {
-                    // Fail it immediately
-                    subop->reply.hdr.retval = -EPIPE;
-                    subop->callback(subop);
-                }
+                msgr.outbox_push(subop);
                subop_idx++;
            }
            prev = i+1;
@@ -400,21 +390,18 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
            stripes[role].read_end = stripes[role].req_end;
        }
        uint64_t *cur_set = pg.cur_set.data();
-        if (pg.state != PG_ACTIVE)
+        if (pg.state != PG_ACTIVE && op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            pg_osd_set_state_t *object_state;
            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
-            if (op_data->scheme != POOL_SCHEME_REPLICATED)
+            if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
            {
-                if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
-                {
-                    free(op_data->chain_reads);
-                    op_data->chain_reads = NULL;
-                    finish_op(cur_op, -EIO);
-                    return -1;
-                }
-                op_data->degraded = 1;
+                free(op_data->chain_reads);
+                op_data->chain_reads = NULL;
+                finish_op(cur_op, -EIO);
+                return -1;
            }
+            op_data->degraded = 1;
        }
        if (op_data->scheme == POOL_SCHEME_REPLICATED)
        {
@@ -468,7 +455,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
        auto vo_it = pg.ver_override.find(cur_oid);
        uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
        uint64_t *cur_set = pg.cur_set.data();
-        if (pg.state != PG_ACTIVE)
+        if (pg.state != PG_ACTIVE && op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            pg_osd_set_state_t *object_state;
            cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@@ -182,6 +182,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
            else
            {
                subop->op_type = OSD_OP_OUT;
+                subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
                subop->bitmap = stripes[stripe_num].bmp_buf;
                subop->bitmap_len = clean_entry_bitmap_size;
                subop->req.sec_rw = {
@@ -224,18 +225,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
                {
                    handle_primary_subop(subop, cur_op);
                };
-                auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num);
-                if (peer_fd_it != msgr.osd_peer_fds.end())
-                {
-                    subop->peer_fd = peer_fd_it->second;
-                    msgr.outbox_push(subop);
-                }
-                else
-                {
-                    // Fail it immediately
-                    subop->reply.hdr.retval = -EPIPE;
-                    subop->callback(subop);
-                }
+                msgr.outbox_push(subop);
            }
            i++;
        }
@@ -473,6 +463,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
        else
        {
            subops[i].op_type = OSD_OP_OUT;
+            subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
            subops[i].req = (osd_any_op_t){ .sec_del = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -486,18 +477,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
            {
                handle_primary_subop(subop, cur_op);
            };
-            auto peer_fd_it = msgr.osd_peer_fds.find(chunk.osd_num);
-            if (peer_fd_it != msgr.osd_peer_fds.end())
-            {
-                subops[i].peer_fd = peer_fd_it->second;
-                msgr.outbox_push(&subops[i]);
-            }
-            else
-            {
-                // Fail it immediately
-                subops[i].reply.hdr.retval = -EPIPE;
-                subops[i].callback(&subops[i]);
-            }
+            msgr.outbox_push(&subops[i]);
        }
    }
 }
@@ -587,6 +567,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
        else
        {
            subops[i].op_type = OSD_OP_OUT;
+            subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
            subops[i].req = (osd_any_op_t){ .sec_stab = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
@@ -600,18 +581,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
            {
                handle_primary_subop(subop, cur_op);
            };
-            auto peer_fd_it = msgr.osd_peer_fds.find(stab_osd.osd_num);
-            if (peer_fd_it != msgr.osd_peer_fds.end())
-            {
-                subops[i].peer_fd = peer_fd_it->second;
-                msgr.outbox_push(&subops[i]);
-            }
-            else
-            {
-                // Fail it immediately
-                subops[i].reply.hdr.retval = -EPIPE;
-                subops[i].callback(&subops[i]);
-            }
+            msgr.outbox_push(&subops[i]);
        }
    }
 }
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@@ -144,9 +144,9 @@ resume_3:
    }
    else
    {
-        if ((op_data->fact_ver & ((uint64_t)1 << (64-PG_EPOCH_BITS) - 1)) == ((uint64_t)1 << (64-PG_EPOCH_BITS) - 1))
+        if ((op_data->fact_ver & (1ul<<(64-PG_EPOCH_BITS) - 1)) == (1ul<<(64-PG_EPOCH_BITS) - 1))
        {
-            assert(pg.epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
+            assert(pg.epoch != ((1ul << PG_EPOCH_BITS)-1));
            pg.epoch++;
        }
        op_data->target_ver = op_data->fact_ver + 1;
--- a/src/osd_rmw.h
+++ b/src/osd_rmw.h
@@ -8,7 +8,7 @@
 #include "osd_id.h"

 #ifndef MEM_ALIGNMENT
-#define MEM_ALIGNMENT 4096
+#define MEM_ALIGNMENT 512
 #endif

 struct buf_len_t
--- a/src/osd_test.cpp
+++ b/src/osd_test.cpp
@@ -134,14 +134,14 @@ int main(int narg, char *args[])

 int connect_osd(const char *osd_address, int osd_port)
 {
-    struct sockaddr_storage addr;
+    struct sockaddr addr;
    if (!string_to_addr(osd_address, 0, osd_port, &addr))
    {
        fprintf(stderr, "server address: %s is not valid\n", osd_address);
        return -1;
    }

-    int connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    int connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
    if (connect_fd < 0)
    {
        perror("socket");
--- a/src/qemu_driver.c
+++ b/src/qemu_driver.c
@@ -262,7 +262,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
        client->pool = qdict_get_try_int(options, "pool", 0);
        if (client->pool)
        {
-            client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
+            client->inode = (client->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
        }
        client->size = qdict_get_try_int(options, "size", 0);
    }
--- a/src/sha256.c
+++ b/src/sha256.c
@@ -1,158 +0,0 @@
-/*********************************************************************
-* Filename:   sha256.c
-* Author:     Brad Conte (brad AT bradconte.com)
-* Copyright:
-* Disclaimer: This code is presented "as is" without any guarantees.
-* Details:    Implementation of the SHA-256 hashing algorithm.
-              SHA-256 is one of the three algorithms in the SHA2
-              specification. The others, SHA-384 and SHA-512, are not
-              offered in this implementation.
-              Algorithm specification can be found here:
-               * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf
-              This implementation uses little endian byte order.
-*********************************************************************/
-
-/*************************** HEADER FILES ***************************/
-#include <stdlib.h>
-#include <memory.h>
-#include "sha256.h"
-
-/****************************** MACROS ******************************/
-#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
-#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))
-
-#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
-#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
-#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
-#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
-#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
-#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))
-
-/**************************** VARIABLES *****************************/
-static const WORD k[64] = {
-	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
-	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
-	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
-	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
-	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
-	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
-	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
-	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-};
-
-/*********************** FUNCTION DEFINITIONS ***********************/
-void sha256_transform(SHA256_CTX *ctx, const BYTE data[])
-{
-	WORD a, b, c, d, e, f, g, h, i, j, t1, t2, m[64];
-
-	for (i = 0, j = 0; i < 16; ++i, j += 4)
-		m[i] = (data[j] << 24) | (data[j + 1] << 16) | (data[j + 2] << 8) | (data[j + 3]);
-	for ( ; i < 64; ++i)
-		m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];
-
-	a = ctx->state[0];
-	b = ctx->state[1];
-	c = ctx->state[2];
-	d = ctx->state[3];
-	e = ctx->state[4];
-	f = ctx->state[5];
-	g = ctx->state[6];
-	h = ctx->state[7];
-
-	for (i = 0; i < 64; ++i) {
-		t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
-		t2 = EP0(a) + MAJ(a,b,c);
-		h = g;
-		g = f;
-		f = e;
-		e = d + t1;
-		d = c;
-		c = b;
-		b = a;
-		a = t1 + t2;
-	}
-
-	ctx->state[0] += a;
-	ctx->state[1] += b;
-	ctx->state[2] += c;
-	ctx->state[3] += d;
-	ctx->state[4] += e;
-	ctx->state[5] += f;
-	ctx->state[6] += g;
-	ctx->state[7] += h;
-}
-
-void sha256_init(SHA256_CTX *ctx)
-{
-	ctx->datalen = 0;
-	ctx->bitlen = 0;
-	ctx->state[0] = 0x6a09e667;
-	ctx->state[1] = 0xbb67ae85;
-	ctx->state[2] = 0x3c6ef372;
-	ctx->state[3] = 0xa54ff53a;
-	ctx->state[4] = 0x510e527f;
-	ctx->state[5] = 0x9b05688c;
-	ctx->state[6] = 0x1f83d9ab;
-	ctx->state[7] = 0x5be0cd19;
-}
-
-void sha256_update(SHA256_CTX *ctx, const BYTE data[], size_t len)
-{
-	WORD i;
-
-	for (i = 0; i < len; ++i) {
-		ctx->data[ctx->datalen] = data[i];
-		ctx->datalen++;
-		if (ctx->datalen == 64) {
-			sha256_transform(ctx, ctx->data);
-			ctx->bitlen += 512;
-			ctx->datalen = 0;
-		}
-	}
-}
-
-void sha256_final(SHA256_CTX *ctx, BYTE hash[])
-{
-	WORD i;
-
-	i = ctx->datalen;
-
-	// Pad whatever data is left in the buffer.
-	if (ctx->datalen < 56) {
-		ctx->data[i++] = 0x80;
-		while (i < 56)
-			ctx->data[i++] = 0x00;
-	}
-	else {
-		ctx->data[i++] = 0x80;
-		while (i < 64)
-			ctx->data[i++] = 0x00;
-		sha256_transform(ctx, ctx->data);
-		memset(ctx->data, 0, 56);
-	}
-
-	// Append to the padding the total message's length in bits and transform.
-	ctx->bitlen += ctx->datalen * 8;
-	ctx->data[63] = ctx->bitlen;
-	ctx->data[62] = ctx->bitlen >> 8;
-	ctx->data[61] = ctx->bitlen >> 16;
-	ctx->data[60] = ctx->bitlen >> 24;
-	ctx->data[59] = ctx->bitlen >> 32;
-	ctx->data[58] = ctx->bitlen >> 40;
-	ctx->data[57] = ctx->bitlen >> 48;
-	ctx->data[56] = ctx->bitlen >> 56;
-	sha256_transform(ctx, ctx->data);
-
-	// Since this implementation uses little endian byte ordering and SHA uses big endian,
-	// reverse all the bytes when copying the final state to the output hash.
-	for (i = 0; i < 4; ++i) {
-		hash[i]      = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
-		hash[i + 4]  = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
-		hash[i + 8]  = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
-		hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
-		hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
-		hash[i + 20] = (ctx->state[5] >> (24 - i * 8)) & 0x000000ff;
-		hash[i + 24] = (ctx->state[6] >> (24 - i * 8)) & 0x000000ff;
-		hash[i + 28] = (ctx->state[7] >> (24 - i * 8)) & 0x000000ff;
-	}
-}
--- a/src/sha256.h
+++ b/src/sha256.h
@@ -1,41 +0,0 @@
-/*********************************************************************
-* Filename:   sha256.h
-* Author:     Brad Conte (brad AT bradconte.com)
-* Copyright:
-* Disclaimer: This code is presented "as is" without any guarantees.
-* Details:    Defines the API for the corresponding SHA1 implementation.
-*********************************************************************/
-
-#ifndef SHA256_H
-#define SHA256_H
-
-/*************************** HEADER FILES ***************************/
-#include <stddef.h>
-
-/****************************** MACROS ******************************/
-#define SHA256_BLOCK_SIZE 32            // SHA256 outputs a 32 byte digest
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-/**************************** DATA TYPES ****************************/
-typedef unsigned char BYTE;             // 8-bit byte
-typedef unsigned int  WORD;             // 32-bit word, change to "long" for 16-bit machines
-
-typedef struct {
-	BYTE data[64];
-	WORD datalen;
-	unsigned long long bitlen;
-	WORD state[8];
-} SHA256_CTX;
-
-/*********************** FUNCTION DECLARATIONS **********************/
-void sha256_init(SHA256_CTX *ctx);
-void sha256_update(SHA256_CTX *ctx, const BYTE data[], size_t len);
-void sha256_final(SHA256_CTX *ctx, BYTE hash[]);
-
-#ifdef __cplusplus
-};
-#endif
-
-#endif   // SHA256_H
--- a/src/stub_bench.cpp
+++ b/src/stub_bench.cpp
@@ -67,14 +67,14 @@ int main(int narg, char *args[])

 int connect_stub(const char *server_address, int server_port)
 {
-    struct sockaddr_storage addr;
+    struct sockaddr addr;
    if (!string_to_addr(server_address, 0, server_port, &addr))
    {
        fprintf(stderr, "server address: %s is not valid\n", server_address);
        return -1;
    }

-    int connect_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    int connect_fd = socket(addr.sa_family, SOCK_STREAM, 0);
    if (connect_fd < 0)
    {
        perror("socket");
--- a/src/stub_osd.cpp
+++ b/src/stub_osd.cpp
@@ -41,19 +41,21 @@
 #include "rw_blocking.h"
 #include "osd_ops.h"

+int bind_stub(std::string bind_address, int bind_port);
+
 void run_stub(int peer_fd);

 int main(int narg, char *args[])
 {
-    int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
+    int listen_fd = bind_stub("0.0.0.0", 11203);
    // Accept new connections
-    sockaddr_storage addr;
+    sockaddr addr;
    socklen_t peer_addr_size = sizeof(addr);
    int peer_fd;
    while (1)
    {
        printf("stub_osd: waiting for 1 client\n");
-        peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size);
+        peer_fd = accept(listen_fd, &addr, &peer_addr_size);
        if (peer_fd == -1)
        {
            if (errno == EAGAIN)
@@ -74,6 +76,47 @@ int main(int narg, char *args[])
    return 0;
 }

+// Create a TCP socket bound to <bind_address>:<bind_port> and put it into listening state.
+// Returns the listening descriptor. Throws std::runtime_error if the address is rejected
+// by string_to_addr() or if socket(), bind() or listen() fails.
+int bind_stub(std::string bind_address, int bind_port)
+{
+    int listen_backlog = 128;
+
+    // sockaddr_storage has room for every address family; a plain sockaddr (16 bytes)
+    // is too short to hold an AF_INET6 address or to be passed to bind() for one
+    sockaddr_storage addr = {};
+    if (!string_to_addr(bind_address, 0, bind_port, (sockaddr*)&addr))
+    {
+        throw std::runtime_error("bind address "+bind_address+" is not valid");
+    }
+
+    int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    if (listen_fd < 0)
+    {
+        throw std::runtime_error(std::string("socket: ") + strerror(errno));
+    }
+    int enable = 1;
+    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
+
+    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
+    {
+        // close() may overwrite errno, so remember the reason of the failure first
+        int err = errno;
+        close(listen_fd);
+        throw std::runtime_error(std::string("bind: ") + strerror(err));
+    }
+
+    if (listen(listen_fd, listen_backlog) < 0)
+    {
+        int err = errno;
+        close(listen_fd);
+        throw std::runtime_error(std::string("listen: ") + strerror(err));
+    }
+
+    return listen_fd;
+}
+
 void run_stub(int peer_fd)
 {
    osd_any_op_t op;
--- a/src/stub_uring_osd.cpp
+++ b/src/stub_uring_osd.cpp
@@ -25,6 +25,8 @@
 #include "epoll_manager.h"
 #include "messenger.h"

+int bind_stub(std::string bind_address, int bind_port);
+
 void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);

 int main(int narg, char *args[])
@@ -41,8 +43,7 @@ int main(int narg, char *args[])
    json11::Json config = json11::Json::object { { "log_level", 1 } };
    msgr->parse_config(config);
    // Accept new connections
-    int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
-    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
+    int listen_fd = bind_stub("0.0.0.0", 11203);
    epmgr->set_fd_handler(listen_fd, false, [listen_fd, msgr](int fd, int events)
    {
        msgr->accept_connections(listen_fd);
@@ -66,6 +67,50 @@ int main(int narg, char *args[])
    return 0;
 }

+// Create a non-blocking TCP socket bound to <bind_address>:<bind_port> and put it into
+// listening state. Returns the listening descriptor. Throws std::runtime_error if the
+// address is rejected by string_to_addr() or if socket(), bind() or listen() fails.
+int bind_stub(std::string bind_address, int bind_port)
+{
+    int listen_backlog = 128;
+
+    // sockaddr_storage has room for every address family; a plain sockaddr (16 bytes)
+    // is too short to hold an AF_INET6 address or to be passed to bind() for one
+    sockaddr_storage addr = {};
+    if (!string_to_addr(bind_address, 0, bind_port, (sockaddr*)&addr))
+    {
+        throw std::runtime_error("bind address "+bind_address+" is not valid");
+    }
+
+    int listen_fd = socket(addr.ss_family, SOCK_STREAM, 0);
+    if (listen_fd < 0)
+    {
+        throw std::runtime_error(std::string("socket: ") + strerror(errno));
+    }
+    int enable = 1;
+    setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
+
+    if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
+    {
+        // close() may overwrite errno, so remember the reason of the failure first
+        int err = errno;
+        close(listen_fd);
+        throw std::runtime_error(std::string("bind: ") + strerror(err));
+    }
+
+    if (listen(listen_fd, listen_backlog) < 0)
+    {
+        int err = errno;
+        close(listen_fd);
+        throw std::runtime_error(std::string("listen: ") + strerror(err));
+    }
+
+    // Switch the listening socket to non-blocking mode
+    fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
+
+    return listen_fd;
+}
+
 void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op)
 {
    op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
--- a/src/test_shit.cpp
+++ b/src/test_shit.cpp
@@ -406,7 +406,7 @@ uint64_t crush(uint64_t key, int count, uint64_t *weights)
        seed = (key + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
        seed ^= (j + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
        seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-        seed = -log(((double)seed) / ((uint64_t)1 << 32) / ((uint64_t)1 << 32)) * weights[j];
+        seed = -log(((double)seed) / (1ull << 32) / (1ull << 32)) * weights[j];
        if (seed > max)
        {
            max = seed;
@@ -439,8 +439,8 @@ void crush3(uint64_t key, int count, uint64_t *weights, uint64_t *r, uint64_t to
                seed ^= (k2 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
                seed ^= (k3 + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
                seed = 2862933555777941757ull*seed + 3037000493ull; // LCPRNG
-                //seed = ((double)seed) / ((uint64_t)1 << 32) / ((uint64_t)1 << 32) * (weights[k1] + weights[k2] + weights[k3]);
-                seed = ((double)seed) / ((uint64_t)1 << 32) / ((uint64_t)1 << 32) * (1 -
+                //seed = ((double)seed) / (1ull << 32) / (1ull << 32) * (weights[k1] + weights[k2] + weights[k3]);
+                seed = ((double)seed) / (1ull << 32) / (1ull << 32) * (1 -
                    (1 - 1.0*weights[k1]/total_weight)*
                    (1 - 1.0*weights[k2]/total_weight)*
                    (1 - 1.0*weights[k3]/total_weight)
--- a/src/vitastor.pc.in
+++ b/src/vitastor.pc.in
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

 Name: Vitastor
 Description: Vitastor client library
-Version: 0.6.16
+Version: 0.6.12
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}

--- a/tests/run_7osds.sh
+++ b/tests/run_7osds.sh
@@ -1,68 +0,0 @@
-#!/bin/bash
-
-. `dirname $0`/common.sh
-
-if [ "$IMMEDIATE_COMMIT" != "" ]; then
-    NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
-    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5,"immediate_commit":"all"}'
-else
-    NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
-    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5}'
-fi
-
-OSD_SIZE=1024
-OSD_COUNT=7
-OSD_ARGS=
-for i in $(seq 1 $OSD_COUNT); do
-    dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
-    build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
-    eval OSD${i}_PID=$!
-done
-
-cd mon
-npm install
-cd ..
-node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
-MON_PID=$!
-
-if [ "$EC" != "" ]; then
-    POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
-    PG_SIZE=3
-else
-    POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
-    PG_SIZE=2
-fi
-$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":32,"failure_domain":"osd"}}'
-
-sleep 2
-
-if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == '$PG_SIZE') | length) == 32'); then
-    format_error "FAILED: 32 PGS NOT CONFIGURED"
-fi
-
-if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32'); then
-    format_error "FAILED: 32 PGS NOT UP"
-fi
-
-try_reweight()
-{
-    osd=$1
-    w=$2
-    $ETCDCTL put /vitastor/config/osd/$osd '{"reweight":'$w'}'
-    sleep 3
-}
-
-wait_finish_rebalance()
-{
-    sec=$1
-    i=0
-    while [[ $i -lt $sec ]]; do
-        ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
-            break
-        if [ $i -eq 60 ]; then
-            format_error "Rebalance couldn't finish in $sec seconds"
-        fi
-        sleep 1
-        i=$((i+1))
-    done
-}
--- a/tests/run_tests.sh
+++ b/tests/run_tests.sh
@@ -1,43 +0,0 @@
-#!/bin/bash -ex
-# Run all possible tests
-
-cd $(dirname $0)
-
-./test_add_osd.sh
-
-./test_cas.sh
-
-./test_change_pg_count.sh
-EC=1 ./test_change_pg_count.sh
-
-./test_change_pg_size.sh
-
-./test_etcd_fail.sh
-
-./test_failure_domain.sh
-
-./test_interrupted_rebalance.sh
-IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
-EC=1 ./test_interrupted_rebalance.sh
-EC=1 IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
-
-./test_minsize_1.sh
-
-./test_move_reappear.sh
-
-./test_rebalance_verify.sh
-IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
-EC=1 ./test_rebalance_verify.sh
-EC=1 IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
-
-./test_rm.sh
-
-./test_snapshot.sh
-SCHEME=replicated ./test_snapshot.sh
-
-./test_splitbrain.sh
-
-./test_write.sh
-SCHEME=replicated ./test_write.sh
-
-./test_write_no_same.sh
--- a/tests/test_change_pg_count.sh
+++ b/tests/test_change_pg_count.sh
@@ -78,16 +78,7 @@ try_change()
    fi

    # Check that no objects are lost !
-    # But note that reporting this information may take up to <etcd_report_interval+1> seconds
-    nobj=0
-    waittime=0
-    while [[ $nobj -ne $NOBJ && $waittime -lt 7 ]]; do
-        nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
-        if [[ $nobj -ne $NOBJ ]]; then
-            waittime=$((waittime+1))
-            sleep 1
-        fi
-    done
+    nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
    if [ "$nobj" -ne $NOBJ ]; then
        format_error "Data lost after changing PG count to $n: $NOBJ objects expected, but got $nobj"
    fi
--- a/tests/test_interrupted_rebalance.sh
+++ b/tests/test_interrupted_rebalance.sh
@@ -1,6 +1,41 @@
 #!/bin/bash -ex

-. `dirname $0`/run_7osds.sh
+. `dirname $0`/common.sh
+
+if [ "$IMMEDIATE_COMMIT" != "" ]; then
+    NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
+    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5,"immediate_commit":"all"}'
+else
+    NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
+    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5}'
+fi
+
+OSD_SIZE=1024
+OSD_COUNT=7
+OSD_ARGS=
+for i in $(seq 1 $OSD_COUNT); do
+    dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
+    build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
+    eval OSD${i}_PID=$!
+done
+
+cd mon
+npm install
+cd ..
+node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
+MON_PID=$!
+
+$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":32,"failure_domain":"osd"}}'
+
+sleep 2
+
+if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == 32'); then
+    format_error "FAILED: 32 PGS NOT CONFIGURED"
+fi
+
+if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32'); then
+    format_error "FAILED: 32 PGS NOT UP"
+fi

 IMG_SIZE=960

@@ -8,6 +43,14 @@ LD_PRELOAD=libasan.so.5 \
 fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=16 -fsync=16 -rw=write \
    -etcd=$ETCD_URL -pool=1 -inode=2 -size=${IMG_SIZE}M -cluster_log_level=10

+try_reweight()
+{
+    osd=$1
+    w=$2
+    $ETCDCTL put /vitastor/config/osd/$osd '{"reweight":'$w'}'
+    sleep 3
+}
+
 try_reweight 1 0

 try_reweight 2 0
@@ -29,7 +72,14 @@ try_reweight 4 1
 try_reweight 5 1

 # Wait for the rebalance to finish
-wait_finish_rebalance 60
+for i in {1..60}; do
+    ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
+        break
+    if [ $i -eq 60 ]; then
+        format_error "Rebalance couldn't finish in 60 seconds"
+    fi
+    sleep 1
+done

 # Check that PGs never had degraded objects !
 if grep has_degraded ./testdata/mon.log; then
--- a/tests/test_rebalance_verify.sh
+++ b/tests/test_rebalance_verify.sh
@@ -1,57 +0,0 @@
-#!/bin/bash -ex
-
-. `dirname $0`/run_7osds.sh
-
-IMG_SIZE=256
-
-$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"testimg","size":'$((IMG_SIZE*1024*1024))'}'
-
-NBD_DEV=$(sudo build/src/vitastor-nbd map --etcd_address $ETCD_URL --image testimg --logfile ./testdata/nbd.log &)
-
-trap "sudo build/src/vitastor-nbd unmap $NBD_DEV"'; kill -9 $(jobs -p)' EXIT
-
-sudo chown $(id -u) $NBD_DEV
-
-dd if=/dev/urandom of=./testdata/img1.bin bs=1M count=$IMG_SIZE
-
-dd if=./testdata/img1.bin of=$NBD_DEV bs=1M count=$IMG_SIZE oflag=direct
-
-verify() {
-    echo "Verifying before rebalance"
-    dd if=$NBD_DEV of=./testdata/img2.bin bs=1M count=$IMG_SIZE iflag=direct
-    diff ./testdata/img1.bin ./testdata/img2.bin
-
-    $ETCDCTL put /vitastor/config/osd/1 '{"reweight":'$1'}'
-    $ETCDCTL put /vitastor/config/osd/2 '{"reweight":'$1'}'
-    $ETCDCTL put /vitastor/config/osd/3 '{"reweight":'$1'}'
-    sleep 1
-
-    for i in {1..10000}; do
-        O=$(((RANDOM*RANDOM) % (IMG_SIZE*128)))
-        dd if=$NBD_DEV of=./testdata/img2.bin bs=4k seek=$O skip=$O count=1 iflag=direct conv=notrunc
-    done
-
-    echo "Verifying during rebalance"
-    diff ./testdata/img1.bin ./testdata/img2.bin
-
-    # Wait for the rebalance to finish
-    wait_finish_rebalance 60
-
-    echo "Verifying after rebalance"
-    dd if=$NBD_DEV of=./testdata/img2.bin bs=1M count=$IMG_SIZE iflag=direct
-    diff ./testdata/img1.bin ./testdata/img2.bin
-}
-
-# Verify with regular reads
-
-verify 0
-
-# Same with chained reads
-
-$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"testimg0","size":'$((IMG_SIZE*1024*1024))'}'
-$ETCDCTL put /vitastor/config/inode/1/2 '{"name":"testimg","size":'$((IMG_SIZE*1024*1024))',"parent_id":1}'
-sleep 1
-
-verify 1
-
-format_green OK