Return block NFS implementation back as an option too

Move KV FS header into a separate file
Implement packing small files into shared inodes
2024-02-24 11:46:34 +03:00 · 2024-02-24 11:46:34 +03:00 · 2024-02-24 11:46:33 +03:00 · 2024-02-24 11:45:59 +03:00 · 2024-02-24 11:44:26 +03:00 · 2024-02-24 11:43:21 +03:00
126 changed files with 7584 additions and 936 deletions
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@ -395,7 +395,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 3
+      timeout-minutes: 6
      run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
--- a/.gitea/workflows/tests-to-yaml.pl
+++ b/.gitea/workflows/tests-to-yaml.pl
@ -39,6 +39,10 @@ for my $line (<>)
                $test_name .= '_'.lc($1).'_'.$2;
            }
        }
+        if ($test_name eq 'test_snapshot_chain_ec')
+        {
+            $timeout = 6;
+        }
        $line =~ s!\./test_!/root/vitastor/tests/test_!;
        # Gitea CI doesn't support artifacts yet, lol
        #- name: Upload results
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "1.4.2")
+set(VERSION "1.4.7")

 add_subdirectory(src)
--- a/csi/Makefile
+++ b/csi/Makefile
@ -1,4 +1,4 @@
-VERSION ?= v1.4.2
+VERSION ?= v1.4.7

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.4.2
+          image: vitalif/vitastor-csi:v1.4.7
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@ -121,7 +121,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.4.2
+          image: vitalif/vitastor-csi:v1.4.7
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/src/config.go
+++ b/csi/src/config.go
@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.4.2"
+    vitastorCSIDriverVersion = "1.4.7"
 )

 // Config struct fills the parameters of request or user input
--- a/debian/changelog
+++ b/debian/changelog
@ -1,4 +1,4 @@
-vitastor (1.4.2-1) unstable; urgency=medium
+vitastor (1.4.7-1) unstable; urgency=medium

  * Bugfixes

--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@ -35,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.4.2; \
-    cd vitastor-1.4.2; \
+    cp -r /root/vitastor vitastor-1.4.7; \
+    cd vitastor-1.4.7; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@ -49,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.2.orig.tar.xz vitastor-1.4.2; \
-    cd vitastor-1.4.2; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.7.orig.tar.xz vitastor-1.4.7; \
+    cd vitastor-1.4.7; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config/monitor.en.md
+++ b/docs/config/monitor.en.md
@ -19,8 +19,8 @@ These parameters only apply to Monitors.
 ## etcd_mon_ttl

 - Type: seconds
- Default: 30
- Minimum: 10
+- Default: 5
+- Minimum: 1

 Monitor etcd lease refresh interval in seconds

--- a/docs/config/monitor.ru.md
+++ b/docs/config/monitor.ru.md
@ -19,8 +19,8 @@
 ## etcd_mon_ttl

 - Тип: секунды
- Значение по умолчанию: 30
- Минимальное значение: 10
+- Значение по умолчанию: 5
+- Минимальное значение: 1

 Интервал обновления etcd резервации (lease) монитором

--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@ -215,8 +215,8 @@ is scheduled.
 ## up_wait_retry_interval

 - Type: milliseconds
- Default: 500
- Minimum: 50
+- Default: 50
+- Minimum: 10
 - Can be changed online: yes

 OSDs respond to clients with a special error code when they receive I/O
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@ -224,8 +224,8 @@ OSD в любом случае согласовывают реальное зн
 ## up_wait_retry_interval

 - Тип: миллисекунды
- Значение по умолчанию: 500
- Минимальное значение: 50
+- Значение по умолчанию: 50
+- Минимальное значение: 10
 - Можно менять на лету: да

 Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@ -59,6 +59,7 @@ them, even without restarting by updating configuration in etcd.
 - [recovery_tune_client_util_high](#recovery_tune_client_util_high)
 - [recovery_tune_agg_interval](#recovery_tune_agg_interval)
 - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
+- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

 ## etcd_report_interval

@ -604,5 +605,14 @@ is usually fine.
 - Default: 10
 - Can be changed online: yes

-Minimum possible value for auto-tuned recovery_sleep_us. Values lower
-than this value are changed to 0.
+Minimum possible value for auto-tuned recovery_sleep_us. Lower values
+are changed to 0.
+
+## recovery_tune_sleep_cutoff_us
+
+- Type: microseconds
+- Default: 10000000
+- Can be changed online: yes
+
+Maximum possible value for auto-tuned recovery_sleep_us. Higher values
+are treated as outliers and ignored in aggregation.
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@ -60,6 +60,7 @@
 - [recovery_tune_client_util_high](#recovery_tune_client_util_high)
 - [recovery_tune_agg_interval](#recovery_tune_agg_interval)
 - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
+- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

 ## etcd_report_interval

@ -634,4 +635,14 @@ EC (кодов коррекции ошибок) с более, чем 1 диск
 - Можно менять на лету: да

 Минимальное возможное значение авто-подстроенного recovery_sleep_us.
-Значения ниже данного заменяются на 0.
+Меньшие значения заменяются на 0.
+
+## recovery_tune_sleep_cutoff_us
+
+- Тип: микросекунды
+- Значение по умолчанию: 10000000
+- Можно менять на лету: да
+
+Максимальное возможное значение авто-подстроенного recovery_sleep_us.
+Большие значения считаются случайными выбросами и игнорируются в
+усреднении.
--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@ -154,6 +154,9 @@ That is, if it becomes impossible to place PG data on at least (pg_minsize)
 OSDs, PG is deactivated for both read and write. So you know that a fresh
 write always goes to at least (pg_minsize) OSDs (disks).

+In other words, the difference between pg_size and pg_minsize is the number of disk
+failures the pool can tolerate without temporary (lasting [osd_out_time](monitor.en.md#osd_out_time)) downtime.
+
 FIXME: pg_minsize behaviour may be changed in the future to only make PGs
 read-only instead of deactivating them.

--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@ -157,6 +157,10 @@
 OSD, PG деактивируется на чтение и запись. Иными словами, всегда известно,
 что новые блоки данных всегда записываются как минимум на pg_minsize дисков.

+По сути, разница pg_size и pg_minsize задаёт число отказов дисков, которые пул
+может пережить без временной (на [osd_out_time](monitor.ru.md#osd_out_time))
+остановки обслуживания.
+
 FIXME: Поведение pg_minsize может быть изменено в будущем с полной деактивации
 PG на перевод их в режим только для чтения.

--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@ -731,8 +731,19 @@
  default: 10
  online: true
  info: |
-    Minimum possible value for auto-tuned recovery_sleep_us. Values lower
-    than this value are changed to 0.
+    Minimum possible value for auto-tuned recovery_sleep_us. Lower values
+    are changed to 0.
  info_ru: |
    Минимальное возможное значение авто-подстроенного recovery_sleep_us.
-    Значения ниже данного заменяются на 0.
+    Меньшие значения заменяются на 0.
+- name: recovery_tune_sleep_cutoff_us
+  type: us
+  default: 10000000
+  online: true
+  info: |
+    Maximum possible value for auto-tuned recovery_sleep_us. Higher values
+    are treated as outliers and ignored in aggregation.
+  info_ru: |
+    Максимальное возможное значение авто-подстроенного recovery_sleep_us.
+    Большие значения считаются случайными выбросами и игнорируются в
+    усреднении.
--- a/docs/usage/disk.en.md
+++ b/docs/usage/disk.en.md
@ -261,7 +261,7 @@ Options (see also [Cluster-Wide Disk Layout Parameters](../config/layout-cluster
 ```
 --object_size 128k       Set blockstore block size
 --bitmap_granularity 4k  Set bitmap granularity
--journal_size 16M       Set journal size
+--journal_size 32M       Set journal size
 --data_csum_type none    Set data checksum type (crc32c or none)
 --csum_block_size 4k     Set data checksum block size
 --device_block_size 4k   Set device block size
--- a/docs/usage/disk.ru.md
+++ b/docs/usage/disk.ru.md
@ -267,7 +267,7 @@ OSD отключены fsync-и.
 ```
 --object_size 128k       Размер блока хранилища
 --bitmap_granularity 4k  Гранулярность битовых карт
--journal_size 16M       Размер журнала
+--journal_size 32M       Размер журнала
 --data_csum_type none    Задать тип контрольных сумм (crc32c или none)
 --csum_block_size 4k     Задать размер блока расчёта контрольных сумм
 --device_block_size 4k   Размер блока устройства
--- a/mon/mon.js
+++ b/mon/mon.js
@ -675,7 +675,12 @@ class Mon
                {
                    this.parse_kv(e.kv);
                    const key = e.kv.key.substr(this.etcd_prefix.length);
-                    if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
+                    if (key.substr(0, 11) == '/osd/state/')
+                    {
+                        stats_changed = true;
+                        changed = true;
+                    }
+                    else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
                    {
                        stats_changed = true;
                    }
@ -1635,9 +1640,13 @@ class Mon
        }
        const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
        // Sum derived values instead of deriving summed
-        for (const osd in this.state.osd.stats)
+        for (const osd in this.state.osd.state)
        {
            const derived = this.prev_stats.osd_diff[osd];
+            if (!this.state.osd.state[osd] || !derived)
+            {
+                continue;
+            }
            for (const type in sum_diff)
            {
                for (const op in derived[type]||{})
@ -1738,9 +1747,13 @@ class Mon
            const used = this.state.pool.stats[pool_id].used_raw_tb;
            this.state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
        }
-        for (const osd_num in this.state.osd.inodestats)
+        for (const osd_num in this.state.osd.state)
        {
            const ist = this.state.osd.inodestats[osd_num];
+            if (!ist || !this.state.osd.state[osd_num])
+            {
+                continue;
+            }
            for (const pool_id in ist)
            {
                inode_stats[pool_id] = inode_stats[pool_id] || {};
@ -1756,9 +1769,14 @@ class Mon
                }
            }
        }
-        for (const osd in this.prev_stats.osd_diff)
+        for (const osd in this.state.osd.state)
        {
-            for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
+            const osd_diff = this.prev_stats.osd_diff[osd];
+            if (!osd_diff || !this.state.osd.state[osd])
+            {
+                continue;
+            }
+            for (const pool_id in osd_diff.inode_stats)
            {
                for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
                {
--- a/mon/package.json
+++ b/mon/package.json
@ -1,6 +1,6 @@
 {
  "name": "vitastor-mon",
-  "version": "1.4.2",
+  "version": "1.4.7",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '1.4.2'
+VERSION = '1.4.7'

 LOG = logging.getLogger(__name__)

--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-1.4.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.2$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.4.7/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.7$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@ -36,7 +36,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.2.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.7.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.2
+Version:        1.4.7
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.2.el7.tar.gz
+Source0:        vitastor-1.4.7.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.2.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.7.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.2
+Version:        1.4.7
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.2.el8.tar.gz
+Source0:        vitastor-1.4.7.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el9.Dockerfile
+++ b/rpm/vitastor-el9.Dockerfile
@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.2.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.7.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el9.spec
+++ b/rpm/vitastor-el9.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.2
+Version:        1.4.7
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.2.el9.tar.gz
+Source0:        vitastor-1.4.7.el9.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -16,8 +16,8 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="1.4.2")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
+add_definitions(-DVERSION="1.4.7")
+add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
 add_link_options(-fno-omit-frame-pointer)
 if (${WITH_ASAN})
 	add_definitions(-fsanitize=address)
@ -181,10 +181,48 @@ target_link_libraries(vitastor-nbd
 	vitastor_client
 )

+# libvitastor_kv.so
+add_library(vitastor_kv SHARED
+	kv_db.cpp
+	kv_db.h
+)
+target_link_libraries(vitastor_kv
+	vitastor_client
+)
+set_target_properties(vitastor_kv PROPERTIES VERSION ${VERSION} SOVERSION 0)
+
+# vitastor-kv
+add_executable(vitastor-kv
+	kv_cli.cpp
+)
+target_link_libraries(vitastor-kv
+	vitastor_kv
+)
+
+add_executable(vitastor-kv-stress
+	kv_stress.cpp
+)
+target_link_libraries(vitastor-kv-stress
+	vitastor_kv
+)
+
 # vitastor-nfs
 add_executable(vitastor-nfs
 	nfs_proxy.cpp
-	nfs_conn.cpp
+	nfs_block.cpp
+	nfs_kv.cpp
+	nfs_kv_create.cpp
+	nfs_kv_getattr.cpp
+	nfs_kv_link.cpp
+	nfs_kv_lookup.cpp
+	nfs_kv_read.cpp
+	nfs_kv_readdir.cpp
+	nfs_kv_remove.cpp
+	nfs_kv_rename.cpp
+	nfs_kv_setattr.cpp
+	nfs_kv_write.cpp
+	nfs_fsstat.cpp
+	nfs_mount.cpp
 	nfs_portmap.cpp
 	sha256.c
 	nfs/xdr_impl.cpp
@ -194,6 +232,7 @@ add_executable(vitastor-nfs
 )
 target_link_libraries(vitastor-nfs
 	vitastor_client
+	vitastor_kv
 )

 # vitastor-cli
--- a/src/blockstore_disk.cpp
+++ b/src/blockstore_disk.cpp
@ -108,6 +108,10 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
    {
        throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
    }
+    else if (journal_block_size > MAX_DATA_BLOCK_SIZE)
+    {
+        throw std::runtime_error("journal_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
+    }
    if (!meta_block_size)
    {
        meta_block_size = 4096;
@ -116,6 +120,10 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
    {
        throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
    }
+    else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
+    {
+        throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
+    }
    if (data_offset % disk_alignment)
    {
        throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
--- a/src/blockstore_flush.cpp
+++ b/src/blockstore_flush.cpp
@ -19,7 +19,6 @@ journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
    syncing_flushers = 0;
    // FIXME: allow to configure flusher_start_threshold and journal_trim_interval
    flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable);
-    journal_trim_interval = 512;
    journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
    trim_wanted = bs->journal.flush_journal ? 1 : 0;
    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size);
@ -94,7 +93,7 @@ void journal_flusher_t::loop()
 void journal_flusher_t::enqueue_flush(obj_ver_id ov)
 {
 #ifdef BLOCKSTORE_DEBUG
-    printf("enqueue_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+    printf("enqueue_flush %jx:%jx v%ju\n", ov.oid.inode, ov.oid.stripe, ov.version);
 #endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
@ -117,7 +116,7 @@ void journal_flusher_t::enqueue_flush(obj_ver_id ov)
 void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
 {
 #ifdef BLOCKSTORE_DEBUG
-    printf("unshift_flush %lx:%lx v%lu\n", ov.oid.inode, ov.oid.stripe, ov.version);
+    printf("unshift_flush %jx:%jx v%ju\n", ov.oid.inode, ov.oid.stripe, ov.version);
 #endif
    auto it = flush_versions.find(ov.oid);
    if (it != flush_versions.end())
@ -143,7 +142,7 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov, bool force)
 void journal_flusher_t::remove_flush(object_id oid)
 {
 #ifdef BLOCKSTORE_DEBUG
-    printf("undo_flush %lx:%lx\n", oid.inode, oid.stripe);
+    printf("undo_flush %jx:%jx\n", oid.inode, oid.stripe);
 #endif
    auto v_it = flush_versions.find(oid);
    if (v_it != flush_versions.end())
@ -184,8 +183,7 @@ void journal_flusher_t::mark_trim_possible()
    if (trim_wanted > 0)
    {
        dequeuing = true;
-        if (!journal_trim_counter)
-            journal_trim_counter = journal_trim_interval;
+        journal_trim_counter = 0;
        bs->ringloop->wakeup();
    }
 }
@ -235,7 +233,7 @@ void journal_flusher_t::dump_diagnostics()
        break;
    }
    printf(
-        "Flusher: queued=%ld first=%s%lx:%lx trim_wanted=%d dequeuing=%d trimming=%d cur=%d target=%d active=%d syncing=%d\n",
+        "Flusher: queued=%zd first=%s%jx:%jx trim_wanted=%d dequeuing=%d trimming=%d cur=%d target=%d active=%d syncing=%d\n",
        flush_queue.size(), unflushable_type, unflushable.oid.inode, unflushable.oid.stripe,
        trim_wanted, dequeuing, trimming, cur_flusher_count, target_flusher_count,
        active_flushers, syncing_flushers
@ -268,7 +266,7 @@ bool journal_flusher_t::try_find_other(std::map<obj_ver_id, dirty_entry>::iterat
 {
    int search_left = flush_queue.size() - 1;
 #ifdef BLOCKSTORE_DEBUG
-    printf("Flusher overran writers (%lx:%lx v%lu, dirty_start=%08lx) - searching for older flushes (%d left)\n",
+    printf("Flusher overran writers (%jx:%jx v%ju, dirty_start=%08jx) - searching for older flushes (%d left)\n",
        cur.oid.inode, cur.oid.stripe, cur.version, bs->journal.dirty_start, search_left);
 #endif
    while (search_left > 0)
@ -285,7 +283,7 @@ bool journal_flusher_t::try_find_other(std::map<obj_ver_id, dirty_entry>::iterat
                dirty_end->second.journal_sector < bs->journal.used_start))
            {
 #ifdef BLOCKSTORE_DEBUG
-                printf("Write %lx:%lx v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
+                printf("Write %jx:%jx v%ju is too new: offset=%08jx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
 #endif
                enqueue_flush(cur);
            }
@ -366,9 +364,10 @@ resume_0:
        !flusher->flush_queue.size() || !flusher->dequeuing)
    {
 stop_flusher:
-        if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0)
+        if (flusher->trim_wanted > 0 && cur.oid.inode != 0)
        {
            // Attempt forced trim
+            cur.oid = {};
            flusher->active_flushers++;
            goto trim_journal;
        }
@ -387,7 +386,7 @@ stop_flusher:
        if (repeat_it != flusher->sync_to_repeat.end())
        {
 #ifdef BLOCKSTORE_DEBUG
-            printf("Postpone %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
+            printf("Postpone %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
 #endif
            // We don't flush different parts of history of the same object in parallel
            // So we check if someone is already flushing this object
@ -416,12 +415,13 @@ stop_flusher:
                flusher->sync_to_repeat.erase(cur.oid);
                if (!flusher->try_find_other(dirty_end, cur))
                {
+                    cur.oid = {};
                    goto stop_flusher;
                }
            }
        }
 #ifdef BLOCKSTORE_DEBUG
-        printf("Flushing %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
+        printf("Flushing %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
 #endif
        flusher->active_flushers++;
        // Find it in clean_db
@ -448,7 +448,7 @@ stop_flusher:
                // Object not allocated. This is a bug.
                char err[1024];
                snprintf(
-                    err, 1024, "BUG: Object %lx:%lx v%lu that we are trying to flush is not allocated on the data device",
+                    err, 1024, "BUG: Object %jx:%jx v%ju that we are trying to flush is not allocated on the data device",
                    cur.oid.inode, cur.oid.stripe, cur.version
                );
                throw std::runtime_error(err);
@ -538,7 +538,7 @@ resume_2:
                clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
                if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
                {
-                    printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %lu (%lx:%lx v%lu) as old location of %lx:%lx\n",
+                    printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %ju (%jx:%jx v%ju) as old location of %jx:%jx\n",
                        old_clean_loc >> bs->dsk.block_order, old_entry->oid.inode, old_entry->oid.stripe,
                        old_entry->version, cur.oid.inode, cur.oid.stripe);
                    exit(1);
@ -571,7 +571,7 @@ resume_2:
        // Erase dirty_db entries
        bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
 #ifdef BLOCKSTORE_DEBUG
-        printf("Flushed %lx:%lx v%lu (%d copies, wr:%d, del:%d), %ld left\n", cur.oid.inode, cur.oid.stripe, cur.version,
+        printf("Flushed %jx:%jx v%ju (%d copies, wr:%d, del:%d), %zu left\n", cur.oid.inode, cur.oid.stripe, cur.version,
            copy_count, has_writes, has_delete, flusher->flush_queue.size());
 #endif
    release_oid:
@ -584,7 +584,8 @@ resume_2:
        flusher->sync_to_repeat.erase(repeat_it);
    trim_journal:
        // Clear unused part of the journal every <journal_trim_interval> flushes
-        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
+        if (bs->journal_trim_interval && !((++flusher->journal_trim_counter) % bs->journal_trim_interval) ||
+            flusher->trim_wanted > 0)
        {
    resume_26:
    resume_27:
@ -609,8 +610,8 @@ void journal_flusher_co::update_metadata_entry()
    {
        printf(
            has_delete
-                ? "Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx v%lu\n"
-                : "Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
+                ? "Fatal error (metadata corruption or bug): tried to delete metadata entry %ju (%jx:%jx v%ju) while deleting %jx:%jx v%ju\n"
+                : "Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %ju (%jx:%jx v%ju) with %jx:%jx v%ju\n",
            clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
            new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
        );
@ -710,7 +711,7 @@ bool journal_flusher_co::write_meta_block(flusher_meta_write_t & meta_block, int
    if (wait_state == wait_base)
        goto resume_0;
    await_sqe(0);
-    data->iov = (struct iovec){ meta_block.buf, bs->dsk.meta_block_size };
+    data->iov = (struct iovec){ meta_block.buf, (size_t)bs->dsk.meta_block_size };
    data->callback = simple_callback_w;
    my_uring_prep_writev(
        sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bs->dsk.meta_block_size + meta_block.sector
@ -760,7 +761,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
        {
            // If we encounter bad checksums during flush, we still update the bad block,
            // but intentionally mangle checksums to avoid hiding the corruption.
-            iovec iov = { .iov_base = v[i].buf, .iov_len = v[i].len };
+            iovec iov = { .iov_base = v[i].buf, .iov_len = (size_t)v[i].len };
            if (!(v[i].copy_flags & COPY_BUF_JOURNAL))
            {
                assert(!(v[i].offset % bs->dsk.csum_block_size));
@ -768,7 +769,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
                bs->verify_padded_checksums(new_clean_bitmap, new_clean_bitmap + 2*bs->dsk.clean_entry_bitmap_size,
                    v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
                {
-                    printf("Checksum mismatch in object %lx:%lx v%lu in data area at offset 0x%lx+0x%x: got %08x, expected %08x\n",
+                    printf("Checksum mismatch in object %jx:%jx v%ju in data area at offset 0x%jx+0x%x: got %08x, expected %08x\n",
                        cur.oid.inode, cur.oid.stripe, old_clean_ver, old_clean_loc, bad_block, calc_csum, stored_csum);
                    for (uint32_t j = 0; j < bs->dsk.csum_block_size; j += bs->dsk.bitmap_granularity)
                    {
@ -781,7 +782,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
            {
                bs->verify_journal_checksums(v[i].csum_buf, v[i].offset, &iov, 1, [&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
                {
-                    printf("Checksum mismatch in object %lx:%lx v%lu in journal at offset 0x%lx+0x%x (block offset 0x%lx): got %08x, expected %08x\n",
+                    printf("Checksum mismatch in object %jx:%jx v%ju in journal at offset 0x%jx+0x%x (block offset 0x%jx): got %08x, expected %08x\n",
                        cur.oid.inode, cur.oid.stripe, old_clean_ver,
                        v[i].disk_offset, bad_block, v[i].offset, calc_csum, stored_csum);
                    bad_block += (v[i].offset/bs->dsk.csum_block_size) * bs->dsk.csum_block_size;
@ -805,7 +806,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
            if (new_entry->oid != cur.oid)
            {
                printf(
-                    "Fatal error (metadata corruption or bug): tried to make holes in %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
+                    "Fatal error (metadata corruption or bug): tried to make holes in %ju (%jx:%jx v%ju) with %jx:%jx v%ju\n",
                    clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
                    new_entry->version, cur.oid.inode, cur.oid.stripe, cur.version
                );
@ -925,7 +926,7 @@ void journal_flusher_co::scan_dirty()
        {
            char err[1024];
            snprintf(
-                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu unstable state during flush: 0x%x",
+                err, 1024, "BUG: Unexpected dirty_entry %jx:%jx v%ju unstable state during flush: 0x%x",
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
            );
            throw std::runtime_error(err);
@ -1021,7 +1022,7 @@ void journal_flusher_co::scan_dirty()
            // May happen if the metadata entry is corrupt, but journal isn't
            // FIXME: Report corrupted object to the upper layer (OSD)
            printf(
-                "Warning: object %lx:%lx has overwrites, but doesn't have a clean version."
+                "Warning: object %jx:%jx has overwrites, but doesn't have a clean version."
                " Metadata is likely corrupted. Dropping object from the DB.\n",
                cur.oid.inode, cur.oid.stripe
            );
@ -1056,7 +1057,7 @@ void journal_flusher_co::scan_dirty()
        flusher->enqueue_flush(cur);
        cur.version = dirty_end->first.version;
 #ifdef BLOCKSTORE_DEBUG
-        printf("Partial checksum block overwrites found - rewinding flush back to %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
+        printf("Partial checksum block overwrites found - rewinding flush back to %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
 #endif
        v.clear();
        copy_count = 0;
@ -1084,7 +1085,7 @@ bool journal_flusher_co::read_dirty(int wait_base)
        auto & vi = v[v.size()-i];
        assert(vi.len != 0);
        vi.buf = memalign_or_die(MEM_ALIGNMENT, vi.len);
-        data->iov = (struct iovec){ vi.buf, vi.len };
+        data->iov = (struct iovec){ vi.buf, (size_t)vi.len };
        data->callback = simple_callback_r;
        my_uring_prep_readv(
            sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + old_clean_loc + vi.offset
@ -1208,7 +1209,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
            .usage_count = 1,
        }).first;
        await_sqe(0);
-        data->iov = (struct iovec){ wr.it->second.buf, bs->dsk.meta_block_size };
+        data->iov = (struct iovec){ wr.it->second.buf, (size_t)bs->dsk.meta_block_size };
        data->callback = simple_callback_r;
        wr.submitted = true;
        my_uring_prep_readv(
@ -1247,7 +1248,7 @@ void journal_flusher_co::free_data_blocks()
        auto uo_it = bs->used_clean_objects.find(old_clean_loc);
        bool used = uo_it != bs->used_clean_objects.end();
 #ifdef BLOCKSTORE_DEBUG
-        printf("%s block %lu from %lx:%lx v%lu (new location is %lu)\n",
+        printf("%s block %ju from %jx:%jx v%ju (new location is %ju)\n",
            used ? "Postpone free" : "Free",
            old_clean_loc >> bs->dsk.block_order,
            cur.oid.inode, cur.oid.stripe, cur.version,
@ -1264,7 +1265,7 @@ void journal_flusher_co::free_data_blocks()
        auto uo_it = bs->used_clean_objects.find(old_clean_loc);
        bool used = uo_it != bs->used_clean_objects.end();
 #ifdef BLOCKSTORE_DEBUG
-        printf("%s block %lu from %lx:%lx v%lu (delete)\n",
+        printf("%s block %ju from %jx:%jx v%ju (delete)\n",
            used ? "Postpone free" : "Free",
            old_clean_loc >> bs->dsk.block_order,
            cur.oid.inode, cur.oid.stripe, cur.version);
@ -1346,7 +1347,6 @@ bool journal_flusher_co::trim_journal(int wait_base)
    else if (wait_state == wait_base+2) goto resume_2;
    else if (wait_state == wait_base+3) goto resume_3;
    else if (wait_state == wait_base+4) goto resume_4;
-    flusher->journal_trim_counter = 0;
    new_trim_pos = bs->journal.get_trim_pos();
    if (new_trim_pos != bs->journal.used_start)
    {
@ -1378,7 +1378,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
                .csum_block_size = bs->dsk.csum_block_size,
            };
            ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
-            data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
+            data->iov = (struct iovec){ flusher->journal_superblock, (size_t)bs->dsk.journal_block_size };
            data->callback = simple_callback_w;
            my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
            wait_count++;
@ -1410,7 +1410,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
            }
            bs->journal.used_start = new_trim_pos;
 #ifdef BLOCKSTORE_DEBUG
-            printf("Journal trimmed to %08lx (next_free=%08lx dirty_start=%08lx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
+            printf("Journal trimmed to %08jx (next_free=%08jx dirty_start=%08jx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
 #endif
            if (bs->journal.flush_journal && !flusher->flush_queue.size())
            {
@ -1419,6 +1419,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
                exit(0);
            }
        }
+        flusher->journal_trim_counter = 0;
        flusher->trimming = false;
    }
    return true;
--- a/src/blockstore_flush.h
+++ b/src/blockstore_flush.h
@ -107,7 +107,7 @@ class journal_flusher_t
    blockstore_impl_t *bs;
    friend class journal_flusher_co;

-    int journal_trim_counter, journal_trim_interval;
+    int journal_trim_counter;
    bool trimming;
    void* journal_superblock;

--- a/src/blockstore_impl.cpp
+++ b/src/blockstore_impl.cpp
@ -195,6 +195,10 @@ void blockstore_impl_t::loop()
                    // ring is full, stop submission
                    break;
                }
+                else if (PRIV(op)->wait_for == WAIT_JOURNAL)
+                {
+                    PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
+                }
            }
        }
        if (op_idx != new_idx)
@ -265,7 +269,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
        {
            // stop submission if there's still no free space
 #ifdef BLOCKSTORE_DEBUG
-            printf("Still waiting for %lu SQE(s)\n", PRIV(op)->wait_detail);
+            printf("Still waiting for %ju SQE(s)\n", PRIV(op)->wait_detail);
 #endif
            return;
        }
@ -273,15 +277,15 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
    }
    else if (PRIV(op)->wait_for == WAIT_JOURNAL)
    {
-        if (journal.used_start == PRIV(op)->wait_detail && !unstable_count_changed)
+        if (journal.used_start == PRIV(op)->wait_detail &&
+            (unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
        {
            // do not submit
 #ifdef BLOCKSTORE_DEBUG
-            printf("Still waiting to flush journal offset %08lx\n", PRIV(op)->wait_detail);
+            printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
 #endif
            return;
        }
-        unstable_count_changed = false;
        flusher->release_trim();
        PRIV(op)->wait_for = 0;
    }
@ -353,7 +357,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
                    };
                }
                unstable_writes.clear();
-                unstable_count_changed = true;
                op->callback = [old_callback](blockstore_op_t *op)
                {
                    obj_ver_id *vers = (obj_ver_id*)op->buf;
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@ -202,7 +202,7 @@ struct blockstore_op_private_t
 {
    // Wait status
    int wait_for;
-    uint64_t wait_detail;
+    uint64_t wait_detail, wait_detail2;
    int pending_ops;
    int op_state;

@ -253,6 +253,7 @@ class blockstore_impl_t
    bool inmemory_meta = false;
    // Maximum and minimum flusher count
    unsigned max_flusher_count, min_flusher_count;
+    unsigned journal_trim_interval;
    // Maximum queue depth
    unsigned max_write_iodepth = 128;
    // Enable small (journaled) write throttling, useful for the SSD+HDD case
@ -276,7 +277,6 @@ class blockstore_impl_t
    std::vector<blockstore_op_t*> submit_queue;
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
    int unsynced_big_write_count = 0, unstable_unsynced = 0;
-    bool unstable_count_changed = false;
    int unsynced_queued_ops = 0;
    allocator *data_alloc = NULL;
    uint64_t used_blocks = 0;
--- a/src/blockstore_init.cpp
+++ b/src/blockstore_init.cpp
@ -63,7 +63,7 @@ int blockstore_init_meta::loop()
        throw std::runtime_error("Failed to allocate metadata read buffer");
    // Read superblock
    GET_SQE();
-    data->iov = { metadata_buffer, bs->dsk.meta_block_size };
+    data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
    data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
    my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
    bs->ringloop->submit();
@ -100,7 +100,7 @@ resume_1:
        {
            printf("Initializing metadata area\n");
            GET_SQE();
-            data->iov = (struct iovec){ metadata_buffer, bs->dsk.meta_block_size };
+            data->iov = (struct iovec){ metadata_buffer, (size_t)bs->dsk.meta_block_size };
            data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
            my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
            bs->ringloop->submit();
@ -153,7 +153,7 @@ resume_1:
        else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
        {
            printf(
-                "Metadata format is too new for me (stored version is %lu, max supported %u).\n",
+                "Metadata format is too new for me (stored version is %ju, max supported %u).\n",
                hdr->version, BLOCKSTORE_META_FORMAT_V2
            );
            exit(1);
@ -167,7 +167,7 @@ resume_1:
            printf(
                "Configuration stored in metadata superblock"
                " (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u, data_csum_type=%u, csum_block_size=%u)"
-                " differs from OSD configuration (%lu/%u/%lu, %u/%u).\n",
+                " differs from OSD configuration (%ju/%u/%ju, %u/%u).\n",
                hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
                hdr->data_csum_type, hdr->csum_block_size,
                bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity,
@ -199,7 +199,8 @@ resume_2:
                submitted++;
                next_offset += bufs[i].size;
                GET_SQE();
-                data->iov = { bufs[i].buf, bufs[i].size };
+                assert(bufs[i].size <= 0x7fffffff);
+                data->iov = { bufs[i].buf, (size_t)bufs[i].size };
                data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
                if (!zero_on_init)
                    my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
@ -231,7 +232,8 @@ resume_2:
            {
                // write the modified buffer back
                GET_SQE();
-                data->iov = { bufs[i].buf, bufs[i].size };
+                assert(bufs[i].size <= 0x7fffffff);
+                data->iov = { bufs[i].buf, (size_t)bufs[i].size };
                data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
                my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
                bufs[i].state = INIT_META_WRITING;
@ -257,7 +259,7 @@ resume_2:
            next_offset = entries_to_zero[i]/entries_per_block;
            for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
            GET_SQE();
-            data->iov = { metadata_buffer, bs->dsk.meta_block_size };
+            data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
            data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
            my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
            submitted++;
@ -273,7 +275,7 @@ resume_5:
                memset((uint8_t*)metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
            }
            GET_SQE();
-            data->iov = { metadata_buffer, bs->dsk.meta_block_size };
+            data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
            data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
            my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
            submitted++;
@ -287,7 +289,7 @@ resume_6:
        entries_to_zero.clear();
    }
    // metadata read finished
-    printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
+    printf("Metadata entries loaded: %ju, free blocks: %ju / %ju\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
    if (!bs->inmemory_meta)
    {
        free(metadata_buffer);
@ -328,7 +330,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
                uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
                if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
                {
-                    printf("Metadata entry %lu is corrupt (checksum mismatch), skipping\n", done_cnt+i);
+                    printf("Metadata entry %ju is corrupt (checksum mismatch), skipping\n", done_cnt+i);
                    continue;
                }
            }
@ -366,7 +368,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
                        entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
                    }
 #ifdef BLOCKSTORE_DEBUG
-                    printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
+                    printf("Free block %ju from %jx:%jx v%ju (new location is %ju)\n",
                        old_clean_loc,
                        clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
                        done_cnt+i);
@ -380,7 +382,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
                }
                entries_loaded++;
 #ifdef BLOCKSTORE_DEBUG
-                printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
+                printf("Allocate block (clean entry) %ju: %jx:%jx v%ju\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
 #endif
                bs->data_alloc->set(done_cnt+i, true);
                clean_db[entry->oid] = (struct clean_entry){
@ -394,7 +396,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
                updated = true;
                memset(entry, 0, bs->dsk.clean_entry_size);
 #ifdef BLOCKSTORE_DEBUG
-                printf("Old clean entry %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
+                printf("Old clean entry %ju: %jx:%jx v%ju\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
 #endif
            }
        }
@ -466,7 +468,7 @@ int blockstore_init_journal::loop()
    if (!sqe)
        throw std::runtime_error("io_uring is full while trying to read journal");
    data = ((ring_data_t*)sqe->user_data);
-    data->iov = { submitted_buf, bs->journal.block_size };
+    data->iov = { submitted_buf, (size_t)bs->journal.block_size };
    data->callback = simple_callback;
    my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
    bs->ringloop->submit();
@ -507,7 +509,7 @@ resume_1:
            // FIXME: Randomize initial crc32. Track crc32 when trimming.
            printf("Resetting journal\n");
            GET_SQE();
-            data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
+            data->iov = (struct iovec){ submitted_buf, (size_t)(2*bs->journal.block_size) };
            data->callback = simple_callback;
            my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
            wait_count++;
@ -557,7 +559,7 @@ resume_1:
            (je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
        {
            fprintf(
-                stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
+                stderr, "The code only supports journal versions 2 and 1, but it is %ju on disk."
                    " Please use vitastor-disk to rewrite the journal\n",
                je_start->size == JE_START_V0_SIZE ? 0 : je_start->version
            );
@ -606,7 +608,7 @@ resume_1:
                    submitted_buf = (uint8_t*)bs->journal.buffer + journal_pos;
                data->iov = {
                    submitted_buf,
-                    end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
+                    (size_t)(end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE),
                };
                data->callback = [this](ring_data_t *data1) { handle_event(data1); };
                my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
@ -622,7 +624,7 @@ resume_1:
                    if (init_write_buf && !bs->readonly)
                    {
                        GET_SQE();
-                        data->iov = { init_write_buf, bs->journal.block_size };
+                        data->iov = { init_write_buf, (size_t)bs->journal.block_size };
                        data->callback = simple_callback;
                        my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + init_write_sector);
                        wait_count++;
@ -691,7 +693,7 @@ resume_1:
            IS_BIG_WRITE(dirty_it->second.state) &&
            dirty_it->second.location == UINT64_MAX)
        {
-            printf("Fatal error (bug): %lx:%lx v%lu big_write journal_entry was allocated over another object\n",
+            printf("Fatal error (bug): %jx:%jx v%ju big_write journal_entry was allocated over another object\n",
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
            exit(1);
        }
@ -699,7 +701,7 @@ resume_1:
    bs->flusher->mark_trim_possible();
    bs->journal.dirty_start = bs->journal.next_free;
    printf(
-        "Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
+        "Journal entries loaded: %ju, free journal space: %ju bytes (%08jx..%08jx is used), free blocks: %ju / %ju\n",
        entries_loaded,
        (bs->journal.next_free >= bs->journal.used_start
            ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
@ -754,7 +756,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            {
 #ifdef BLOCKSTORE_DEBUG
                printf(
-                    "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u\n",
+                    "je_small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u\n",
                    je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
                    je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
                    je->small_write.offset, je->small_write.len
@ -776,7 +778,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                if (location != je->small_write.data_offset)
                {
                    char err[1024];
-                    snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
+                    snprintf(err, 1024, "BUG: calculated journal data offset (%08jx) != stored journal data offset (%08jx)", location, je->small_write.data_offset);
                    throw std::runtime_error(err);
                }
                small_write_data.clear();
@ -803,7 +805,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                            covered += part_end - part_begin;
                            small_write_data.push_back((iovec){
                                .iov_base = (uint8_t*)done[i].buf + part_begin - done[i].pos,
-                                .iov_len = part_end - part_begin,
+                                .iov_len = (size_t)(part_end - part_begin),
                            });
                        }
                    }
@ -826,7 +828,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    if (!data_csum_valid)
                    {
                        printf(
-                            "Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - data crc32 %x != %x\n",
+                            "Journal entry data is corrupt for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - data crc32 %x != %x\n",
                            je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
                            je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
                            je->small_write.offset, je->small_write.len,
@ -845,7 +847,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    if (je->size != required_size)
                    {
                        printf(
-                            "Journal entry data has invalid size for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - should be %u bytes but is %u bytes\n",
+                            "Journal entry data has invalid size for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - should be %u bytes but is %u bytes\n",
                            je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
                            je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
                            je->small_write.offset, je->small_write.len,
@ -893,7 +895,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                            if (block_crc32 != *block_csums)
                            {
                                printf(
-                                    "Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - block %u crc32 %x != %x\n",
+                                    "Journal entry data is corrupt for small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u - block %u crc32 %x != %x\n",
                                    je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
                                    je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
                                    je->small_write.offset, je->small_write.len,
@ -956,7 +958,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
                    printf(
-                        "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+                        "journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
                        proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
                    );
 #endif
@ -972,7 +974,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            {
 #ifdef BLOCKSTORE_DEBUG
                printf(
-                    "je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
+                    "je_big_write%s oid=%jx:%jx ver=%ju loc=%ju\n",
                    je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->dsk.block_order
                );
@ -1049,7 +1051,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    {
 #ifdef BLOCKSTORE_DEBUG
                        printf(
-                            "Allocate block (journal) %lu: %lx:%lx v%lu\n",
+                            "Allocate block (journal) %ju: %jx:%jx v%ju\n",
                            je->big_write.location >> bs->dsk.block_order,
                            ov.oid.inode, ov.oid.stripe, ov.version
                        );
@ -1059,7 +1061,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
                    bs->journal.used_sectors[proc_pos]++;
 #ifdef BLOCKSTORE_DEBUG
                    printf(
-                        "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+                        "journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
                        proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
                    );
 #endif
@ -1074,7 +1076,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            else if (je->type == JE_STABLE)
            {
 #ifdef BLOCKSTORE_DEBUG
-                printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
+                printf("je_stable oid=%jx:%jx ver=%ju\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
 #endif
                // oid, version
                obj_ver_id ov = {
@ -1086,7 +1088,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            else if (je->type == JE_ROLLBACK)
            {
 #ifdef BLOCKSTORE_DEBUG
-                printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
+                printf("je_rollback oid=%jx:%jx ver=%ju\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
 #endif
                // rollback dirty writes of <oid> up to <version>
                obj_ver_id ov = {
@ -1098,7 +1100,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
            else if (je->type == JE_DELETE)
            {
 #ifdef BLOCKSTORE_DEBUG
-                printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
+                printf("je_delete oid=%jx:%jx ver=%ju\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
 #endif
                bool dirty_exists = false;
                auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
--- a/src/blockstore_journal.cpp
+++ b/src/blockstore_journal.cpp
@ -90,8 +90,8 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
            }
            // In fact, it's even more rare than "ran out of journal space", so print a warning
            printf(
-                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%ld)"
-                " is %s and flushed %lu times. Consider increasing \'journal_sector_buffer_count\'\n",
+                "Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
+                " is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
                used, bs->journal.sector_count, dirty, next_sector,
                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
                bs->journal.sector_info[next_sector].flush_count
@ -103,7 +103,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    if (data_after > 0)
    {
        next_pos = next_pos + data_after;
-        if (next_pos > bs->journal.len)
+        if (next_pos >= bs->journal.len)
        {
            if (right_dir)
                next_pos = bs->journal.block_size + data_after;
@ -114,7 +114,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
    {
        // No space in the journal. Wait until used_start changes.
        printf(
-            "Ran out of journal space (used_start=%08lx, next_free=%08lx, dirty_start=%08lx)\n",
+            "Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
            bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
        );
        PRIV(op)->wait_for = WAIT_JOURNAL;
@ -146,7 +146,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
        journal.in_sector_pos = 0;
        auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        memset(journal.inmemory
            ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
@ -183,7 +183,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
            (journal.inmemory
                ? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
                : (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
-            journal.block_size
+            (size_t)journal.block_size
        };
        data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
        my_uring_prep_writev(
@ -263,7 +263,7 @@ uint64_t journal_t::get_trim_pos()
            // next_free does not need updating during trim
 #ifdef BLOCKSTORE_DEBUG
            printf(
-                "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
+                "Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
                used_start, next_free, dirty_start,
                journal_used_it->first, journal_used_it->second
            );
@ -276,7 +276,7 @@ uint64_t journal_t::get_trim_pos()
        // Journal is cleared up to <journal_used_it>
 #ifdef BLOCKSTORE_DEBUG
        printf(
-            "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
+            "Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
            used_start, next_free, dirty_start,
            journal_used_it->first, journal_used_it->second
        );
@ -296,7 +296,7 @@ void journal_t::dump_diagnostics()
        journal_used_it = used_sectors.begin();
    }
    printf(
-        "Journal: used_start=%08lx next_free=%08lx dirty_start=%08lx trim_to=%08lx trim_to_refs=%ld\n",
+        "Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
        used_start, next_free, dirty_start,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
        journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
--- a/src/blockstore_open.cpp
+++ b/src/blockstore_open.cpp
@ -13,6 +13,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
        max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
    }
    min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
+    journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
    max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
    throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
    throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
@ -31,6 +32,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
    {
        min_flusher_count = 1;
    }
+    if (!journal_trim_interval)
+    {
+        journal_trim_interval = 512;
+    }
    if (!max_write_iodepth)
    {
        max_write_iodepth = 128;
--- a/src/blockstore_read.cpp
+++ b/src/blockstore_read.cpp
@ -25,7 +25,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
        return 1;
    }
    BS_SUBMIT_GET_SQE(sqe, data);
-    data->iov = (struct iovec){ buf, len };
+    data->iov = (struct iovec){ buf, (size_t)len };
    PRIV(op)->pending_ops++;
    my_uring_prep_readv(
        sqe,
@ -505,7 +505,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
        for (auto & rv: PRIV(read_op)->read_vec)
        {
            if (rv.journal_sector)
-                journal.used_sectors[rv.journal_sector-1]++;
+                journal.used_sectors.at(rv.journal_sector-1)++;
        }
    }
    read_op->retval = 0;
@ -700,7 +700,7 @@ uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t
        .buf = buf,
    });
    BS_SUBMIT_GET_SQE(sqe, data);
-    data->iov = (struct iovec){ buf, dsk.meta_block_size };
+    data->iov = (struct iovec){ buf, (size_t)dsk.meta_block_size };
    PRIV(op)->pending_ops++;
    my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
    data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
@ -855,7 +855,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
                            {
                                ok = false;
                                printf(
-                                    "Checksum mismatch in object %lx:%lx v%lu in journal at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
+                                    "Checksum mismatch in object %jx:%jx v%ju in journal at 0x%jx, checksum block #%u: got %08x, expected %08x\n",
                                    op->oid.inode, op->oid.stripe, op->version,
                                    rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
                                );
@ -875,7 +875,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
                                {
                                    ok = false;
                                    printf(
-                                        "Checksum mismatch in object %lx:%lx v%lu in %s data at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
+                                        "Checksum mismatch in object %jx:%jx v%ju in %s data at 0x%jx, checksum block #%u: got %08x, expected %08x\n",
                                        op->oid.inode, op->oid.stripe, op->version,
                                        (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
                                        rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
@ -918,7 +918,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
                            {
                                // checksum error
                                printf(
-                                    "Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n",
+                                    "Checksum mismatch in object %jx:%jx v%ju in %s area at offset 0x%jx+0x%zx: %08x vs %08x\n",
                                    op->oid.inode, op->oid.stripe, op->version,
                                    (vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
                                    crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
@ -966,7 +966,7 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
            {
                if (rv.journal_sector)
                {
-                    auto used = --journal.used_sectors[rv.journal_sector-1];
+                    auto used = --journal.used_sectors.at(rv.journal_sector-1);
                    if (used == 0)
                    {
                        journal.used_sectors.erase(rv.journal_sector-1);
--- a/src/blockstore_rollback.cpp
+++ b/src/blockstore_rollback.cpp
@ -162,7 +162,6 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
                    unstable_writes.erase(unstab_it);
                else
                    unstab_it->second = max_unstable;
-                unstable_count_changed = true;
            }
        }
    }
@ -180,7 +179,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
    {
        object_id oid = dirty_it->first.oid;
 #ifdef BLOCKSTORE_DEBUG
-        printf("Unblock writes-after-delete %lx:%lx v%lu\n", oid.inode, oid.stripe, dirty_it->first.version);
+        printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
 #endif
        dirty_it = dirty_end;
        // Unblock operations blocked by delete flushing
@ -211,21 +210,26 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
            dirty_it->second.location != UINT64_MAX)
        {
 #ifdef BLOCKSTORE_DEBUG
-            printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> dsk.block_order,
+            printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
 #endif
            data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
        }
-        auto used = --journal.used_sectors[dirty_it->second.journal_sector];
+        auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
 #ifdef BLOCKSTORE_DEBUG
        printf(
-            "remove usage of journal offset %08lx by %lx:%lx v%lu (%lu refs)\n", dirty_it->second.journal_sector,
+            "remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
            dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
        );
 #endif
        if (used == 0)
        {
            journal.used_sectors.erase(dirty_it->second.journal_sector);
+            if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
+            {
+                // Mark current sector as "full" to select the new one
+                journal.in_sector_pos = dsk.journal_block_size;
+            }
            flusher->mark_trim_possible();
        }
        free_dirty_dyn_data(dirty_it->second);
--- a/src/blockstore_stable.cpp
+++ b/src/blockstore_stable.cpp
@ -298,7 +298,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
            if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
            {
                // No such object version
-                printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
+                printf("Error: %jx:%jx v%ju not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
                return -ENOENT;
            }
            else
@ -307,7 +307,14 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
                return STAB_SPLIT_DONE;
            }
        }
-        else if (IS_IN_FLIGHT(dirty_it->second.state))
+        else if (IS_STABLE(dirty_it->second.state))
+        {
+            // Already stable
+            return STAB_SPLIT_DONE;
+        }
+        while (true)
+        {
+            if (IS_IN_FLIGHT(dirty_it->second.state))
            {
                // Object write is still in progress. Wait until the write request completes
                return STAB_SPLIT_WAIT;
@ -329,13 +336,20 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
            }
            else if (IS_STABLE(dirty_it->second.state))
            {
-            // Already stable
-            return STAB_SPLIT_DONE;
+                break;
            }
-        else
+            // Check previous versions too
+            if (dirty_it == dirty_db.begin())
            {
-            return STAB_SPLIT_TODO;
+                break;
            }
+            dirty_it--;
+            if (dirty_it->first.oid != ov.oid)
+            {
+                break;
+            }
+        }
+        return STAB_SPLIT_TODO;
    });
    if (r != 1)
    {
@ -402,7 +416,7 @@ resume_4:
    {
        // Mark all dirty_db entries up to op->version as stable
 #ifdef BLOCKSTORE_DEBUG
-        printf("Stabilize %lx:%lx v%lu\n", v->oid.inode, v->oid.stripe, v->version);
+        printf("Stabilize %jx:%jx v%ju\n", v->oid.inode, v->oid.stripe, v->version);
 #endif
        mark_stable(*v);
    }
@ -493,7 +507,7 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
            {
                // mark_stable should never be called for in-flight or submitted writes
                printf(
-                    "BUG: Attempt to mark_stable object %lx:%lx v%lu state of which is %x\n",
+                    "BUG: Attempt to mark_stable object %jx:%jx v%ju state of which is %x\n",
                    dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
                    dirty_it->second.state
                );
@ -537,6 +551,5 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
        unstab_it->second <= v.version)
    {
        unstable_writes.erase(unstab_it);
-        unstable_count_changed = true;
    }
 }
--- a/src/blockstore_sync.cpp
+++ b/src/blockstore_sync.cpp
@ -92,8 +92,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
            }
        }
        else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
        {
            return 0;
        }
@ -116,11 +115,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
                journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
                sizeof(journal_entry_big_write) + dyn_size
            );
-            dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
+            auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
+            assert(journal.next_free >= journal.used_start
+                ? (jsec >= journal.used_start && jsec < journal.next_free)
+                : (jsec >= journal.used_start || jsec < journal.next_free));
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
            printf(
-                "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+                "journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
                dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
                journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
            );
@ -174,7 +176,7 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
    for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
    {
 #ifdef BLOCKSTORE_DEBUG
-        printf("Ack sync big %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version);
+        printf("Ack sync big %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
 #endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
@ -202,7 +204,7 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
    for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
    {
 #ifdef BLOCKSTORE_DEBUG
-        printf("Ack sync small %lx:%lx v%lu\n", it->oid.inode, it->oid.stripe, it->version);
+        printf("Ack sync small %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
 #endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
--- a/src/blockstore_write.cpp
+++ b/src/blockstore_write.cpp
@ -85,7 +85,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
            // It's allowed to write versions with low numbers over deletes
            // However, we have to flush those deletes first as we use version number for ordering
 #ifdef BLOCKSTORE_DEBUG
-            printf("Write %lx:%lx v%lu over delete (real v%lu) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
+            printf("Write %jx:%jx v%ju over delete (real v%ju) offset=%u len=%u\n", op->oid.inode, op->oid.stripe, version, op->version, op->offset, op->len);
 #endif
            wait_del = true;
            PRIV(op)->real_version = op->version;
@ -95,11 +95,13 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
                // Issue an additional sync so the delete reaches the journal
                blockstore_op_t *sync_op = new blockstore_op_t;
                sync_op->opcode = BS_OP_SYNC;
-                sync_op->callback = [this, op](blockstore_op_t *sync_op)
+                sync_op->oid = op->oid;
+                sync_op->version = op->version;
+                sync_op->callback = [this](blockstore_op_t *sync_op)
                {
                    flusher->unshift_flush((obj_ver_id){
-                        .oid = op->oid,
-                        .version = op->version-1,
+                        .oid = sync_op->oid,
+                        .version = sync_op->version-1,
                    }, true);
                    delete sync_op;
                };
@ -117,7 +119,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        {
            // Invalid version requested
 #ifdef BLOCKSTORE_DEBUG
-            printf("Write %lx:%lx v%lu requested, but we already have v%lu\n", op->oid.inode, op->oid.stripe, op->version, version);
+            printf("Write %jx:%jx v%ju requested, but we already have v%ju\n", op->oid.inode, op->oid.stripe, op->version, version);
 #endif
            op->retval = -EEXIST;
            if (!is_del && alloc_dyn_data)
@ -144,9 +146,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
        unsynced_queued_ops++;
 #ifdef BLOCKSTORE_DEBUG
    if (is_del)
-        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
+        printf("Delete %jx:%jx v%ju\n", op->oid.inode, op->oid.stripe, op->version);
    else if (!wait_del)
-        printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
+        printf("Write %jx:%jx v%ju offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
 #endif
    // No strict need to add it into dirty_db here except maybe for listings to return
    // correct data when there are inflight operations in the queue
@ -286,7 +288,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        }
        // Restore original low version number for unblocked operations
 #ifdef BLOCKSTORE_DEBUG
-        printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
+        printf("Restoring %jx:%jx version: v%ju -> v%ju\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
 #endif
        auto prev_it = dirty_it;
        if (prev_it != dirty_db.begin())
@ -296,7 +298,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            {
                // Original version is still invalid
                // All subsequent writes to the same object must be canceled too
-                printf("Tried to write %lx:%lx v%lu after delete (old version v%lu), but already have v%lu\n",
+                printf("Tried to write %jx:%jx v%ju after delete (old version v%ju), but already have v%ju\n",
                    op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
                cancel_all_writes(op, dirty_it, -EEXIST);
                return 2;
@ -320,7 +322,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_write_count + 1,
            sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
        {
            return 0;
        }
@ -348,8 +350,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            if (entry->oid.inode || entry->oid.stripe || entry->version)
            {
                printf(
-                    "Fatal error (metadata corruption or bug): tried to write object %lx:%lx v%lu"
-                    " over a non-zero metadata entry %lu with %lx:%lx v%lu\n", op->oid.inode,
+                    "Fatal error (metadata corruption or bug): tried to write object %jx:%jx v%ju"
+                    " over a non-zero metadata entry %ju with %jx:%jx v%ju\n", op->oid.inode,
                    op->oid.stripe, op->version, loc, entry->oid.inode, entry->oid.stripe, entry->version
                );
                exit(1);
@ -361,7 +363,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
 #ifdef BLOCKSTORE_DEBUG
        printf(
-            "Allocate block %lu for %lx:%lx v%lu\n",
+            "Allocate block %ju for %jx:%jx v%ju\n",
            loc, op->oid.inode, op->oid.stripe, op->version
        );
 #endif
@ -372,13 +374,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        int vcnt = 0;
        if (stripe_offset)
        {
-            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
+            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, (size_t)stripe_offset };
        }
        PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
        if (stripe_end)
        {
            stripe_end = dsk.bitmap_granularity - stripe_end;
-            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
+            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, (size_t)stripe_end };
        }
        data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
        data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
@ -412,7 +414,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
            || !space_check.check_available(op, 1,
                sizeof(journal_entry_small_write) + dyn_size,
-                op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+                op->len + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
        {
            return 0;
        }
@ -436,11 +438,23 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
            sizeof(journal_entry_small_write) + dyn_size
        );
-        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        if (!(journal.next_free >= journal.used_start
+            ? (jsec >= journal.used_start && jsec < journal.next_free)
+            : (jsec >= journal.used_start || jsec < journal.next_free)))
+        {
+            printf(
+                "BUG: journal offset %08jx is used by %jx:%jx v%ju (%ju refs) BUT used_start=%jx next_free=%jx\n",
+                dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+                journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
+                journal.used_start, journal.next_free
+            );
+            abort();
+        }
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
        printf(
-            "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+            "journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
            dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
        );
@ -454,8 +468,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
                journal_used_it->first < next_next_free + op->len)
            {
                printf(
-                    "BUG: Attempt to overwrite used offset (%lx, %lu refs) of the journal with the object %lx:%lx v%lu: data at %lx, len %x!"
-                    " Journal used_start=%08lx (%lu refs), next_free=%08lx, dirty_start=%08lx\n",
+                    "BUG: Attempt to overwrite used offset (%jx, %ju refs) of the journal with the object %jx:%jx v%ju: data at %jx, len %x!"
+                    " Journal used_start=%08jx (%ju refs), next_free=%08jx, dirty_start=%08jx\n",
                    journal_used_it->first, journal_used_it->second, op->oid.inode, op->oid.stripe, op->version, next_next_free, op->len,
                    journal.used_start, journal.used_sectors[journal.used_start], journal.next_free, journal.dirty_start
                );
@ -463,7 +477,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
            }
        }
        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        je->oid = op->oid;
        je->version = op->version;
@ -505,7 +519,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
        if (next_next_free >= journal.len)
            next_next_free = dsk.journal_block_size;
        // double check that next_free doesn't cross used_start from the left
-        assert(journal.next_free >= journal.used_start || next_next_free < journal.used_start);
+        assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
        journal.next_free = next_next_free;
        if (!(dirty_it->second.state & BS_ST_INSTANT))
        {
@ -549,7 +563,7 @@ resume_2:
        uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-            (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+            (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size))
        {
            return 0;
        }
@ -558,11 +572,23 @@ resume_2:
            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
            sizeof(journal_entry_big_write) + dyn_size
        );
-        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        auto jsec = dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        if (!(journal.next_free >= journal.used_start
+            ? (jsec >= journal.used_start && jsec < journal.next_free)
+            : (jsec >= journal.used_start || jsec < journal.next_free)))
+        {
+            printf(
+                "BUG: journal offset %08jx is used by %jx:%jx v%ju (%ju refs) BUT used_start=%jx next_free=%jx\n",
+                dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+                journal.used_sectors[journal.sector_info[journal.cur_sector].offset],
+                journal.used_start, journal.next_free
+            );
+            abort();
+        }
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
        printf(
-            "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+            "journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
            journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
        );
@ -589,7 +615,7 @@ resume_4:
        });
        assert(dirty_it != dirty_db.end());
 #ifdef BLOCKSTORE_DEBUG
-        printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+        printf("Ack write %jx:%jx v%ju = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
 #endif
        bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
        bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
@ -782,7 +808,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
 #ifdef BLOCKSTORE_DEBUG
    printf(
-        "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+        "journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
        dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
    );
--- a/src/cli_alloc_osd.cpp
+++ b/src/cli_alloc_osd.cpp
@ -77,7 +77,7 @@ struct alloc_osd_t
                    std::string key = base64_decode(kv["key"].string_value());
                    osd_num_t cur_osd;
                    char null_byte = 0;
-                    int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%lu%c", &cur_osd, &null_byte);
+                    int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%ju%c", &cur_osd, &null_byte);
                    if (scanned != 1 || !cur_osd)
                    {
                        fprintf(stderr, "Invalid key in etcd: %s\n", key.c_str());
--- a/src/cli_common.cpp
+++ b/src/cli_common.cpp
@ -11,7 +11,7 @@ void cli_tool_t::change_parent(inode_t cur, inode_t new_parent, cli_result_t *re
    if (cur_cfg_it == cli->st_cli.inode_config.end())
    {
        char buf[128];
-        snprintf(buf, 128, "Inode 0x%lx disappeared", cur);
+        snprintf(buf, 128, "Inode 0x%jx disappeared", cur);
        *result = (cli_result_t){ .err = EIO, .text = buf };
        return;
    }
--- a/src/cli_describe.cpp
+++ b/src/cli_describe.cpp
@ -160,14 +160,14 @@ struct cli_describe_t
                if (op->reply.hdr.retval < 0)
                {
                    fprintf(
-                        stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n",
+                        stderr, "Failed to describe objects on OSD %ju (retval=%jd)\n",
                        osd_num, op->reply.hdr.retval
                    );
                }
                else if (op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
                {
                    fprintf(
-                        stderr, "Invalid response size from OSD %lu (expected %lu bytes, got %lu bytes)\n",
+                        stderr, "Invalid response size from OSD %ju (expected %ju bytes, got %ju bytes)\n",
                        osd_num, op->reply.hdr.retval * sizeof(osd_reply_describe_item_t), op->reply.describe.result_bytes
                    );
                }
@ -178,11 +178,11 @@ struct cli_describe_t
                    {
                        if (!parent->json_output || parent->is_command_line)
                        {
-#define FMT "{\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"part\":%u,\"osd_num\":%lu%s%s%s}"
+#define FMT "{\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"part\":%u,\"osd_num\":%ju%s%s%s}"
                            printf(
                                (parent->json_output
                                    ? (count > 0 ? ",\n  " FMT : "  " FMT)
-                                    : "%lx:%lx part %u on OSD %lu%s%s%s\n"),
+                                    : "%jx:%jx part %u on OSD %ju%s%s%s\n"),
 #undef FMT
                                items[i].inode, items[i].stripe,
                                items[i].role, items[i].osd_num,
--- a/src/cli_df.cpp
+++ b/src/cli_df.cpp
@ -82,7 +82,7 @@ resume_1:
            // osd ID
            osd_num_t osd_num;
            char null_byte = 0;
-            int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%lu%c", &osd_num, &null_byte);
+            int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%ju%c", &osd_num, &null_byte);
            if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
--- a/src/cli_fix.cpp
+++ b/src/cli_fix.cpp
@ -136,7 +136,7 @@ struct cli_fix_t
            auto pool_cfg_it = parent->cli->st_cli.pool_config.find(INODE_POOL(obj.inode));
            if (pool_cfg_it == parent->cli->st_cli.pool_config.end())
            {
-                fprintf(stderr, "Object %lx:%lx is from unknown pool\n", obj.inode, obj.stripe);
+                fprintf(stderr, "Object %jx:%jx is from unknown pool\n", obj.inode, obj.stripe);
                continue;
            }
            auto & pool_cfg = pool_cfg_it->second;
@ -146,7 +146,7 @@ struct cli_fix_t
                !pg_it->second.cur_primary || !(pg_it->second.cur_state & PG_ACTIVE))
            {
                fprintf(
-                    stderr, "Object %lx:%lx is from PG %u/%u which is not currently active\n",
+                    stderr, "Object %jx:%jx is from PG %u/%u which is not currently active\n",
                    obj.inode, obj.stripe, pool_cfg_it->first, pg_num
                );
                continue;
@ -171,7 +171,7 @@ struct cli_fix_t
            {
                if (op->reply.hdr.retval < 0 || op->reply.describe.result_bytes != op->reply.hdr.retval * sizeof(osd_reply_describe_item_t))
                {
-                    fprintf(stderr, "Failed to describe objects on OSD %lu (retval=%ld)\n", primary_osd, op->reply.hdr.retval);
+                    fprintf(stderr, "Failed to describe objects on OSD %ju (retval=%jd)\n", primary_osd, op->reply.hdr.retval);
                    parent->waiting--;
                    loop();
                }
@ -209,7 +209,7 @@ struct cli_fix_t
                                if (rm_op->reply.hdr.retval < 0)
                                {
                                    fprintf(
-                                        stderr, "Failed to remove object %lx:%lx from OSD %lu (retval=%ld)\n",
+                                        stderr, "Failed to remove object %jx:%jx from OSD %ju (retval=%jd)\n",
                                        rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe,
                                        rm_osd_num, rm_op->reply.hdr.retval
                                    );
@ -226,7 +226,7 @@ struct cli_fix_t
                                else
                                {
                                    printf(
-                                        "Removed %lx:%lx (part %lu) from OSD %lu\n",
+                                        "Removed %jx:%jx (part %ju) from OSD %ju\n",
                                        rm_op->req.sec_del.oid.inode, rm_op->req.sec_del.oid.stripe & ~STRIPE_MASK,
                                        rm_op->req.sec_del.oid.stripe & STRIPE_MASK, rm_osd_num
                                    );
@ -254,7 +254,7 @@ struct cli_fix_t
                                        if (scrub_op->reply.hdr.retval < 0 && scrub_op->reply.hdr.retval != -ENOENT)
                                        {
                                            fprintf(
-                                                stderr, "Failed to scrub %lx:%lx on OSD %lu (retval=%ld)\n",
+                                                stderr, "Failed to scrub %jx:%jx on OSD %ju (retval=%jd)\n",
                                                obj.inode, obj.stripe, primary_osd, scrub_op->reply.hdr.retval
                                            );
                                        }
--- a/src/cli_ls.cpp
+++ b/src/cli_ls.cpp
@ -150,7 +150,7 @@ resume_1:
            inode_t only_inode_num;
            char null_byte = 0;
            int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
-                "/inode/stats/%u/%lu%c", &pool_id, &only_inode_num, &null_byte);
+                "/inode/stats/%u/%ju%c", &pool_id, &only_inode_num, &null_byte);
            if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
@ -456,7 +456,7 @@ std::string format_lat(uint64_t lat)
    char buf[256];
    int l = 0;
    if (lat < 100)
-        l = snprintf(buf, sizeof(buf), "%lu us", lat);
+        l = snprintf(buf, sizeof(buf), "%ju us", lat);
    else if (lat < 500000)
        l = snprintf(buf, sizeof(buf), "%.2f ms", (double)lat/1000);
    else
--- a/src/cli_merge.cpp
+++ b/src/cli_merge.cpp
@ -202,7 +202,7 @@ struct snap_merger_t
        if (parent->progress)
        {
            printf(
-                "Merging %ld layer(s) into target %s%s (inode %lu in pool %u)\n",
+                "Merging %zd layer(s) into target %s%s (inode %ju in pool %u)\n",
                sources.size(), target_cfg->name.c_str(),
                use_cas ? " online (with CAS)" : "", INODE_NO_POOL(target), INODE_POOL(target)
            );
@ -275,7 +275,7 @@ struct snap_merger_t
                processed++;
                if (parent->progress && !(processed % 128))
                {
-                    printf("\rFiltering target blocks: %lu/%lu", processed, to_process);
+                    printf("\rFiltering target blocks: %ju/%ju", processed, to_process);
                }
            }
            if (in_flight > 0 || oit != merge_offsets.end())
@ -285,7 +285,7 @@ struct snap_merger_t
            }
            if (parent->progress)
            {
-                printf("\r%lu full blocks of target filtered out\n", to_process-merge_offsets.size());
+                printf("\r%ju full blocks of target filtered out\n", to_process-merge_offsets.size());
            }
        }
        state = 3;
@ -320,7 +320,7 @@ struct snap_merger_t
            processed++;
            if (parent->progress && !(processed % 128))
            {
-                printf("\rOverwriting blocks: %lu/%lu", processed, to_process);
+                printf("\rOverwriting blocks: %ju/%ju", processed, to_process);
            }
        }
        if (in_flight == 0 && rwo_error.size())
@ -339,7 +339,7 @@ struct snap_merger_t
        }
        if (parent->progress)
        {
-            printf("\rOverwriting blocks: %lu/%lu\n", to_process, to_process);
+            printf("\rOverwriting blocks: %ju/%ju\n", to_process, to_process);
        }
        // Done
        result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
@ -384,7 +384,7 @@ struct snap_merger_t
                        auto & name = parent->cli->st_cli.inode_config.at(src).name;
                        if (parent->progress)
                        {
-                            printf("Got listing of layer %s (inode %lu in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
+                            printf("Got listing of layer %s (inode %ju in pool %u)\n", name.c_str(), INODE_NO_POOL(src), INODE_POOL(src));
                        }
                        if (delete_source)
                        {
@ -416,7 +416,7 @@ struct snap_merger_t
        {
            if (op->retval < 0)
            {
-                fprintf(stderr, "error reading target bitmap at offset %lx: %s\n", op->offset, strerror(-op->retval));
+                fprintf(stderr, "error reading target bitmap at offset %jx: %s\n", op->offset, strerror(-op->retval));
            }
            else
            {
@ -571,7 +571,7 @@ struct snap_merger_t
        {
            if (subop->retval != 0)
            {
-                fprintf(stderr, "error deleting from layer 0x%lx at offset %lx: %s", subop->inode, subop->offset, strerror(-subop->retval));
+                fprintf(stderr, "error deleting from layer 0x%jx at offset %jx: %s", subop->inode, subop->offset, strerror(-subop->retval));
            }
            delete subop;
        };
@ -620,7 +620,7 @@ struct snap_merger_t
            if (rwo->error_code)
            {
                char buf[1024];
-                snprintf(buf, 1024, "Error %s target at offset %lx: %s",
+                snprintf(buf, 1024, "Error %s target at offset %jx: %s",
                    rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
                rwo_error = std::string(buf);
            }
--- a/src/cli_rm.cpp
+++ b/src/cli_rm.cpp
@ -291,7 +291,7 @@ resume_100:
            if (it == parent->cli->st_cli.inode_config.end())
            {
                char buf[1024];
-                snprintf(buf, 1024, "Parent inode of layer %s (id 0x%lx) not found", cur->name.c_str(), cur->parent_id);
+                snprintf(buf, 1024, "Parent inode of layer %s (id 0x%jx) not found", cur->name.c_str(), cur->parent_id);
                state = 100;
                return;
            }
@ -384,7 +384,7 @@ resume_100:
                pool_id_t pool_id = 0;
                inode_t inode = 0;
                char null_byte = 0;
-                int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode, &null_byte);
+                int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.length()+13, "%u/%ju%c", &pool_id, &inode, &null_byte);
                if (scanned != 2 || !inode)
                {
                    result = (cli_result_t){ .err = EIO, .text = "Bad key returned from etcd: "+kv.key };
@ -439,7 +439,7 @@ resume_100:
        if (child_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_child);
+            snprintf(buf, 1024, "Inode 0x%jx disappeared", inverse_child);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
@ -448,7 +448,7 @@ resume_100:
        if (target_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", inverse_parent);
+            snprintf(buf, 1024, "Inode 0x%jx disappeared", inverse_parent);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
@ -576,7 +576,7 @@ resume_100:
        if (cur_cfg_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", cur);
+            snprintf(buf, 1024, "Inode 0x%jx disappeared", cur);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
@ -640,7 +640,7 @@ resume_100:
        if (child_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", child_inode);
+            snprintf(buf, 1024, "Inode 0x%jx disappeared", child_inode);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
@ -649,7 +649,7 @@ resume_100:
        if (target_it == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", target_inode);
+            snprintf(buf, 1024, "Inode 0x%jx disappeared", target_inode);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
@ -670,7 +670,7 @@ resume_100:
        if (source == parent->cli->st_cli.inode_config.end())
        {
            char buf[1024];
-            snprintf(buf, 1024, "Inode 0x%lx disappeared", inode);
+            snprintf(buf, 1024, "Inode 0x%jx disappeared", inode);
            result = (cli_result_t){ .err = EIO, .text = std::string(buf) };
            state = 100;
            return;
--- a/src/cli_rm_data.cpp
+++ b/src/cli_rm_data.cpp
@ -95,7 +95,7 @@ struct rm_inode_t
            fprintf(stderr, "Some data may remain after delete on OSDs which are currently down: ");
            for (int i = 0; i < inactive_osds.size(); i++)
            {
-                fprintf(stderr, i > 0 ? ", %lu" : "%lu", inactive_osds[i]);
+                fprintf(stderr, i > 0 ? ", %ju" : "%ju", inactive_osds[i]);
            }
            fprintf(stderr, "\n");
        }
@ -138,7 +138,7 @@ struct rm_inode_t
                    cur_list->in_flight--;
                    if (op->reply.hdr.retval < 0)
                    {
-                        fprintf(stderr, "Failed to remove object %lx:%lx from PG %u (OSD %lu) (retval=%ld)\n",
+                        fprintf(stderr, "Failed to remove object %jx:%jx from PG %u (OSD %ju) (retval=%jd)\n",
                            op->req.rw.inode, op->req.rw.offset,
                            cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
                        error_count++;
@ -174,7 +174,7 @@ struct rm_inode_t
                cur_list->synced = true;
                if (op->reply.hdr.retval < 0)
                {
-                    fprintf(stderr, "Failed to sync OSD %lu (retval=%ld)\n",
+                    fprintf(stderr, "Failed to sync OSD %ju (retval=%jd)\n",
                        cur_list->rm_osd_num, op->reply.hdr.retval);
                    error_count++;
                }
@ -212,7 +212,7 @@ struct rm_inode_t
        }
        if (parent->progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
        {
-            fprintf(stderr, "\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
+            fprintf(stderr, "\rRemoved %ju/%ju objects, %ju more PGs to list...", total_done, total_count, pgs_to_list);
            total_prev_pct = total_done*1000/total_count;
        }
        if (lists_done && !lists.size())
@ -224,8 +224,8 @@ struct rm_inode_t
            if (parent->progress && (total_done < total_count || inactive_osds.size() > 0 || error_count > 0))
            {
                fprintf(
-                    stderr, "Warning: Pool:%u,ID:%lu inode data may not have been fully removed.\n"
-                    " Use `vitastor-cli rm-data --pool %u --inode %lu` if you encounter it in listings.\n",
+                    stderr, "Warning: Pool:%u,ID:%ju inode data may not have been fully removed.\n"
+                    " Use `vitastor-cli rm-data --pool %u --inode %ju` if you encounter it in listings.\n",
                    pool_id, INODE_NO_POOL(inode), pool_id, INODE_NO_POOL(inode)
                );
            }
--- a/src/cli_status.cpp
+++ b/src/cli_status.cpp
@ -106,7 +106,7 @@ resume_2:
            if (etcd_states[i]["error"].is_null())
            {
                etcd_alive++;
-                etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value();
+                etcd_db_size = etcd_states[i]["dbSize"].uint64_value();
            }
        }
        int mon_count = 0;
@ -132,7 +132,7 @@ resume_2:
            auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
            osd_num_t stat_osd_num = 0;
            char null_byte = 0;
-            int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%lu%c", &stat_osd_num, &null_byte);
+            int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%ju%c", &stat_osd_num, &null_byte);
            if (scanned != 1 || !stat_osd_num)
            {
                fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
@ -283,7 +283,7 @@ resume_2:
        }
        printf(
            "  cluster:\n"
-            "    etcd: %d / %ld up, %s database size\n"
+            "    etcd: %d / %zd up, %s database size\n"
            "    mon:  %d up%s\n"
            "    osd:  %d / %d up\n"
            "  \n"
--- a/src/cluster_client.cpp
+++ b/src/cluster_client.cpp
@ -6,7 +6,7 @@
 #include "cluster_client_impl.h"
 #include "http_client.h" // json_is_true

-cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
+cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
 {
    wb = new writeback_cache_t();

@ -534,7 +534,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
        return;
    }
    if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
-        !op->version /* FIXME no CAS writeback */)
+        !op->version /* no CAS writeback */)
    {
        if (wb->writebacks_active >= client_max_writeback_iodepth)
        {
@ -555,7 +555,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
    }
    if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
    {
-        if (!(op->flags & OP_FLUSH_BUFFER))
+        if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
        {
            wb->copy_write(op, CACHE_WRITTEN);
        }
@ -1156,15 +1156,15 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
            if (op->retval != -EPIPE || log_level > 0)
            {
                fprintf(
-                    stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
+                    stderr, "%s operation failed on OSD %ju: retval=%jd (expected %d), dropping connection\n",
                    osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
                );
            }
        }
-        else
+        else if (log_level > 0)
        {
            fprintf(
-                stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
+                stderr, "%s operation failed on OSD %ju: retval=%jd (expected %d)\n",
                osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
            );
        }
--- a/src/cluster_client.h
+++ b/src/cluster_client.h
@ -121,7 +121,7 @@ public:
    json11::Json::object cli_config, file_config, etcd_global_config;
    json11::Json::object config;

-    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
+    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config);
    ~cluster_client_t();
    void execute(cluster_op_t *op);
    void execute_raw(osd_num_t osd_num, osd_op_t *op);
--- a/src/cluster_client_list.cpp
+++ b/src/cluster_client_list.cpp
@ -226,7 +226,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
    {
        if (op->reply.hdr.retval < 0)
        {
-            fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
+            fprintf(stderr, "Failed to get PG %u/%u object list from OSD %ju (retval=%jd), skipping\n",
                cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
        }
        else
@ -236,7 +236,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
                // Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
                cur_list->pg->has_unstable = true;
                fprintf(
-                    stderr, "[PG %u/%u] Inode still has %lu unstable object versions out of total %lu - is it still open?\n",
+                    stderr, "[PG %u/%u] Inode still has %ju unstable object versions out of total %ju - is it still open?\n",
                    cur_list->pg->lst->pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count,
                    op->reply.hdr.retval
                );
@ -244,7 +244,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
            if (log_level > 0)
            {
                fprintf(
-                    stderr, "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
+                    stderr, "[PG %u/%u] Got inode object list from OSD %ju: %jd object versions\n",
                    cur_list->pg->lst->pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
                );
            }
--- a/src/disk_simple_offsets.cpp
+++ b/src/disk_simple_offsets.cpp
@ -47,7 +47,7 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
    if (!bitmap_granularity)
        bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
    if (!journal_size)
-        journal_size = 16*1024*1024;
+        journal_size = 32*1024*1024;
    if (!device_block_size)
        device_block_size = 4096;
    if (!data_csum_type)
@ -75,9 +75,9 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
            if (st.st_blksize < device_block_size)
            {
                fprintf(
-                    stderr, "Warning: %s reports %lu byte blocks, but we use %lu."
-                    " Set --device_block_size=%lu if you're sure it works well with %lu byte blocks.\n",
-                    device.c_str(), st.st_blksize, device_block_size, st.st_blksize, st.st_blksize
+                    stderr, "Warning: %s reports %ju byte blocks, but we use %ju."
+                    " Set --device_block_size=%ju if you're sure it works well with %ju byte blocks.\n",
+                    device.c_str(), (uint64_t)st.st_blksize, device_block_size, (uint64_t)st.st_blksize, (uint64_t)st.st_blksize
                );
            }
        }
@ -99,19 +99,19 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
    if (device_block_size < 512 || device_block_size > 1048576 ||
        device_block_size & (device_block_size-1) != 0)
    {
-        fprintf(stderr, "Invalid device block size specified: %lu\n", device_block_size);
+        fprintf(stderr, "Invalid device block size specified: %ju\n", device_block_size);
        exit(1);
    }
    if (data_block_size < device_block_size || data_block_size > MAX_DATA_BLOCK_SIZE ||
        data_block_size & (data_block_size-1) != 0)
    {
-        fprintf(stderr, "Invalid object size specified: %lu\n", data_block_size);
+        fprintf(stderr, "Invalid object size specified: %ju\n", data_block_size);
        exit(1);
    }
    if (bitmap_granularity < device_block_size || bitmap_granularity > data_block_size ||
        bitmap_granularity & (bitmap_granularity-1) != 0)
    {
-        fprintf(stderr, "Invalid bitmap granularity specified: %lu\n", bitmap_granularity);
+        fprintf(stderr, "Invalid bitmap granularity specified: %ju\n", bitmap_granularity);
        exit(1);
    }
    if (csum_block_size && (data_block_size % csum_block_size))
@ -145,8 +145,8 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
    {
        // Env
        printf(
-            "meta_block_size=%lu\njournal_block_size=%lu\ndata_size=%lu\n"
-            "data_device=%s\njournal_offset=%lu\nmeta_offset=%lu\ndata_offset=%lu\n",
+            "meta_block_size=%ju\njournal_block_size=%ju\ndata_size=%ju\n"
+            "data_device=%s\njournal_offset=%ju\nmeta_offset=%ju\ndata_offset=%ju\n",
            device_block_size, device_block_size, device_size-data_offset,
            device.c_str(), journal_offset, meta_offset, data_offset
        );
@ -160,14 +160,14 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output)
        }
        if (device_block_size != 4096)
        {
-            printf("--meta_block_size %lu\n--journal_block_size %lu\n", device_block_size, device_block_size);
+            printf("--meta_block_size %ju\n--journal_block_size %ju\n", device_block_size, device_block_size);
        }
        if (orig_device_size)
        {
-            printf("--data_size %lu\n", device_size-data_offset);
+            printf("--data_size %ju\n", device_size-data_offset);
        }
        printf(
-            "--data_device %s\n--journal_offset %lu\n--meta_offset %lu\n--data_offset %lu\n",
+            "--data_device %s\n--journal_offset %ju\n--meta_offset %ju\n--data_offset %ju\n",
            device.c_str(), journal_offset, meta_offset, data_offset
        );
    }
--- a/src/disk_tool.cpp
+++ b/src/disk_tool.cpp
@ -167,7 +167,7 @@ static const char *help_text =
    "  Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n"
    "    --object_size 128k       Set blockstore block size\n"
    "    --bitmap_granularity 4k  Set bitmap granularity\n"
-    "    --journal_size 16M       Set journal size\n"
+    "    --journal_size 32M       Set journal size\n"
    "    --data_csum_type none    Set data checksum type (crc32c or none)\n"
    "    --csum_block_size 4k     Set data checksum block size\n"
    "    --device_block_size 4k   Set device block size\n"
--- a/src/disk_tool.h
+++ b/src/disk_tool.h
@ -4,7 +4,7 @@
 #pragma once

 #ifndef _LARGEFILE64_SOURCE
-#define _LARGEFILE64_SOURCE 1
+#define _LARGEFILE64_SOURCE
 #endif

 #include <map>
--- a/src/disk_tool_journal.cpp
+++ b/src/disk_tool_journal.cpp
@ -38,7 +38,7 @@ int disk_tool_t::dump_journal()
            }
            if (json)
            {
-                printf("%s{\"offset\":\"0x%lx\"", first_block ? "" : ",\n", journal_pos);
+                printf("%s{\"offset\":\"0x%jx\"", first_block ? "" : ",\n", journal_pos);
                first_block = false;
            }
            if (s == dsk.journal_block_size)
@ -46,13 +46,13 @@ int disk_tool_t::dump_journal()
                if (json)
                    printf(",\"type\":\"zero\"}");
                else
-                    printf("offset %08lx: zeroes\n", journal_pos);
+                    printf("offset %08jx: zeroes\n", journal_pos);
                journal_pos += dsk.journal_block_size;
            }
            else if (((journal_entry*)journal_buf)->magic == JOURNAL_MAGIC)
            {
                if (!json)
-                    printf("offset %08lx:\n", journal_pos);
+                    printf("offset %08jx:\n", journal_pos);
                else
                    printf(",\"entries\":[\n");
                if (journal_pos == 0)
@ -80,9 +80,9 @@ int disk_tool_t::dump_journal()
            else
            {
                if (json)
-                    printf(",\"type\":\"data\",\"pattern\":\"%08lx\"}", *((uint64_t*)journal_buf));
+                    printf(",\"type\":\"data\",\"pattern\":\"%08jx\"}", *((uint64_t*)journal_buf));
                else
-                    printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%08lx)\n", journal_pos, *((uint64_t*)journal_buf));
+                    printf("offset %08jx: no magic in the beginning, looks like random data (pattern=%08jx)\n", journal_pos, *((uint64_t*)journal_buf));
                journal_pos += dsk.journal_block_size;
            }
        }
@ -98,12 +98,12 @@ int disk_tool_t::dump_journal()
            if (json && dump_with_blocks)
                first_entry = true;
            if (!json)
-                printf("offset %08lx:\n", journal_pos);
+                printf("offset %08jx:\n", journal_pos);
            auto pos = journal_pos;
            int r = process_journal_block(data, [this, pos](int num, journal_entry *je)
            {
                if (json && dump_with_blocks && first_entry)
-                    printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first_block ? "" : ",\n", pos);
+                    printf("%s{\"offset\":\"0x%jx\",\"entries\":[\n", first_block ? "" : ",\n", pos);
                dump_journal_entry(num, je, json);
                first_block = false;
            });
@ -134,12 +134,12 @@ int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
    journal_entry *je = (journal_entry*)(data);
    if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
    {
-        fprintf(stderr, "offset %08lx: journal superblock is invalid\n", journal_pos);
+        fprintf(stderr, "offset %08jx: journal superblock is invalid\n", journal_pos);
        r = 1;
    }
    else if (je->start.size != JE_START_V0_SIZE && je->start.version != JOURNAL_VERSION_V1 && je->start.version != JOURNAL_VERSION_V2)
    {
-        fprintf(stderr, "offset %08lx: journal superblock contains version %lu, but I only understand 0, 1 and 2\n",
+        fprintf(stderr, "offset %08jx: journal superblock contains version %ju, but I only understand 0, 1 and 2\n",
            journal_pos, je->start.size == JE_START_V0_SIZE ? 0 : je->start.version);
        r = 1;
    }
@ -296,7 +296,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
    if (je->type == JE_START)
    {
        printf(
-            json ? ",\"type\":\"start\",\"start\":\"0x%lx\"" : "je_start start=%08lx",
+            json ? ",\"type\":\"start\",\"start\":\"0x%jx\"" : "je_start start=%08jx",
            je->start.journal_start
        );
        if (je->start.data_csum_type)
@ -312,15 +312,15 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
    {
        auto & sw = je->small_write;
        printf(
-            json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
-                : "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
+            json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%jx\""
+                : "je_small_write%s oid=%jx:%jx ver=%ju offset=%u len=%u loc=%08jx",
            je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
            sw.oid.inode, sw.oid.stripe, sw.version, sw.offset, sw.len, sw.data_offset
        );
        if (journal_calc_data_pos != sw.data_offset)
        {
-            printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
-                : " (mismatched, calculated = %08lx)", journal_pos);
+            printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%jx\""
+                : " (mismatched, calculated = %08jx)", journal_pos);
        }
        uint32_t data_csum_size = (!je_start.csum_block_size
            ? 0
@ -367,8 +367,8 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
    {
        auto & bw = je->big_write;
        printf(
-            json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
-                : "je_big_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
+            json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%jx\""
+                : "je_big_write%s oid=%jx:%jx ver=%ju offset=%u len=%u loc=%08jx",
            je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
            bw.oid.inode, bw.oid.stripe, bw.version, bw.offset, bw.len, bw.location
        );
@ -398,24 +398,24 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
    else if (je->type == JE_STABLE)
    {
        printf(
-            json ? ",\"type\":\"stable\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
-                : "je_stable oid=%lx:%lx ver=%lu\n",
+            json ? ",\"type\":\"stable\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\"}"
+                : "je_stable oid=%jx:%jx ver=%ju\n",
            je->stable.oid.inode, je->stable.oid.stripe, je->stable.version
        );
    }
    else if (je->type == JE_ROLLBACK)
    {
        printf(
-            json ? ",\"type\":\"rollback\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
-                : "je_rollback oid=%lx:%lx ver=%lu\n",
+            json ? ",\"type\":\"rollback\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\"}"
+                : "je_rollback oid=%jx:%jx ver=%ju\n",
            je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version
        );
    }
    else if (je->type == JE_DELETE)
    {
        printf(
-            json ? ",\"type\":\"delete\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
-                : "je_delete oid=%lx:%lx ver=%lu\n",
+            json ? ",\"type\":\"delete\",\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"ver\":\"%ju\"}"
+                : "je_delete oid=%jx:%jx ver=%ju\n",
            je->del.oid.inode, je->del.oid.stripe, je->del.version
        );
    }
--- a/src/disk_tool_meta.cpp
+++ b/src/disk_tool_meta.cpp
@ -54,7 +54,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
        else
        {
            // Unsupported version
-            fprintf(stderr, "Metadata format is too new for me (stored version is %lu, max supported %u).\n", hdr->version, BLOCKSTORE_META_FORMAT_V2);
+            fprintf(stderr, "Metadata format is too new for me (stored version is %ju, max supported %u).\n", hdr->version, BLOCKSTORE_META_FORMAT_V2);
            free(data);
            close(dsk.meta_fd);
            dsk.meta_fd = -1;
@ -108,7 +108,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
                            uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + dsk.clean_entry_size - 4);
                            if (*entry_csum != crc32c(0, entry, dsk.clean_entry_size - 4))
                            {
-                                fprintf(stderr, "Metadata entry %lu is corrupt (checksum mismatch), skipping\n", block_num);
+                                fprintf(stderr, "Metadata entry %ju is corrupt (checksum mismatch), skipping\n", block_num);
                                continue;
                            }
                        }
@ -184,7 +184,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
    }
    else
    {
-        printf("{\"version\":\"0.5\",\"meta_block_size\":%lu,\"entries\":[\n", dsk.meta_block_size);
+        printf("{\"version\":\"0.5\",\"meta_block_size\":%ju,\"entries\":[\n", dsk.meta_block_size);
    }
    first_entry = true;
 }
@ -192,7 +192,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
 void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
 {
    printf(
-#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"version\":%lu"
+#define ENTRY_FMT "{\"block\":%ju,\"pool\":%u,\"inode\":\"0x%jx\",\"stripe\":\"0x%jx\",\"version\":%ju"
        (first_entry ? ENTRY_FMT : (",\n" ENTRY_FMT)),
 #undef ENTRY_FMT
        block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
@ -265,7 +265,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
        {
            free(new_meta_buf);
            new_meta_buf = NULL;
-            fprintf(stderr, "Metadata (data block %lu) doesn't fit into the new area\n", data_block);
+            fprintf(stderr, "Metadata (data block %ju) doesn't fit into the new area\n", data_block);
            return 1;
        }
        clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf +
--- a/src/disk_tool_prepare.cpp
+++ b/src/disk_tool_prepare.cpp
@ -8,6 +8,7 @@
 int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
 {
    static const char *allow_additional_params[] = {
+        "autosync_writes",
        "data_io",
        "meta_io",
        "journal_io",
@ -99,12 +100,9 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
        options["disable_journal_fsync"] = options["disable_data_fsync"];
    }
    // Calculate offsets if the same device is used for two or more of data, meta, and journal
-    if (options["journal_size"] == "")
+    if (options["journal_size"] == "" && (options["journal_device"] == "" || options["journal_device"] == options["data_device"]))
    {
-        if (options["journal_device"] == "")
-            options["journal_size"] = is_hdd ? "128M" : "32M";
-        else if (is_hdd)
-            options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
+        options["journal_size"] = is_hdd || !json_is_true(options["disable_data_fsync"]) ? "128M" : "32M";
    }
    bool is_hybrid = is_hdd && options["journal_device"] != "" && options["journal_device"] != options["data_device"];
    if (is_hdd)
@ -114,6 +112,15 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
        if (is_hybrid && options["throttle_small_writes"] == "")
            options["throttle_small_writes"] = "1";
    }
+    else if (!json_is_true(options["disable_data_fsync"]))
+    {
+        if (options.find("min_flusher_count") == options.end())
+            options["min_flusher_count"] = "32";
+        if (options.find("max_flusher_count") == options.end())
+            options["max_flusher_count"] = "256";
+        if (options.find("autosync_writes") == options.end())
+            options["autosync_writes"] = "512";
+    }
    json11::Json::object sb;
    blockstore_disk_t dsk;
    try
@ -203,10 +210,10 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
        desc += " with metadata on "+realpath_str(options["meta_device"]);
    if (sep_j)
        desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
-    fprintf(stderr, "Initialized OSD %lu on %s\n", osd_num, desc.c_str());
+    fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str());
    if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
    {
-        fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%lu\n", osd_num);
+        fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
        return 1;
    }
    return 0;
@ -330,7 +337,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
    std::string out;
    if (shell_exec({ "sfdisk", "--no-reread", "--force", devinfo.path }, script, &out, NULL) != 0)
    {
-        fprintf(stderr, "Failed to add %lu partition(s) with sfdisk\n", sizes.size());
+        fprintf(stderr, "Failed to add %zu partition(s) with sfdisk\n", sizes.size());
        return {};
    }
    // Get new partition table and find created partitions
@ -345,7 +352,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
    }
    if (new_parts.size() != sizes.size())
    {
-        fprintf(stderr, "Failed to add %lu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
+        fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
        return {};
    }
    // Check if new nodes exist and run partprobe if not
@ -449,7 +456,7 @@ std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & d
                    bool is_journal = sb["params"]["journal_device"].string_value() == part_path;
                    bool is_data = sb["params"]["data_device"].string_value() == part_path;
                    fprintf(
-                        stderr, "%s is already initialized for OSD %lu%s, skipping\n",
+                        stderr, "%s is already initialized for OSD %ju%s, skipping\n",
                        part["node"].string_value().c_str(), sb["params"]["osd_num"].uint64_value(),
                        (is_data ? " data" : (is_meta ? " meta" : (is_journal ? " journal" : "")))
                    );
@ -532,7 +539,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
    if (sel < 0)
    {
        fprintf(
-            stderr, "Could not find free space for new SSD journal and metadata (need %lu + %lu MiB)\n",
+            stderr, "Could not find free space for new SSD journal and metadata (need %ju + %ju MiB)\n",
            meta_size/1024/1024, journal_size/1024/1024
        );
        return 1;
@ -616,6 +623,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
        options.erase("disable_meta_fsync");
        options.erase("disable_journal_fsync");
    }
+    auto journal_size = options["journal_size"];
    for (auto & dev: devinfo)
    {
        if (!hybrid || dev.is_hdd)
@ -633,11 +641,13 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
                    {
                        return 1;
                    }
+                    options.erase("journal_size");
                }
                // Treat all disks as SSDs if not in the hybrid mode
                prepare_one(options, dev.is_hdd ? 1 : 0);
                if (hybrid)
                {
+                    options["journal_size"] = journal_size;
                    options.erase("journal_device");
                    options.erase("meta_device");
                }
--- a/src/disk_tool_resize.cpp
+++ b/src/disk_tool_resize.cpp
@ -184,7 +184,7 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
    }
    if (new_meta_len < dsk.meta_block_size*new_meta_blocks)
    {
-        fprintf(stderr, "New metadata area size is too small, should be at least %lu bytes\n", dsk.meta_block_size*new_meta_blocks);
+        fprintf(stderr, "New metadata area size is too small, should be at least %ju bytes\n", dsk.meta_block_size*new_meta_blocks);
        exit(1);
    }
    // Check that new metadata, journal and data areas don't overlap
@ -289,7 +289,7 @@ int disk_tool_t::resize_copy_data()
                        if (data->res != dsk.data_block_size)
                        {
                            fprintf(
-                                stderr, "Failed to read %u bytes at %lu from %s: %s\n", dsk.data_block_size,
+                                stderr, "Failed to read %u bytes at %ju from %s: %s\n", dsk.data_block_size,
                                dsk.data_offset + moving_blocks[i].old_loc*dsk.data_block_size, dsk.data_device.c_str(),
                                data->res < 0 ? strerror(-data->res) : "short read"
                            );
@ -314,7 +314,7 @@ int disk_tool_t::resize_copy_data()
                        if (data->res != dsk.data_block_size)
                        {
                            fprintf(
-                                stderr, "Failed to write %u bytes at %lu to %s: %s\n", dsk.data_block_size,
+                                stderr, "Failed to write %u bytes at %ju to %s: %s\n", dsk.data_block_size,
                                dsk.data_offset + moving_blocks[i].new_loc*dsk.data_block_size, dsk.data_device.c_str(),
                                data->res < 0 ? strerror(-data->res) : "short write"
                            );
--- a/src/disk_tool_udev.cpp
+++ b/src/disk_tool_udev.cpp
@ -43,8 +43,8 @@ int disk_tool_t::udev_import(std::string device)
    }
    uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
    // Print variables for udev
-    printf("VITASTOR_OSD_NUM=%lu\n", osd_num);
-    printf("VITASTOR_ALIAS=osd%lu-%s\n", osd_num, sb["device_type"].string_value().c_str());
+    printf("VITASTOR_OSD_NUM=%ju\n", osd_num);
+    printf("VITASTOR_ALIAS=osd%ju-%s\n", osd_num, sb["device_type"].string_value().c_str());
    printf("VITASTOR_DATA_DEVICE=%s\n", udev_escape(sb["params"]["data_device"].string_value()).c_str());
    if (sb["real_meta_device"].string_value() != "" && sb["real_meta_device"] != sb["real_data_device"])
        printf("VITASTOR_META_DEVICE=%s\n", udev_escape(sb["params"]["meta_device"].string_value()).c_str());
@ -466,12 +466,12 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
                    close(fd);
                if (r != 0)
                {
-                    fprintf(stderr, "Failed to clear OSD %lu %s device %s superblock: %s\n",
+                    fprintf(stderr, "Failed to clear OSD %ju %s device %s superblock: %s\n",
                        sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str(), strerror(errno));
                }
                else
                {
-                    fprintf(stderr, "OSD %lu %s device %s superblock cleared\n",
+                    fprintf(stderr, "OSD %ju %s device %s superblock cleared\n",
                        sb["params"]["osd_num"].uint64_value(), dev_type.c_str(), dev.c_str());
                }
                if (sb["params"][dev_type+"_device"].string_value().substr(0, 22) == "/dev/disk/by-partuuid/")
--- a/src/disk_tool_utils.cpp
+++ b/src/disk_tool_utils.cpp
@ -12,9 +12,9 @@ uint64_t sscanf_json(const char *fmt, const json11::Json & str)
 {
    uint64_t value = 0;
    if (fmt)
-        sscanf(str.string_value().c_str(), "%lx", &value);
+        sscanf(str.string_value().c_str(), "%jx", &value);
    else if (str.string_value().size() > 2 && (str.string_value()[0] == '0' && str.string_value()[1] == 'x'))
-        sscanf(str.string_value().c_str(), "0x%lx", &value);
+        sscanf(str.string_value().c_str(), "0x%jx", &value);
    else
        value = str.uint64_value();
    return value;
--- a/src/etcd_state_client.cpp
+++ b/src/etcd_state_client.cpp
@ -333,7 +333,7 @@ void etcd_state_client_t::start_etcd_watcher()
        etcd_watch_ws = NULL;
    }
    if (this->log_level > 1)
-        fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %lu\n", etcd_address.c_str(), etcd_watch_revision);
+        fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %ju\n", etcd_address.c_str(), etcd_watch_revision);
    etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
        [this, cur_addr = selected_etcd_address](const http_response_t *msg)
    {
@ -357,7 +357,7 @@ void etcd_state_client_t::start_etcd_watcher()
                        watch_id == ETCD_OSD_STATE_WATCH_ID)
                        etcd_watches_initialised++;
                    if (etcd_watches_initialised == ETCD_TOTAL_WATCHES && this->log_level > 0)
-                        fprintf(stderr, "Successfully subscribed to etcd at %s, revision %lu\n", cur_addr.c_str(), etcd_watch_revision);
+                        fprintf(stderr, "Successfully subscribed to etcd at %s, revision %ju\n", cur_addr.c_str(), etcd_watch_revision);
                }
                if (data["result"]["canceled"].bool_value())
                {
@ -371,7 +371,7 @@ void etcd_state_client_t::start_etcd_watcher()
                            // check to not trigger on_reload_hook multiple times
                            if (etcd_watch_ws != NULL)
                            {
-                                fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
+                                fprintf(stderr, "Revisions before %ju were compacted by etcd, reloading state\n",
                                    data["result"]["compact_revision"].uint64_value());
                                http_close(etcd_watch_ws);
                                etcd_watch_ws = NULL;
@ -382,7 +382,7 @@ void etcd_state_client_t::start_etcd_watcher()
                        }
                        else
                        {
-                            fprintf(stderr, "Revisions before %lu were compacted by etcd, exiting\n",
+                            fprintf(stderr, "Revisions before %ju were compacted by etcd, exiting\n",
                                data["result"]["compact_revision"].uint64_value());
                            exit(1);
                        }
@ -646,7 +646,7 @@ void etcd_state_client_t::load_pgs()
            etcd_watch_revision = data["header"]["revision"].uint64_value()+1;
            if (this->log_level > 3)
            {
-                fprintf(stderr, "Loaded revision %lu of PG configuration\n", etcd_watch_revision-1);
+                fprintf(stderr, "Loaded revision %ju of PG configuration\n", etcd_watch_revision-1);
            }
        }
        for (auto & res: data["responses"].array_items())
@ -740,7 +740,7 @@ void etcd_state_client_t::clean_nonexistent_pgs()
    {
        if (seen_peers.find(peer_item.first) == seen_peers.end())
        {
-            fprintf(stderr, "OSD %lu state disappeared after reload, forgetting it\n", peer_item.first);
+            fprintf(stderr, "OSD %ju state disappeared after reload, forgetting it\n", peer_item.first);
            parse_state((etcd_kv_t){
                .key = etcd_prefix+"/osd/state/"+std::to_string(peer_item.first),
            });
@ -890,7 +890,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
            {
                if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
                {
-                    fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
+                    fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %zu != pool pg_size %ju\n",
                        pool_id, pg_item.first, pg_item.second.target_set.size(), parsed_cfg.pg_size);
                    pg_item.second.pause = true;
                }
@ -936,7 +936,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                }
                if (parsed_cfg.target_set.size() != pool_config[pool_id].pg_size)
                {
-                    fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
+                    fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %zu != pool pg_size %ju\n",
                        pool_id, pg_num, parsed_cfg.target_set.size(), pool_config[pool_id].pg_size);
                    parsed_cfg.pause = true;
                }
@ -950,7 +950,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                if (pg_it->second.config_exists && pg_it->first != ++n)
                {
                    fprintf(
-                        stderr, "Invalid pool %u PG configuration: PG numbers don't cover whole 1..%lu range\n",
+                        stderr, "Invalid pool %u PG configuration: PG numbers don't cover whole 1..%zu range\n",
                        pool_item.second.id, pool_item.second.pg_config.size()
                    );
                    for (pg_it = pool_item.second.pg_config.begin(); pg_it != pool_item.second.pg_config.end(); pg_it++)
@ -1066,7 +1066,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                (state & PG_PEERING) && state != PG_PEERING ||
                (state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
            {
-                fprintf(stderr, "Unexpected pool %u PG %u state in etcd: primary=%lu, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
+                fprintf(stderr, "Unexpected pool %u PG %u state in etcd: primary=%ju, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
                return;
            }
            pg_cfg.cur_primary = cur_primary;
@ -1102,7 +1102,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
        uint64_t pool_id = 0;
        uint64_t inode_num = 0;
        char null_byte = 0;
-        int scanned = sscanf(key.c_str() + etcd_prefix.length()+14, "%lu/%lu%c", &pool_id, &inode_num, &null_byte);
+        int scanned = sscanf(key.c_str() + etcd_prefix.length()+14, "%ju/%ju%c", &pool_id, &inode_num, &null_byte);
        if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)))
        {
            fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
@ -1145,7 +1145,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
                    else if (parent_pool_id >= POOL_ID_MAX)
                    {
                        fprintf(
-                            stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
+                            stderr, "Inode %ju/%ju parent_pool value is invalid, ignoring parent setting\n",
                            inode_num >> (64-POOL_ID_BITS), inode_num & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)
                        );
                        parent_inode_num = 0;
--- a/src/fio_cluster.cpp
+++ b/src/fio_cluster.cpp
@ -377,7 +377,7 @@ static void io_callback(void *opaque, long retval)
    bsd->completed.push_back(io);
    if (bsd->trace)
    {
-        printf("--- %s 0x%lx retval=%ld\n", io->ddir == DDIR_READ ? "READ" :
+        printf("--- %s 0x%jx retval=%ld\n", io->ddir == DDIR_READ ? "READ" :
            (io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), (uint64_t)io, retval);
    }
 }
@ -405,10 +405,11 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
    bsd->inflight++;

    uint64_t inode = opt->image ? vitastor_c_inode_get_num(bsd->watch) : opt->inode;
+    assert(io->xfer_buflen < (size_t)-1);
    switch (io->ddir)
    {
    case DDIR_READ:
-        iov = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
+        iov = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
        vitastor_c_read(bsd->cli, inode, io->offset, io->xfer_buflen, &iov, 1, read_callback, io);
        bsd->last_sync = false;
        break;
@ -436,7 +437,7 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
            io->error = EROFS;
            return FIO_Q_COMPLETED;
        }
-        iov = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
+        iov = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
        vitastor_c_write(bsd->cli, inode, io->offset, io->xfer_buflen, 0, &iov, 1, io_callback, io);
        bsd->last_sync = false;
        break;
@ -453,11 +454,11 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
    {
        if (io->ddir == DDIR_SYNC)
        {
-            printf("+++ SYNC 0x%lx\n", (uint64_t)io);
+            printf("+++ SYNC 0x%jx\n", (uint64_t)io);
        }
        else
        {
-            printf("+++ %s 0x%lx 0x%llx+%lx\n",
+            printf("+++ %s 0x%jx 0x%llx+%jx\n",
                io->ddir == DDIR_READ ? "READ" : "WRITE",
                (uint64_t)io, io->offset, (uint64_t)io->xfer_buflen);
        }
--- a/src/fio_sec_osd.cpp
+++ b/src/fio_sec_osd.cpp
@ -310,7 +310,8 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
    int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
    if (io->ddir == DDIR_WRITE)
    {
-        iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
+        assert(io->xfer_buflen <= 0x7fffffff);
+        iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
        wtotal += io->xfer_buflen;
    }
    if (sendv_blocking(bsd->connect_fd, iov, iovcnt,
@ -341,13 +342,13 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
        read_blocking(bsd->connect_fd, reply.buf, OSD_PACKET_SIZE);
        if (reply.hdr.magic != SECONDARY_OSD_REPLY_MAGIC)
        {
-            fprintf(stderr, "bad reply: magic = %lx instead of %lx\n", reply.hdr.magic, SECONDARY_OSD_REPLY_MAGIC);
+            fprintf(stderr, "bad reply: magic = %jx instead of %jx\n", reply.hdr.magic, SECONDARY_OSD_REPLY_MAGIC);
            exit(1);
        }
        auto it = bsd->queue.find(reply.hdr.id);
        if (it == bsd->queue.end())
        {
-            fprintf(stderr, "bad reply: op id %lx missing in local queue\n", reply.hdr.id);
+            fprintf(stderr, "bad reply: op id %jx missing in local queue\n", reply.hdr.id);
            exit(1);
        }
        io_u* io = it->second->fio_op;
@ -357,7 +358,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
        {
            if (reply.hdr.retval != io->xfer_buflen)
            {
-                fprintf(stderr, "Short read: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
+                fprintf(stderr, "Short read: retval = %jd instead of %ju\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
                exit(1);
            }
            // Support bitmap
@ -371,7 +372,8 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
                else
                    iov[iovcnt++] = { .iov_base = (void*)(bitmap = (uint64_t)malloc(reply.sec_rw.attr_len)), .iov_len = reply.sec_rw.attr_len };
            }
-            iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
+            assert(io->xfer_buflen <= 0x7FFFFFFF);
+            iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = (size_t)io->xfer_buflen };
            readv_blocking(bsd->connect_fd, iov, iovcnt);
            if (reply.sec_rw.attr_len > 8)
            {
@ -382,7 +384,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
        {
            if (reply.hdr.retval != io->xfer_buflen)
            {
-                fprintf(stderr, "Short write: retval = %ld instead of %lu\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
+                fprintf(stderr, "Short write: retval = %jd instead of %ju\n", reply.hdr.retval, (uint64_t)io->xfer_buflen);
                exit(1);
            }
        }
@ -390,13 +392,13 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
        {
            if (reply.hdr.retval != 0)
            {
-                fprintf(stderr, "Sync failed: retval = %ld\n", reply.hdr.retval);
+                fprintf(stderr, "Sync failed: retval = %jd\n", reply.hdr.retval);
                exit(1);
            }
        }
        if (opt->trace)
        {
-            printf("--- %s # %ld\n", io->ddir == DDIR_READ ? "READ" :
+            printf("--- %s # %ju\n", io->ddir == DDIR_READ ? "READ" :
                (io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), reply.hdr.id);
        }
        bsd->completed.push_back(io);
--- a/src/kv_cli.cpp
+++ b/src/kv_cli.cpp
@ -0,0 +1,401 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// Vitastor shared key/value database test CLI
+
+#define _XOPEN_SOURCE
+#include <limits.h>
+
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <fcntl.h>
+//#include <signal.h>
+
+#include "epoll_manager.h"
+#include "str_util.h"
+#include "kv_db.h"
+
+const char *exe_name = NULL;
+
+// Command-line client for the shared key/value database.
+// Reads commands from stdin, one per line, and executes them one at a time against <db>.
+class kv_cli_t
+{
+public:
+    // Database wrapper and the client stack it runs on.
+    // All four are created in run() and released either there or in the destructor.
+    kv_dbw_t *db = NULL;
+    ring_loop_t *ringloop = NULL;
+    epoll_manager_t *epmgr = NULL;
+    cluster_client_t *cli = NULL;
+    // Set when stdin was successfully registered in epoll; enables the "> " prompt
+    bool interactive = false;
+    // Number of commands currently being executed (next_cmd() starts a new one only when it is 0)
+    int in_progress = 0;
+    // Input buffer: <cur_cmd_size> bytes of pending input in a buffer of <cur_cmd_alloc> bytes
+    char *cur_cmd = NULL;
+    int cur_cmd_size = 0;
+    int cur_cmd_alloc = 0;
+    // <finished> stops the main loop in run(); <eof> is set when read() on stdin returns 0
+    bool finished = false;
+    bool eof = false;
+    // Database options accumulated by the "open" and "config" commands
+    json11::Json::object cfg;
+
+    kv_cli_t() = default;
+    // The object owns raw pointers which are freed in the destructor,
+    // so a copy would lead to a double free - forbid copying
+    kv_cli_t(const kv_cli_t &) = delete;
+    kv_cli_t & operator = (const kv_cli_t &) = delete;
+    ~kv_cli_t();
+
+    // Convert "--option value" command line arguments into a configuration object
+    static json11::Json::object parse_args(int narg, const char *args[]);
+    // Connect to the cluster and process commands from stdin until finished
+    void run(const json11::Json::object & cfg);
+    // Read available input from stdin into the buffer and try to run the next command
+    void read_cmd();
+    // Extract the next buffered command and execute it if nothing is in progress
+    void next_cmd();
+    // Execute one command line and call <cb> when it completes
+    void handle_cmd(const std::string & cmd, std::function<void()> cb);
+};
+
+// Release the input buffer and whatever part of the client stack is still alive
+// (run() normally destroys it itself and resets the pointers to NULL).
+kv_cli_t::~kv_cli_t()
+{
+    // free() and delete are no-ops for NULL pointers, so no explicit checks are needed here
+    free(cur_cmd);
+    cur_cmd = NULL;
+    cur_cmd_alloc = 0;
+    delete db;
+    if (cli != NULL)
+    {
+        // Flush the client before destroying it
+        cli->flush();
+        delete cli;
+    }
+    delete epmgr;
+    delete ringloop;
+}
+
+// Build the configuration object from command line arguments.
+// "-h"/"--help" prints the usage text and exits. Every "--name" argument becomes cfg["name"]
+// with the following argument as its value; "--json" and a trailing option without a value get "1".
+// Arguments which do not start with "--" are skipped.
+json11::Json::object kv_cli_t::parse_args(int narg, const char *args[])
+{
+    json11::Json::object cfg;
+    for (int i = 1; i < narg; i++)
+    {
+        const char *arg = args[i];
+        if (!strcmp(arg, "-h") || !strcmp(arg, "--help"))
+        {
+            printf(
+                "Vitastor Key/Value CLI\n"
+                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
+                "\n"
+                "USAGE: %s [--etcd_address ADDR] [OTHER OPTIONS]\n",
+                exe_name
+            );
+            exit(0);
+        }
+        if (arg[0] != '-' || arg[1] != '-')
+        {
+            continue;
+        }
+        const char *opt = arg+2;
+        const char *value = "1";
+        bool is_flag = !strcmp(opt, "json") || i == narg-1;
+        if (!is_flag)
+        {
+            // The next argument is consumed as the option value
+            value = args[++i];
+        }
+        cfg[opt] = value;
+    }
+    return cfg;
+}
+
+// Create the cluster client and the database wrapper, then execute commands
+// from stdin until <finished> is set, and finally tear everything down.
+void kv_cli_t::run(const json11::Json::object & cfg)
+{
+    // Bring up the event loop, the cluster client and the database on top of them
+    ringloop = new ring_loop_t(512);
+    epmgr = new epoll_manager_t(ringloop);
+    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
+    db = new kv_dbw_t(cli);
+    // Spin the event loop until the client reports that it is ready
+    for (;;)
+    {
+        if (cli->is_ready())
+            break;
+        ringloop->loop();
+        if (cli->is_ready())
+            break;
+        ringloop->wait();
+    }
+    // Switch stdin to non-blocking mode
+    int stdin_flags = fcntl(0, F_GETFL, 0);
+    fcntl(0, F_SETFL, stdin_flags | O_NONBLOCK);
+    try
+    {
+        // Interactive mode: react to stdin events delivered by epoll
+        epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
+        {
+            if (events & EPOLLIN)
+            {
+                read_cmd();
+            }
+            if (events & EPOLLRDHUP)
+            {
+                // Input was closed - stop watching it and leave the main loop
+                epmgr->tfd->set_fd_handler(0, false, NULL);
+                finished = true;
+            }
+        });
+        interactive = true;
+        printf("> ");
+    }
+    catch (std::exception & e)
+    {
+        // Can't add to epoll, STDIN is probably a file
+        read_cmd();
+    }
+    // Main loop
+    while (!finished)
+    {
+        ringloop->loop();
+        if (finished)
+            break;
+        ringloop->wait();
+    }
+    // Tear down in the reverse order of creation
+    delete db;
+    db = NULL;
+    cli->flush();
+    delete cli;
+    cli = NULL;
+    delete epmgr;
+    epmgr = NULL;
+    delete ringloop;
+    ringloop = NULL;
+}
+
+// Read as much input from stdin as is available and fits into the command buffer
+// (allocated on first use), then try to execute the next buffered command.
+void kv_cli_t::read_cmd()
+{
+    if (cur_cmd_alloc == 0)
+    {
+        cur_cmd_alloc = 65536;
+        cur_cmd = (char*)malloc_or_die(cur_cmd_alloc);
+    }
+    for (;;)
+    {
+        if (cur_cmd_size >= cur_cmd_alloc)
+        {
+            // Buffer is full
+            break;
+        }
+        int got = read(0, cur_cmd+cur_cmd_size, cur_cmd_alloc-cur_cmd_size);
+        if (got > 0)
+        {
+            cur_cmd_size += got;
+            continue;
+        }
+        if (got == 0)
+        {
+            // End of input
+            eof = true;
+        }
+        else if (errno != EAGAIN)
+        {
+            // EAGAIN only means that no more data is available right now
+            fprintf(stderr, "Error reading from stdin: %s\n", strerror(errno));
+        }
+        break;
+    }
+    next_cmd();
+}
+
+// Take the next command out of the input buffer and start executing it.
+//
+// Does nothing while another command is in progress. A command is a line terminated
+// by '\n' or '\r'; after end of input the unterminated remainder of the buffer is also
+// treated as a command so the last line of the input isn't lost when it lacks a newline.
+// Sets <finished> when the input has ended and nothing is being executed.
+void kv_cli_t::next_cmd()
+{
+    if (in_progress > 0)
+    {
+        return;
+    }
+    // Look for the end of the first buffered line
+    int pos = 0;
+    while (pos < cur_cmd_size && cur_cmd[pos] != '\n' && cur_cmd[pos] != '\r')
+    {
+        pos++;
+    }
+    bool has_eol = pos < cur_cmd_size;
+    if (has_eol || (eof && cur_cmd_size > 0))
+    {
+        auto cmd = trim(std::string(cur_cmd, pos));
+        // Remove the command (and its line terminator, if there is one) from the buffer
+        int consumed = has_eol ? pos+1 : pos;
+        memmove(cur_cmd, cur_cmd+consumed, cur_cmd_size-consumed);
+        cur_cmd_size -= consumed;
+        in_progress++;
+        handle_cmd(cmd, [this]()
+        {
+            in_progress--;
+            if (interactive)
+                printf("> ");
+            // Run the next buffered command, or read more input if there was none
+            next_cmd();
+            if (!in_progress)
+                read_cmd();
+        });
+    }
+    if (eof && !in_progress)
+    {
+        finished = true;
+    }
+}
+
+// Parse and execute one command line, calling <cb> when the command completes.
+//
+// The first whitespace-delimited word is the case-insensitive operation name, the rest
+// of the line are its arguments. Supported operations: open, config, get, set, del, list,
+// close, quit (q). Results are printed to stdout, errors and usage hints to stderr.
+// Every branch except quit/q either calls <cb> directly or hands it over to the
+// asynchronous database callback; quit/q only closes stdin and sets <finished>.
+void kv_cli_t::handle_cmd(const std::string & cmd, std::function<void()> cb)
+{
+    if (cmd == "")
+    {
+        cb();
+        return;
+    }
+    // The operation name ends at the first whitespace character. It has to be cut here,
+    // before skipping repeated whitespace, otherwise "get  key" would yield opname "get "
+    auto op_end = cmd.find_first_of(" \t");
+    auto opname = strtolower(op_end == std::string::npos ? cmd : cmd.substr(0, op_end));
+    // <pos> is the last character of the whitespace run between the name and the arguments,
+    // i.e. arguments begin at pos+1; it stays npos when there are no arguments at all
+    auto pos = op_end;
+    if (pos != std::string::npos)
+    {
+        while (pos < cmd.size()-1 && (cmd[pos+1] == ' ' || cmd[pos+1] == '\t'))
+            pos++;
+    }
+    if (opname == "open")
+    {
+        uint64_t pool_id = 0;
+        inode_t inode_id = 0;
+        uint32_t kv_block_size = 0;
+        const char *open_args = pos == std::string::npos ? "" : cmd.c_str()+pos+1;
+        // %ju is used for 64-bit values like in the rest of the code
+        int scanned = sscanf(open_args, "%ju %ju %u", &pool_id, &inode_id, &kv_block_size);
+        if (scanned == 2)
+        {
+            // Block size was omitted - use the default
+            kv_block_size = 4096;
+        }
+        if (scanned < 2 || !pool_id || !inode_id || !kv_block_size || (kv_block_size & (kv_block_size-1)) != 0)
+        {
+            fprintf(stderr, "Usage: open <pool_id> <inode_id> [block_size]. Block size must be a power of 2. Default is 4096.\n");
+            cb();
+            return;
+        }
+        cfg["kv_block_size"] = (uint64_t)kv_block_size;
+        db->open(INODE_WITH_POOL(pool_id, inode_id), cfg, [this, cb](int res)
+        {
+            if (res < 0)
+                fprintf(stderr, "Error opening index: %s (code %d)\n", strerror(-res), res);
+            else
+                printf("Index opened. Current size: %ju bytes\n", db->get_size());
+            cb();
+        });
+    }
+    else if (opname == "config")
+    {
+        // <property> and <value> are separated by the next whitespace character
+        auto pos2 = pos == std::string::npos ? pos : cmd.find_first_of(" \t", pos+1);
+        if (pos2 == std::string::npos)
+        {
+            fprintf(stderr, "Usage: config <property> <value>\n");
+            cb();
+            return;
+        }
+        auto key = trim(cmd.substr(pos+1, pos2-pos-1));
+        auto value = parse_size(trim(cmd.substr(pos2+1)));
+        if (key != "kv_memory_limit" &&
+            key != "kv_allocate_blocks" &&
+            key != "kv_evict_max_misses" &&
+            key != "kv_evict_attempts_per_level" &&
+            key != "kv_evict_unused_age" &&
+            key != "kv_log_level")
+        {
+            fprintf(
+                stderr, "Allowed properties: kv_memory_limit, kv_allocate_blocks,"
+                " kv_evict_max_misses, kv_evict_attempts_per_level, kv_evict_unused_age, kv_log_level\n"
+            );
+        }
+        else
+        {
+            cfg[key] = value;
+            db->set_config(cfg);
+        }
+        cb();
+    }
+    else if (opname == "get" || opname == "set" || opname == "del")
+    {
+        if (opname == "get" || opname == "del")
+        {
+            if (pos == std::string::npos)
+            {
+                fprintf(stderr, "Usage: %s <key>\n", opname.c_str());
+                cb();
+                return;
+            }
+            auto key = trim(cmd.substr(pos+1));
+            if (opname == "get")
+            {
+                db->get(key, [cb](int res, const std::string & value)
+                {
+                    if (res < 0)
+                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
+                    else
+                    {
+                        // Values are written as is, without going through printf
+                        write(1, value.c_str(), value.size());
+                        write(1, "\n", 1);
+                    }
+                    cb();
+                });
+            }
+            else
+            {
+                db->del(key, [cb](int res)
+                {
+                    if (res < 0)
+                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
+                    else
+                        printf("OK\n");
+                    cb();
+                });
+            }
+        }
+        else
+        {
+            // The key ends at the next whitespace character, the value is the rest of the line
+            auto pos2 = pos == std::string::npos ? pos : cmd.find_first_of(" \t", pos+1);
+            if (pos2 == std::string::npos)
+            {
+                fprintf(stderr, "Usage: set <key> <value>\n");
+                cb();
+                return;
+            }
+            auto key = trim(cmd.substr(pos+1, pos2-pos-1));
+            auto value = trim(cmd.substr(pos2+1));
+            db->set(key, value, [cb](int res)
+            {
+                if (res < 0)
+                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
+                else
+                    printf("OK\n");
+                cb();
+            });
+        }
+    }
+    else if (opname == "list")
+    {
+        // NOTE(review): <end> is parsed but not used below - only <start> is passed to list_start()
+        std::string start, end;
+        if (pos != std::string::npos)
+        {
+            auto pos2 = cmd.find_first_of(" \t", pos+1);
+            if (pos2 != std::string::npos)
+            {
+                start = trim(cmd.substr(pos+1, pos2-pos-1));
+                end = trim(cmd.substr(pos2+1));
+            }
+            else
+            {
+                start = trim(cmd.substr(pos+1));
+            }
+        }
+        void *handle = db->list_start(start);
+        db->list_next(handle, [this, handle, cb](int res, const std::string & key, const std::string & value)
+        {
+            if (res < 0)
+            {
+                // -ENOENT is not reported as an error, any other negative code is
+                if (res != -ENOENT)
+                {
+                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
+                }
+                db->list_close(handle);
+                cb();
+            }
+            else
+            {
+                printf("%s = %s\n", key.c_str(), value.c_str());
+                // Request the next item; the callback is not passed again
+                db->list_next(handle, NULL);
+            }
+        });
+    }
+    else if (opname == "close")
+    {
+        db->close([cb]()
+        {
+            printf("Index closed\n");
+            cb();
+        });
+    }
+    else if (opname == "quit" || opname == "q")
+    {
+        // Stop reading input and leave the main loop; <cb> is intentionally not called
+        ::close(0);
+        finished = true;
+    }
+    else
+    {
+        fprintf(
+            stderr, "Unknown operation: %s. Supported operations:\n"
+            "open <pool_id> <inode_id> [block_size]\n"
+            "config <property> <value>\n"
+            "get <key>\nset <key> <value>\ndel <key>\nlist [<start> [end]]\n"
+            "close\nquit\n", opname.c_str()
+        );
+        cb();
+    }
+}
+
+// Entry point: make stdout/stderr unbuffered, remember the program name
+// for the usage text and run the CLI with options taken from the command line.
+int main(int narg, const char *args[])
+{
+    // Disable buffering on both output streams
+    setvbuf(stdout, NULL, _IONBF, 0);
+    setvbuf(stderr, NULL, _IONBF, 0);
+    exe_name = args[0];
+    kv_cli_t *app = new kv_cli_t();
+    app->run(kv_cli_t::parse_args(narg, args));
+    delete app;
+    return 0;
+}
--- a/src/kv_db.cpp
+++ b/src/kv_db.cpp
--- a/src/kv_db.h
+++ b/src/kv_db.h
@ -0,0 +1,36 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// Vitastor shared key/value database
+// Parallel optimistic B-Tree O:-)
+
+#pragma once
+
+#include "cluster_client.h"
+
+struct kv_db_t;
+
+// Interface of the shared key/value database. The implementation type kv_db_t
+// is only forward-declared in this header and is reached through <db>
+struct kv_dbw_t
+{
+    kv_dbw_t(cluster_client_t *cli);
+    ~kv_dbw_t();
+
+    // Open the database stored in inode <inode_id> with settings <cfg>.
+    // <cb> receives the result code; callers in this patch treat res < 0 as -errno
+    void open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb);
+    // Pass settings <cfg> to the database
+    // NOTE(review): presumably accepts the same keys as open() - confirm in kv_db.cpp
+    void set_config(json11::Json cfg);
+    // Close the database; <cb> takes no arguments
+    void close(std::function<void()> cb);
+
+    // NOTE(review): what exactly the returned size measures is not visible from this header - confirm in kv_db.cpp
+    uint64_t get_size();
+
+    // Read the value of <key>; <cb> receives the result code and the value.
+    // NOTE(review): allow_old_cached presumably permits answering from possibly stale cache - confirm
+    void get(const std::string & key, std::function<void(int res, const std::string & value)> cb,
+        bool allow_old_cached = false);
+    // Store <value> under <key>; <cb> receives the result code.
+    // NOTE(review): cas_compare looks like an optional compare-and-set predicate which is
+    // given the current state of the key - confirm its exact contract in kv_db.cpp
+    void set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
+        std::function<bool(int res, const std::string & value)> cas_compare = NULL);
+    // Remove <key>; the parameters mirror set()
+    void del(const std::string & key, std::function<void(int res)> cb,
+        std::function<bool(int res, const std::string & value)> cas_compare = NULL);
+
+    // Begin a listing at <start> and return an opaque handle for list_next()/list_close()
+    void* list_start(const std::string & start);
+    // Request the next listed key. Callers in this patch pass the callback on the first
+    // call and NULL on the following ones, and treat res < 0 as the end of the listing
+    // (-ENOENT being the normal end)
+    void list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb);
+    // Release a handle returned by list_start()
+    void list_close(void *handle);
+
+    kv_db_t *db;
+};
--- a/src/kv_stress.cpp
+++ b/src/kv_stress.cpp
@ -0,0 +1,697 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// Vitastor shared key/value database stress tester / benchmark
+
+#define _XOPEN_SOURCE
+#include <limits.h>
+
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <fcntl.h>
+//#include <signal.h>
+
+#include "epoll_manager.h"
+#include "str_util.h"
+#include "kv_db.h"
+
+const char *exe_name = NULL;
+
+// State of one listing operation which is being verified by the stress test
+struct kv_test_listing_t
+{
+    // Number of keys to check (0 = no limit)
+    uint64_t count = 0;
+    // Number of keys checked so far
+    uint64_t done = 0;
+    // Listing handle returned by kv_dbw_t::list_start()
+    void *handle = NULL;
+    // Position in the reference map after which the next listed key is expected
+    std::string next_after;
+    // Keys which were being changed during the listing; they are not verified
+    std::set<std::string> inflights;
+    // Start time of the listing, used for latency statistics
+    timespec tv_begin;
+    // Becomes true when the listing returned an error or unexpected data
+    bool error = false;
+};
+
+// Latency counter of one operation type
+struct kv_test_lat_t
+{
+    // Operation name used in the statistics output
+    const char *name = NULL;
+    // Sum of latencies of accounted operations, in microseconds
+    uint64_t usec = 0;
+    // Number of accounted operations
+    uint64_t count = 0;
+};
+
+// Statistics of all operation types of the stress test
+struct kv_test_stat_t
+{
+    kv_test_lat_t get;
+    kv_test_lat_t add;
+    kv_test_lat_t update;
+    kv_test_lat_t del;
+    kv_test_lat_t list;
+    // Total number of keys returned by listings
+    uint64_t list_keys = 0;
+};
+
+// Key/value stress tester: configuration filled by parse_config(), accumulated
+// statistics and the runtime state used by loop()
+class kv_test_t
+{
+public:
+    // Config
+    // kv_* options passed to kv_dbw_t::open()
+    json11::Json::object kv_cfg;
+    // Prefix and suffix added to every generated key
+    std::string key_prefix, key_suffix;
+    // Inode of the index, combined from the pool_id and inode_id options
+    uint64_t inode_id = 0;
+    // Total number of operations to run
+    uint64_t op_count = 1000000;
+    // Value of the "runtime" option, in seconds
+    uint64_t runtime_sec = 0;
+    // Maximum number of operations in flight
+    uint64_t parallelism = 4;
+    // Relative weights of operation types: loop() rolls a dice modulo their sum (total_prob)
+    uint64_t reopen_prob = 1;
+    uint64_t get_prob = 30000;
+    uint64_t add_prob = 20000;
+    uint64_t update_prob = 20000;
+    uint64_t del_prob = 5000;
+    uint64_t list_prob = 300;
+    // Length limits of generated keys (prefix and suffix not included) and values
+    uint64_t min_key_len = 10;
+    uint64_t max_key_len = 70;
+    uint64_t min_value_len = 50;
+    uint64_t max_value_len = 300;
+    // Limits for the number of keys checked by one listing
+    uint64_t min_list_count = 10;
+    uint64_t max_list_count = 1000;
+    // Statistics print interval in seconds (0 disables the timer)
+    uint64_t print_stats_interval = 1;
+    bool json_output = false;
+    uint64_t log_level = 1;
+    // Enabled when kv_log_level >= 10: print every operation
+    bool trace = false;
+    // Exit with code 1 on the first detected error
+    bool stop_on_error = false;
+    // FIXME: Multiple clients
+    // Current counters, and their snapshot from the previous print_stats() call
+    kv_test_stat_t stat, prev_stat;
+    timespec prev_stat_time, start_stat_time;
+
+    // State
+    kv_dbw_t *db = NULL;
+    ring_loop_t *ringloop = NULL;
+    epoll_manager_t *epmgr = NULL;
+    cluster_client_t *cli = NULL;
+    // Event loop consumer which calls loop()
+    ring_consumer_t consumer;
+    // Set to stop the event loop in run()
+    bool finished = false;
+    // Sum of all *_prob weights
+    uint64_t total_prob = 0;
+    uint64_t ops_sent = 0, ops_done = 0;
+    int stat_timer_id = -1;
+    // Number of operations currently in flight
+    int in_progress = 0;
+    // True while the index is being opened or reopened; loop() does nothing meanwhile
+    bool reopening = false;
+    // Listings currently in progress
+    std::set<kv_test_listing_t*> listings;
+    // Keys with a set or delete currently in flight
+    std::set<std::string> changing_keys;
+    // Reference copy of the expected database content
+    std::map<std::string, std::string> values;
+
+    ~kv_test_t();
+
+    static json11::Json::object parse_args(int narg, const char *args[]);
+    void parse_config(json11::Json cfg);
+    void run(json11::Json cfg);
+    void loop();
+    void print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time);
+    void print_total_stats();
+    void start_change(const std::string & key);
+    void stop_change(const std::string & key);
+    void add_stat(kv_test_lat_t & stat, timespec tv_begin);
+};
+
+// Free whatever is still allocated: database, client, epoll manager and ring loop
+kv_test_t::~kv_test_t()
+{
+    // Deleting a null pointer does nothing, so only flush() requires a check
+    delete db;
+    if (cli != NULL)
+        cli->flush();
+    delete cli;
+    delete epmgr;
+    delete ringloop;
+}
+
+// Convert the command line into a configuration object.
+// Every "--name value" pair becomes cfg["name"] = "value" (always a string).
+// "--json" and an option given as the very last argument get the value "1".
+// Arguments which do not start with "--" are ignored. -h/--help prints usage and exits.
+json11::Json::object kv_test_t::parse_args(int narg, const char *args[])
+{
+    json11::Json::object cfg;
+    for (int i = 1; i < narg; i++)
+    {
+        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
+        {
+            // Defaults printed here must match the member initializers of kv_test_t
+            printf(
+                "Vitastor Key/Value DB stress tester / benchmark\n"
+                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
+                "\n"
+                "USAGE: %s --pool_id POOL_ID --inode_id INODE_ID [OPTIONS]\n"
+                "  --op_count 1000000\n"
+                "    Total operations to run during test. 0 means unlimited\n"
+                "  --key_prefix \"\"\n"
+                "    Prefix for all keys read or written (to avoid collisions)\n"
+                "  --key_suffix \"\"\n"
+                "    Suffix for all keys read or written (to avoid collisions, but scan all DB)\n"
+                "  --runtime 0\n"
+                "    Run for this number of seconds. 0 means unlimited\n"
+                "  --parallelism 4\n"
+                "    Run this number of operations in parallel\n"
+                "  --reopen_prob 1\n"
+                "    Fraction of index close and reopen operations\n"
+                "  --get_prob 30000\n"
+                "    Fraction of key retrieve operations\n"
+                "  --add_prob 20000\n"
+                "    Fraction of key addition operations\n"
+                "  --update_prob 20000\n"
+                "    Fraction of key update operations\n"
+                "  --del_prob 5000\n"
+                "    Fraction of key delete operations\n"
+                "  --list_prob 300\n"
+                "    Fraction of listing operations\n"
+                "  --min_key_len 10\n"
+                "    Minimum key size in bytes\n"
+                "  --max_key_len 70\n"
+                "    Maximum key size in bytes\n"
+                "  --min_value_len 50\n"
+                "    Minimum value size in bytes\n"
+                "  --max_value_len 300\n"
+                "    Maximum value size in bytes\n"
+                "  --min_list_count 10\n"
+                "    Minimum number of keys read in listing (0 = all keys)\n"
+                "  --max_list_count 1000\n"
+                "    Maximum number of keys read in listing\n"
+                "  --print_stats 1\n"
+                "    Print operation statistics every this number of seconds\n"
+                "  --json\n"
+                "    JSON output\n"
+                "  --stop_on_error 0\n"
+                "    Stop on first execution error, mismatch, lost key or extra key during listing\n"
+                "  --kv_memory_limit 128M\n"
+                "    Maximum memory to use for vitastor-kv index cache\n"
+                "  --kv_allocate_blocks 4\n"
+                "    Number of PG blocks used for new tree block allocation in parallel\n"
+                "  --kv_evict_max_misses 10\n"
+                "    Eviction algorithm parameter: retry eviction from another random spot\n"
+                "    if this number of keys is used currently or was used recently\n"
+                "  --kv_evict_attempts_per_level 3\n"
+                "    Retry eviction at most this number of times per tree level, starting\n"
+                "    with bottom-most levels\n"
+                "  --kv_evict_unused_age 1000\n"
+                "    Evict only keys unused during this number of last operations\n"
+                "  --kv_log_level 1\n"
+                "    Log level. 0 = errors, 1 = warnings, 10 = trace operations\n",
+                exe_name
+            );
+            exit(0);
+        }
+        else if (args[i][0] == '-' && args[i][1] == '-')
+        {
+            const char *opt = args[i]+2;
+            // --json is a flag without a value; a trailing option also has nothing to consume
+            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
+        }
+    }
+    return cfg;
+}
+
+// Fill the test configuration from <cfg> (as produced by parse_args()).
+// Options which are absent keep their defaults. kv_* options are copied into
+// kv_cfg to be passed to the database. Exits with code 1 when all operation
+// weights are zero, because loop() would otherwise divide by zero.
+void kv_test_t::parse_config(json11::Json cfg)
+{
+    inode_id = INODE_WITH_POOL(cfg["pool_id"].uint64_value(), cfg["inode_id"].uint64_value());
+    if (!cfg["op_count"].is_null())
+    {
+        // An explicit 0 means "unlimited" as promised by --help
+        op_count = cfg["op_count"].uint64_value();
+        if (!op_count)
+            op_count = UINT64_MAX;
+    }
+    key_prefix = cfg["key_prefix"].string_value();
+    key_suffix = cfg["key_suffix"].string_value();
+    if (cfg["runtime"].uint64_value() > 0)
+        runtime_sec = cfg["runtime"].uint64_value();
+    if (cfg["parallelism"].uint64_value() > 0)
+        parallelism = cfg["parallelism"].uint64_value();
+    if (!cfg["reopen_prob"].is_null())
+        reopen_prob = cfg["reopen_prob"].uint64_value();
+    if (!cfg["get_prob"].is_null())
+        get_prob = cfg["get_prob"].uint64_value();
+    if (!cfg["add_prob"].is_null())
+        add_prob = cfg["add_prob"].uint64_value();
+    if (!cfg["update_prob"].is_null())
+        update_prob = cfg["update_prob"].uint64_value();
+    if (!cfg["del_prob"].is_null())
+        del_prob = cfg["del_prob"].uint64_value();
+    if (!cfg["list_prob"].is_null())
+        list_prob = cfg["list_prob"].uint64_value();
+    if (!cfg["min_key_len"].is_null())
+        min_key_len = cfg["min_key_len"].uint64_value();
+    if (cfg["max_key_len"].uint64_value() > 0)
+        max_key_len = cfg["max_key_len"].uint64_value();
+    if (!cfg["min_value_len"].is_null())
+        min_value_len = cfg["min_value_len"].uint64_value();
+    if (cfg["max_value_len"].uint64_value() > 0)
+        max_value_len = cfg["max_value_len"].uint64_value();
+    if (!cfg["min_list_count"].is_null())
+        min_list_count = cfg["min_list_count"].uint64_value();
+    if (!cfg["max_list_count"].is_null())
+        max_list_count = cfg["max_list_count"].uint64_value();
+    if (!cfg["print_stats"].is_null())
+        print_stats_interval = cfg["print_stats"].uint64_value();
+    if (!cfg["json"].is_null())
+        json_output = true;
+    if (!cfg["stop_on_error"].is_null())
+    {
+        // parse_args() stores every option as a string, so bool_value() alone may never
+        // see a real boolean here - accept a non-zero numeric value ("1") as well
+        stop_on_error = cfg["stop_on_error"].bool_value() || cfg["stop_on_error"].uint64_value() > 0;
+    }
+    if (!cfg["kv_memory_limit"].is_null())
+        kv_cfg["kv_memory_limit"] = cfg["kv_memory_limit"];
+    if (!cfg["kv_allocate_blocks"].is_null())
+        kv_cfg["kv_allocate_blocks"] = cfg["kv_allocate_blocks"];
+    if (!cfg["kv_evict_max_misses"].is_null())
+        kv_cfg["kv_evict_max_misses"] = cfg["kv_evict_max_misses"];
+    if (!cfg["kv_evict_attempts_per_level"].is_null())
+        kv_cfg["kv_evict_attempts_per_level"] = cfg["kv_evict_attempts_per_level"];
+    if (!cfg["kv_evict_unused_age"].is_null())
+        kv_cfg["kv_evict_unused_age"] = cfg["kv_evict_unused_age"];
+    if (!cfg["kv_log_level"].is_null())
+    {
+        log_level = cfg["kv_log_level"].uint64_value();
+        trace = log_level >= 10;
+        kv_cfg["kv_log_level"] = cfg["kv_log_level"];
+    }
+    total_prob = reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob;
+    if (!total_prob)
+    {
+        // loop() computes lrand48() % total_prob, which must not be a division by zero
+        fprintf(stderr, "ERROR: at least one of reopen_prob, get_prob, add_prob, update_prob, del_prob, list_prob must be non-zero\n");
+        exit(1);
+    }
+    stat.get.name = "get";
+    stat.add.name = "add";
+    stat.update.name = "update";
+    stat.del.name = "del";
+    stat.list.name = "list";
+}
+
+// Run the whole test: parse <cfg>, create the cluster client and the database,
+// open the index, drive the event loop until <finished> is set, print total
+// statistics and destroy everything which was created
+void kv_test_t::run(json11::Json cfg)
+{
+    // Seed the lrand48() generator used for dice rolls and random strings
+    srand48(time(NULL));
+    parse_config(cfg);
+    // Create client
+    ringloop = new ring_loop_t(512);
+    epmgr = new epoll_manager_t(ringloop);
+    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
+    db = new kv_dbw_t(cli);
+    // Load image metadata
+    while (!cli->is_ready())
+    {
+        ringloop->loop();
+        if (cli->is_ready())
+            break;
+        ringloop->wait();
+    }
+    // Run
+    // loop() does nothing while <reopening> is set, i.e. until the open callback is called
+    reopening = true;
+    db->open(inode_id, kv_cfg, [this](int res)
+    {
+        reopening = false;
+        if (res < 0)
+        {
+            // Nothing can be tested without the index
+            fprintf(stderr, "ERROR: Open index: %d (%s)\n", res, strerror(-res));
+            exit(1);
+        }
+        if (trace)
+            printf("Index opened\n");
+        ringloop->wakeup();
+    });
+    consumer.loop = [this]() { loop(); };
+    ringloop->register_consumer(&consumer);
+    // Periodic statistics; the interval is converted from seconds to milliseconds
+    if (print_stats_interval)
+        stat_timer_id = epmgr->tfd->set_timer(print_stats_interval*1000, true, [this](int) { print_stats(prev_stat, prev_stat_time); });
+    clock_gettime(CLOCK_REALTIME, &start_stat_time);
+    prev_stat_time = start_stat_time;
+    // Event loop: <finished> is set by loop()
+    while (!finished)
+    {
+        ringloop->loop();
+        if (!finished)
+            ringloop->wait();
+    }
+    if (stat_timer_id >= 0)
+        epmgr->tfd->clear_timer(stat_timer_id);
+    ringloop->unregister_consumer(&consumer);
+    // Print total stats
+    print_total_stats();
+    // Destroy the client
+    delete db;
+    db = NULL;
+    cli->flush();
+    delete cli;
+    delete epmgr;
+    delete ringloop;
+    // Reset the pointers so that the destructor does not free them again
+    cli = NULL;
+    epmgr = NULL;
+    ringloop = NULL;
+}
+
+// Alphabet of random strings. random_str() only selects indexes 0..63,
+// so the last character ('/') is never produced
+static const char *base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@+/";
+
+// Return a string of <len> characters picked from base64_chars with lrand48()
+std::string random_str(int len)
+{
+    std::string result(len, '\0');
+    for (char & ch: result)
+    {
+        ch = base64_chars[lrand48() % 64];
+    }
+    return result;
+}
+
+// Event loop consumer: keeps up to <parallelism> operations in flight until
+// <op_count> operations are done or <runtime_sec> seconds have passed.
+// Each iteration rolls a dice modulo <total_prob> and starts one of:
+// reopen, get, add, update, delete, list. Results are verified against the
+// reference map <values>; keys with a modification in flight (changing_keys)
+// are skipped because their state is not known until the operation completes.
+// Every completion callback wakes up the ring loop so that this function is
+// called again.
+void kv_test_t::loop()
+{
+    if (reopening)
+    {
+        return;
+    }
+    if (ops_done >= op_count)
+    {
+        finished = true;
+    }
+    // Runtime limit: stop sending new operations when the time is up and
+    // finish as soon as nothing is in flight anymore
+    bool time_is_up = false;
+    if (runtime_sec > 0)
+    {
+        timespec tv_now;
+        clock_gettime(CLOCK_REALTIME, &tv_now);
+        int64_t elapsed_usec = (tv_now.tv_sec - start_stat_time.tv_sec)*1000000 +
+            (tv_now.tv_nsec - start_stat_time.tv_nsec)/1000;
+        time_is_up = elapsed_usec >= 0 && (uint64_t)elapsed_usec >= runtime_sec*1000000;
+        if (time_is_up && !in_progress)
+        {
+            finished = true;
+        }
+    }
+    while (!finished && !time_is_up && ops_sent < op_count && in_progress < parallelism)
+    {
+        uint64_t dice = (lrand48() % total_prob);
+        if (dice < reopen_prob)
+        {
+            // close and reopen the index; no new operations are sent until it's reopened
+            reopening = true;
+            db->close([this]()
+            {
+                if (trace)
+                    printf("Index closed\n");
+                db->open(inode_id, kv_cfg, [this](int res)
+                {
+                    reopening = false;
+                    if (res < 0)
+                    {
+                        fprintf(stderr, "ERROR: Reopen index: %d (%s)\n", res, strerror(-res));
+                        finished = true;
+                        return;
+                    }
+                    if (trace)
+                        printf("Index reopened\n");
+                    ringloop->wakeup();
+                });
+            });
+            return;
+        }
+        else if (dice < reopen_prob+get_prob)
+        {
+            // get existing
+            // A random string is only used to pick an existing key from the reference map
+            auto key = random_str(max_key_len);
+            auto k_it = values.lower_bound(key);
+            if (k_it == values.end())
+                continue;
+            key = k_it->first;
+            if (changing_keys.find(key) != changing_keys.end())
+                continue;
+            in_progress++;
+            ops_sent++;
+            if (trace)
+                printf("get %s\n", key.c_str());
+            timespec tv_begin;
+            clock_gettime(CLOCK_REALTIME, &tv_begin);
+            db->get(key, [this, key, tv_begin](int res, const std::string & value)
+            {
+                add_stat(stat.get, tv_begin);
+                ops_done++;
+                in_progress--;
+                // The key is expected to be found only if it's still in the reference map
+                auto it = values.find(key);
+                if (res != (it == values.end() ? -ENOENT : 0))
+                {
+                    fprintf(stderr, "ERROR: get %s: %d (%s)\n", key.c_str(), res, strerror(-res));
+                    if (stop_on_error)
+                        exit(1);
+                }
+                else if (it != values.end() && value != it->second)
+                {
+                    fprintf(stderr, "ERROR: get %s: mismatch: %s vs %s\n", key.c_str(), value.c_str(), it->second.c_str());
+                    if (stop_on_error)
+                        exit(1);
+                }
+                ringloop->wakeup();
+            });
+        }
+        else if (dice < reopen_prob+get_prob+add_prob+update_prob)
+        {
+            bool is_add = false;
+            std::string key;
+            if (dice < reopen_prob+get_prob+add_prob)
+            {
+                // add
+                is_add = true;
+                uint64_t key_len = min_key_len + (max_key_len > min_key_len ? lrand48() % (max_key_len-min_key_len) : 0);
+                key = key_prefix + random_str(key_len) + key_suffix;
+            }
+            else
+            {
+                // update
+                key = random_str(max_key_len);
+                auto k_it = values.lower_bound(key);
+                if (k_it == values.end())
+                    continue;
+                key = k_it->first;
+            }
+            if (changing_keys.find(key) != changing_keys.end())
+                continue;
+            uint64_t value_len = min_value_len + (max_value_len > min_value_len ? lrand48() % (max_value_len-min_value_len) : 0);
+            auto value = random_str(value_len);
+            start_change(key);
+            ops_sent++;
+            in_progress++;
+            if (trace)
+                printf("set %s = %s\n", key.c_str(), value.c_str());
+            timespec tv_begin;
+            clock_gettime(CLOCK_REALTIME, &tv_begin);
+            db->set(key, value, [this, key, value, tv_begin, is_add](int res)
+            {
+                add_stat(is_add ? stat.add : stat.update, tv_begin);
+                stop_change(key);
+                ops_done++;
+                in_progress--;
+                if (res != 0)
+                {
+                    fprintf(stderr, "ERROR: set %s = %s: %d (%s)\n", key.c_str(), value.c_str(), res, strerror(-res));
+                    if (stop_on_error)
+                        exit(1);
+                }
+                else
+                {
+                    // Remember the new value only after a successful write
+                    values[key] = value;
+                }
+                ringloop->wakeup();
+            }, NULL);
+        }
+        else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob)
+        {
+            // delete
+            auto key = random_str(max_key_len);
+            auto k_it = values.lower_bound(key);
+            if (k_it == values.end())
+                continue;
+            key = k_it->first;
+            if (changing_keys.find(key) != changing_keys.end())
+                continue;
+            start_change(key);
+            ops_sent++;
+            in_progress++;
+            if (trace)
+                printf("del %s\n", key.c_str());
+            timespec tv_begin;
+            clock_gettime(CLOCK_REALTIME, &tv_begin);
+            db->del(key, [this, key, tv_begin](int res)
+            {
+                add_stat(stat.del, tv_begin);
+                stop_change(key);
+                ops_done++;
+                in_progress--;
+                if (res != 0)
+                {
+                    fprintf(stderr, "ERROR: del %s: %d (%s)\n", key.c_str(), res, strerror(-res));
+                    if (stop_on_error)
+                        exit(1);
+                }
+                else
+                {
+                    values.erase(key);
+                }
+                ringloop->wakeup();
+            }, NULL);
+        }
+        else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob)
+        {
+            // list
+            ops_sent++;
+            in_progress++;
+            auto key = random_str(max_key_len);
+            auto lst = new kv_test_listing_t;
+            auto k_it = values.lower_bound(key);
+            lst->count = min_list_count + (max_list_count > min_list_count ? lrand48() % (max_list_count-min_list_count) : 0);
+            // Start from the key prefix when the random key sorts before all known keys
+            const std::string start = (k_it == values.begin() ? key_prefix : key);
+            lst->handle = db->list_start(start);
+            lst->next_after = start;
+            // Keys which are already being changed can't be verified by this listing
+            lst->inflights = changing_keys;
+            listings.insert(lst);
+            if (trace)
+                printf("list from %s\n", start.c_str());
+            clock_gettime(CLOCK_REALTIME, &lst->tv_begin);
+            db->list_next(lst->handle, [this, lst](int res, const std::string & key, const std::string & value)
+            {
+                if (log_level >= 11)
+                    printf("list: %s = %s\n", key.c_str(), value.c_str());
+                if (res >= 0 && key_prefix.size() && (key.size() < key_prefix.size() ||
+                    key.substr(0, key_prefix.size()) != key_prefix))
+                {
+                    // stop at this key
+                    res = -ENOENT;
+                }
+                if (res < 0 || (lst->count > 0 && lst->done >= lst->count))
+                {
+                    add_stat(stat.list, lst->tv_begin);
+                    if (res == 0)
+                    {
+                        // ok (done >= count)
+                    }
+                    else if (res != -ENOENT)
+                    {
+                        fprintf(stderr, "ERROR: list: %d (%s)\n", res, strerror(-res));
+                        lst->error = true;
+                    }
+                    else
+                    {
+                        // End of listing: all remaining known keys which were not being changed are missing
+                        auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
+                        while (k_it != values.end())
+                        {
+                            while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
+                                k_it++;
+                            if (k_it != values.end())
+                            {
+                                fprintf(stderr, "ERROR: list: missing key %s\n", (k_it++)->first.c_str());
+                                lst->error = true;
+                            }
+                        }
+                    }
+                    if (lst->error && stop_on_error)
+                        exit(1);
+                    ops_done++;
+                    in_progress--;
+                    db->list_close(lst->handle);
+                    // Unregister the listing before freeing it to not look up a dangling pointer
+                    listings.erase(lst);
+                    delete lst;
+                    ringloop->wakeup();
+                }
+                else
+                {
+                    stat.list_keys++;
+                    // Do not check modified keys in listing
+                    // Listing may return their old or new state
+                    if ((!key_suffix.size() || key.size() >= key_suffix.size() &&
+                        key.substr(key.size()-key_suffix.size()) == key_suffix) &&
+                        lst->inflights.find(key) == lst->inflights.end())
+                    {
+                        lst->done++;
+                        // Advance through the reference map up to the listed key
+                        auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
+                        while (true)
+                        {
+                            while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
+                            {
+                                k_it++;
+                            }
+                            if (k_it == values.end() || k_it->first > key)
+                            {
+                                fprintf(stderr, "ERROR: list: extra key %s\n", key.c_str());
+                                lst->error = true;
+                                break;
+                            }
+                            else if (k_it->first < key)
+                            {
+                                fprintf(stderr, "ERROR: list: missing key %s\n", k_it->first.c_str());
+                                lst->error = true;
+                                lst->next_after = k_it->first;
+                                k_it++;
+                            }
+                            else
+                            {
+                                if (k_it->second != value)
+                                {
+                                    fprintf(stderr, "ERROR: list: mismatch: %s = %s but should be %s\n",
+                                        key.c_str(), value.c_str(), k_it->second.c_str());
+                                    lst->error = true;
+                                }
+                                lst->next_after = k_it->first;
+                                break;
+                            }
+                        }
+                    }
+                    // Request the next key; the callback is only passed on the first call
+                    db->list_next(lst->handle, NULL);
+                }
+            });
+        }
+    }
+}
+
+// Add the latency of one operation started at <tv_begin> and finished now to <stat>.
+// Samples with zero or negative measured duration are not counted
+void kv_test_t::add_stat(kv_test_lat_t & stat, timespec tv_begin)
+{
+    timespec tv_now;
+    clock_gettime(CLOCK_REALTIME, &tv_now);
+    auto sec_diff = tv_now.tv_sec - tv_begin.tv_sec;
+    auto nsec_diff = tv_now.tv_nsec - tv_begin.tv_nsec;
+    int64_t elapsed_us = sec_diff*1000000 + nsec_diff/1000;
+    if (elapsed_us <= 0)
+    {
+        return;
+    }
+    stat.usec += elapsed_us;
+    stat.count++;
+}
+
+// Print operation rates and average latencies accumulated since <prev_stat> /
+// <prev_stat_time>, either as a text line or as a JSON object (json_output),
+// then store the current counters and time into <prev_stat> / <prev_stat_time>.
+// Nothing is printed if no time has passed since <prev_stat_time>.
+void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time)
+{
+    timespec cur_stat_time;
+    clock_gettime(CLOCK_REALTIME, &cur_stat_time);
+    int64_t usec = (cur_stat_time.tv_sec - prev_stat_time.tv_sec)*1000000 +
+        (cur_stat_time.tv_nsec - prev_stat_time.tv_nsec)/1000;
+    if (usec > 0)
+    {
+        kv_test_lat_t *lats[] = { &stat.get, &stat.add, &stat.update, &stat.del, &stat.list };
+        kv_test_lat_t *prev[] = { &prev_stat.get, &prev_stat.add, &prev_stat.update, &prev_stat.del, &prev_stat.list };
+        const size_t lat_count = sizeof(lats)/sizeof(lats[0]);
+        if (!json_output)
+        {
+            char buf[128] = { 0 };
+            for (size_t i = 0; i < lat_count; i++)
+            {
+                // %ju instead of %lu: the counters are 64-bit on 32-bit platforms too
+                snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%ju us)", (lats[i]->count-prev[i]->count)*1000000.0/usec,
+                    lats[i]->name, (lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count > 0 ? lats[i]->count-prev[i]->count : 1));
+                // Pad every column with spaces to a fixed width
+                size_t k;
+                for (k = strlen(buf); k < strlen(lats[i]->name)+21; k++)
+                    buf[k] = ' ';
+                buf[k] = 0;
+                printf("%s", buf);
+            }
+            printf("\n");
+        }
+        else
+        {
+            int64_t runtime = (cur_stat_time.tv_sec - start_stat_time.tv_sec)*1000000 +
+                (cur_stat_time.tv_nsec - start_stat_time.tv_nsec)/1000;
+            printf("{\"runtime\":%.1f", (double)runtime/1000000.0);
+            for (size_t i = 0; i < lat_count; i++)
+            {
+                // Only operation types with new samples are included
+                if (lats[i]->count > prev[i]->count)
+                {
+                    printf(
+                        ",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%ju},\"total\":{\"count\":%ju,\"usec\":%ju}}",
+                        lats[i]->name, (lats[i]->count-prev[i]->count)*1000000.0/usec,
+                        (lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count),
+                        lats[i]->count, lats[i]->usec
+                    );
+                }
+            }
+            printf("}\n");
+        }
+    }
+    prev_stat = stat;
+    prev_stat_time = cur_stat_time;
+}
+
+// Print statistics for the whole run: the same output as print_stats(),
+// but relative to zero counters and to the start time of the test
+void kv_test_t::print_total_stats()
+{
+    if (json_output == false)
+    {
+        printf("Total:\n");
+    }
+    // Local copies are used because print_stats() overwrites its arguments
+    kv_test_stat_t zero_stat;
+    timespec since = this->start_stat_time;
+    print_stats(zero_stat, since);
+}
+
+// Mark <key> as being modified: every active listing stops verifying it,
+// and it is added to the set of keys with a change in flight
+void kv_test_t::start_change(const std::string & key)
+{
+    for (auto it = listings.begin(); it != listings.end(); ++it)
+    {
+        (*it)->inflights.insert(key);
+    }
+    changing_keys.insert(key);
+}
+
+// Mark the modification of <key> as finished. The key is only removed from
+// changing_keys; it stays in the inflights sets of listings which were active meanwhile
+void kv_test_t::stop_change(const std::string & key)
+{
+    changing_keys.erase(key);
+}
+
+// Program entry point: remember the executable name, make stdout and stderr
+// unbuffered, then run the stress test with the parsed command line
+int main(int narg, const char *args[])
+{
+    exe_name = args[0];
+    // _IONBF disables buffering so that output appears immediately
+    setvbuf(stdout, NULL, _IONBF, 0);
+    setvbuf(stderr, NULL, _IONBF, 0);
+    auto tester = new kv_test_t();
+    tester->run(kv_test_t::parse_args(narg, args));
+    delete tester;
+    return 0;
+}
--- a/src/malloc_or_die.h
+++ b/src/malloc_or_die.h
@ -11,7 +11,7 @@ inline void* memalign_or_die(size_t alignment, size_t size)
    void *buf = memalign(alignment, size);
    if (!buf)
    {
-        printf("Failed to allocate %lu bytes\n", size);
+        printf("Failed to allocate %zu bytes\n", size);
        exit(1);
    }
    return buf;
@ -22,7 +22,7 @@ inline void* malloc_or_die(size_t size)
    void *buf = malloc(size);
    if (!buf)
    {
-        printf("Failed to allocate %lu bytes\n", size);
+        printf("Failed to allocate %zu bytes\n", size);
        exit(1);
    }
    return buf;
@ -33,7 +33,7 @@ inline void* realloc_or_die(void *ptr, size_t size)
    void *buf = realloc(ptr, size);
    if (!buf)
    {
-        printf("Failed to allocate %lu bytes\n", size);
+        printf("Failed to allocate %zu bytes\n", size);
        exit(1);
    }
    return buf;
@ -44,7 +44,7 @@ inline void* calloc_or_die(size_t nmemb, size_t size)
    void *buf = calloc(nmemb, size);
    if (!buf)
    {
-        printf("Failed to allocate %lu bytes\n", size * nmemb);
+        printf("Failed to allocate %zu bytes\n", size * nmemb);
        exit(1);
    }
    return buf;
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@ -27,13 +27,13 @@ void osd_messenger_t::init()
        if (!rdma_context)
        {
            if (log_level > 0)
-                fprintf(stderr, "[OSD %lu] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
+                fprintf(stderr, "[OSD %ju] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
        }
        else
        {
            rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
                ? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
-            fprintf(stderr, "[OSD %lu] RDMA initialized successfully\n", osd_num);
+            fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
            fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
            tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
            {
@ -45,11 +45,12 @@ void osd_messenger_t::init()
 #endif
    keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
    {
-        std::vector<int> to_stop;
-        std::vector<osd_op_t*> to_ping;
-        for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
+        auto cl_it = clients.begin();
+        while (cl_it != clients.end())
        {
            auto cl = cl_it->second;
+            cl_it++;
+            auto peer_fd = cl->peer_fd;
            if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
            {
                // Do not run keepalive on regular clients
@ -61,8 +62,10 @@ void osd_messenger_t::init()
                if (!cl->ping_time_remaining)
                {
                    // Ping timed out, stop the client
-                    fprintf(stderr, "Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
-                    to_stop.push_back(cl->peer_fd);
+                    fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
+                    stop_client(peer_fd, true);
+                    // Restart iterator because it may be invalidated
+                    cl_it = clients.upper_bound(peer_fd);
                }
            }
            else if (cl->idle_time_remaining > 0)
@ -96,13 +99,15 @@ void osd_messenger_t::init()
                        delete op;
                        if (fail_fd >= 0)
                        {
-                            fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", fail_osd_num, fail_fd);
+                            fprintf(stderr, "Ping failed for OSD %ju (client %d), disconnecting peer\n", fail_osd_num, fail_fd);
                            stop_client(fail_fd, true);
                        }
                    };
-                    to_ping.push_back(op);
                    cl->ping_time_remaining = osd_ping_timeout;
                    cl->idle_time_remaining = osd_idle_timeout;
+                    outbox_push(op);
+                    // Restart iterator because it may be invalidated
+                    cl_it = clients.upper_bound(peer_fd);
                }
            }
            else
@ -110,15 +115,6 @@ void osd_messenger_t::init()
                cl->idle_time_remaining = osd_idle_timeout;
            }
        }
-        // Don't stop clients while a 'clients' iterator is still active
-        for (int peer_fd: to_stop)
-        {
-            stop_client(peer_fd, true);
-        }
-        for (auto op: to_ping)
-        {
-            outbox_push(op);
-        }
    });
 }

@ -257,7 +253,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
    clients[peer_fd] = new osd_client_t();
    if (log_level > 0)
    {
-        fprintf(stderr, "Connecting to OSD %lu at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
+        fprintf(stderr, "Connecting to OSD %ju at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
    }
    clients[peer_fd]->peer_addr = addr;
    clients[peer_fd]->peer_port = peer_port;
@ -323,7 +319,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
        // Stop client
        if (log_level > 0)
        {
-            fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
+            fprintf(stderr, "[OSD %ju] client %d disconnected\n", this->osd_num, peer_fd);
        }
        stop_client(peer_fd, true);
    }
@ -349,7 +345,7 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
    wp.connecting = false;
    if (peer_fd < 0)
    {
-        fprintf(stderr, "Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
+        fprintf(stderr, "Failed to connect to peer OSD %ju address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
        if (wp.address_changed)
        {
            wp.address_changed = false;
@ -376,7 +372,7 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
    }
    if (log_level > 0)
    {
-        fprintf(stderr, "[OSD %lu] Connected with peer OSD %lu (client %d)\n", osd_num, peer_osd, peer_fd);
+        fprintf(stderr, "[OSD %ju] Connected with peer OSD %ju (client %d)\n", osd_num, peer_osd, peer_fd);
    }
    wanted_peers.erase(peer_osd);
    repeer_pgs(peer_osd);
@ -422,7 +418,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
        if (op->reply.hdr.retval < 0)
        {
            err = true;
-            fprintf(stderr, "Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl->osd_num, op->reply.hdr.retval);
+            fprintf(stderr, "Failed to get config from OSD %ju (retval=%jd), disconnecting peer\n", cl->osd_num, op->reply.hdr.retval);
        }
        else
        {
@ -430,18 +426,18 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
            if (json_err != "")
            {
                err = true;
-                fprintf(stderr, "Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl->osd_num, json_err.c_str());
+                fprintf(stderr, "Failed to get config from OSD %ju: bad JSON: %s, disconnecting peer\n", cl->osd_num, json_err.c_str());
            }
            else if (config["osd_num"].uint64_value() != cl->osd_num)
            {
                err = true;
-                fprintf(stderr, "Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl->osd_num);
+                fprintf(stderr, "Connected to OSD %ju instead of OSD %ju, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl->osd_num);
            }
            else if (config["protocol_version"].uint64_value() != OSD_PROTOCOL_VERSION)
            {
                err = true;
                fprintf(
-                    stderr, "OSD %lu protocol version is %lu, but only version %u is supported.\n"
+                    stderr, "OSD %ju protocol version is %ju, but only version %u is supported.\n"
                    " If you need to upgrade from 0.5.x please request it via the issue tracker.\n",
                    cl->osd_num, config["protocol_version"].uint64_value(), OSD_PROTOCOL_VERSION
                );
@ -467,7 +463,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
                cl->rdma_conn->connect(&addr) != 0)
            {
                fprintf(
-                    stderr, "Failed to connect to OSD %lu (address %s) using RDMA\n",
+                    stderr, "Failed to connect to OSD %ju (address %s) using RDMA\n",
                    cl->osd_num, config["rdma_address"].string_value().c_str()
                );
                delete cl->rdma_conn;
@ -488,7 +484,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
                }
                if (log_level > 0)
                {
-                    fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
+                    fprintf(stderr, "Connected to OSD %ju using RDMA\n", cl->osd_num);
                }
                cl->peer_state = PEER_RDMA;
                tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
@ -520,7 +516,7 @@ void osd_messenger_t::accept_connections(int listen_fd)
    while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
    {
        assert(peer_fd != 0);
-        fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
+        fprintf(stderr, "[OSD %ju] new client %d: connection from %s\n", this->osd_num, peer_fd,
            addr_to_string(addr).c_str());
        fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
        int one = 1;
--- a/src/msgr_op.h
+++ b/src/msgr_op.h
@ -76,7 +76,7 @@ struct osd_op_buf_list_t
                buf = (iovec*)malloc(sizeof(iovec) * alloc);
                if (!buf)
                {
-                    fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
+                    fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
                    exit(1);
                }
                memcpy(buf, inline_buf, sizeof(iovec) * old);
@ -87,7 +87,7 @@ struct osd_op_buf_list_t
                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
                if (!buf)
                {
-                    fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
+                    fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
                    exit(1);
                }
            }
@ -109,7 +109,7 @@ struct osd_op_buf_list_t
                buf = (iovec*)malloc(sizeof(iovec) * alloc);
                if (!buf)
                {
-                    fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
+                    fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
                    exit(1);
                }
                memcpy(buf, inline_buf, sizeof(iovec)*old);
@ -120,7 +120,7 @@ struct osd_op_buf_list_t
                buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
                if (!buf)
                {
-                    fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
+                    fprintf(stderr, "Failed to allocate %u bytes\n", (int)sizeof(iovec) * alloc);
                    exit(1);
                }
            }
--- a/src/msgr_rdma.cpp
+++ b/src/msgr_rdma.cpp
@ -10,7 +10,7 @@ std::string msgr_rdma_address_t::to_string()
 {
    char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
    sprintf(
-        msg, "%04x:%06x:%06x:%016lx%016lx", lid, qpn, psn,
+        msg, "%04x:%06x:%06x:%016jx%016jx", lid, qpn, psn,
        htobe64(((uint64_t*)&gid)[0]), htobe64(((uint64_t*)&gid)[1])
    );
    return std::string(msg);
@ -20,7 +20,7 @@ bool msgr_rdma_address_t::from_string(const char *str, msgr_rdma_address_t *dest
 {
    uint64_t* gid = (uint64_t*)&dest->gid;
    int scanned = sscanf(
-        str, "%hx:%x:%x:%16lx%16lx", &dest->lid, &dest->qpn, &dest->psn, gid, gid+1
+        str, "%hx:%x:%x:%16jx%16jx", &dest->lid, &dest->qpn, &dest->psn, gid, gid+1
    );
    gid[0] = be64toh(gid[0]);
    gid[1] = be64toh(gid[1]);
@ -594,7 +594,7 @@ void osd_messenger_t::handle_rdma_events()
                fprintf(stderr, "RDMA work request failed for client %d", client_id);
                if (cl->osd_num)
                {
-                    fprintf(stderr, " (OSD %lu)", cl->osd_num);
+                    fprintf(stderr, " (OSD %ju)", cl->osd_num);
                }
                fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
                stop_client(client_id);
--- a/src/msgr_receive.cpp
+++ b/src/msgr_receive.cpp
@ -180,7 +180,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
            handle_op_hdr(cl);
        else
        {
-            fprintf(stderr, "Received garbage: magic=%lx id=%lu opcode=%lx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
+            fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
            stop_client(cl->peer_fd);
            return false;
        }
@ -297,7 +297,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
    if (req_it == cl->sent_ops.end())
    {
        // Command out of sync. Drop connection
-        fprintf(stderr, "Client %d command out of sync: id %lu\n", cl->peer_fd, cl->read_op->req.hdr.id);
+        fprintf(stderr, "Client %d command out of sync: id %ju\n", cl->peer_fd, cl->read_op->req.hdr.id);
        stop_client(cl->peer_fd);
        return false;
    }
@ -312,7 +312,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
        if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
        {
            // Check reply length to not overflow the buffer
-            fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %ld+%u\n",
+            fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %jd+%u\n",
                cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
            cl->sent_ops[op->req.hdr.id] = op;
            stop_client(cl->peer_fd);
--- a/src/msgr_stop.cpp
+++ b/src/msgr_stop.cpp
@ -61,11 +61,11 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
    {
        if (cl->osd_num)
        {
-            fprintf(stderr, "[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
+            fprintf(stderr, "[OSD %ju] Stopping client %d (OSD peer %ju)\n", osd_num, peer_fd, cl->osd_num);
        }
        else
        {
-            fprintf(stderr, "[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
+            fprintf(stderr, "[OSD %ju] Stopping client %d (regular client)\n", osd_num, peer_fd);
        }
    }
    // First set state to STOPPED so another stop_client() call doesn't try to free it again
--- a/src/nbd_proxy.cpp
+++ b/src/nbd_proxy.cpp
@ -738,7 +738,7 @@ protected:
            }
            uint64_t handle = *((uint64_t*)cur_req.handle);
 #ifdef DEBUG
-            printf("request %lx +%x %lx\n", be64toh(cur_req.from), be32toh(cur_req.len), handle);
+            printf("request %jx +%x %jx\n", be64toh(cur_req.from), be32toh(cur_req.len), handle);
 #endif
            void *buf = NULL;
            cluster_op_t *op = new cluster_op_t;
@ -759,7 +759,7 @@ protected:
            op->callback = [this, buf, handle](cluster_op_t *op)
            {
 #ifdef DEBUG
-                printf("reply %lx e=%d\n", handle, op->retval);
+                printf("reply %jx e=%d\n", handle, op->retval);
 #endif
                nbd_reply *reply = (nbd_reply*)buf;
                reply->magic = htobe32(NBD_REPLY_MAGIC);
@ -769,7 +769,7 @@ protected:
                if (op->retval < 0 || op->opcode != OSD_OP_READ)
                    to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) });
                else
-                    to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) + op->len });
+                    to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) + (size_t)op->len });
                to_free.push_back(buf);
                delete op;
                ringloop->wakeup();
--- a/src/nfs_block.cpp
+++ b/src/nfs_block.cpp
@ -1,23 +1,18 @@
 // Copyright (c) Vitaliy Filippov, 2019+
 // License: VNPL-1.1 (see README.md for details)
 //
-// NFS connection handler for NFS proxy
+// NFS proxy over Vitastor block images

 #include <sys/time.h>

 #include "str_util.h"

 #include "nfs_proxy.h"
-
+#include "nfs_common.h"
+#include "nfs_block.h"
 #include "nfs/nfs.h"
-
 #include "cli.h"

-#define TRUE 1
-#define FALSE 0
-
-#define MAX_REQUEST_SIZE 128*1024*1024
-
 static unsigned len_pad4(unsigned len)
 {
    return len + (len&3 ? 4-(len&3) : 0);
@ -28,10 +23,10 @@ static std::string get_inode_name(nfs_client_t *self, diropargs3 & what)
    // Get name
    std::string dirhash = what.dir;
    std::string dir;
-    if (dirhash != "roothandle")
+    if (dirhash != NFS_ROOT_HANDLE)
    {
-        auto dir_it = self->parent->dir_by_hash.find(dirhash);
-        if (dir_it != self->parent->dir_by_hash.end())
+        auto dir_it = self->parent->blockfs->dir_by_hash.find(dirhash);
+        if (dir_it != self->parent->blockfs->dir_by_hash.end())
            dir = dir_it->second;
        else
            return "";
@ -42,24 +37,9 @@ static std::string get_inode_name(nfs_client_t *self, diropargs3 & what)
        : self->parent->name_prefix+name);
 }

-static nfsstat3 vitastor_nfs_map_err(int err)
-{
-    return (err == EINVAL ? NFS3ERR_INVAL
-        : (err == ENOENT ? NFS3ERR_NOENT
-        : (err == ENOSPC ? NFS3ERR_NOSPC
-        : (err == EEXIST ? NFS3ERR_EXIST
-        : (err == EIO ? NFS3ERR_IO : (err ? NFS3ERR_IO : NFS3_OK))))));
-}
-
-static int nfs3_null_proc(void *opaque, rpc_op_t *rop)
-{
-    rpc_queue_reply(rop);
-    return 0;
-}
-
 static fattr3 get_dir_attributes(nfs_client_t *self, std::string dir)
 {
-    auto & dinf = self->parent->dir_info.at(dir);
+    auto & dinf = self->parent->blockfs->dir_info.at(dir);
    return (fattr3){
        .type = NF3DIR,
        .mode = 0755,
@ -108,7 +88,7 @@ static fattr3 get_file_attributes(nfs_client_t *self, inode_t inode_num)
    };
 }

-static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    GETATTR3args *args = (GETATTR3args*)rop->request;
@ -116,12 +96,12 @@ static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
    bool is_dir = false;
    std::string dirhash = args->object;
    std::string dir;
-    if (args->object == "roothandle")
+    if (args->object == NFS_ROOT_HANDLE)
        is_dir = true;
    else
    {
-        auto dir_it = self->parent->dir_by_hash.find(dirhash);
-        if (dir_it != self->parent->dir_by_hash.end())
+        auto dir_it = self->parent->blockfs->dir_by_hash.find(dirhash);
+        if (dir_it != self->parent->blockfs->dir_by_hash.end())
        {
            is_dir = true;
            dir = dir_it->second;
@ -140,8 +120,8 @@ static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
    else
    {
        uint64_t inode_num = 0;
-        auto inode_num_it = self->parent->inode_by_hash.find(dirhash);
-        if (inode_num_it != self->parent->inode_by_hash.end())
+        auto inode_num_it = self->parent->blockfs->inode_by_hash.find(dirhash);
+        if (inode_num_it != self->parent->blockfs->inode_by_hash.end())
            inode_num = inode_num_it->second;
        auto inode_it = self->parent->cli->st_cli.inode_config.find(inode_num);
        if (inode_num && inode_it != self->parent->cli->st_cli.inode_config.end())
@ -179,16 +159,16 @@ static int nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    SETATTR3args *args = (SETATTR3args*)rop->request;
    SETATTR3res *reply = (SETATTR3res*)rop->reply;
    std::string handle = args->object;
-    auto ino_it = self->parent->inode_by_hash.find(handle);
-    if (ino_it == self->parent->inode_by_hash.end())
+    auto ino_it = self->parent->blockfs->inode_by_hash.find(handle);
+    if (ino_it == self->parent->blockfs->inode_by_hash.end())
    {
-        if (handle == "roothandle" || self->parent->dir_by_hash.find(handle) != self->parent->dir_by_hash.end())
+        if (handle == NFS_ROOT_HANDLE || self->parent->blockfs->dir_by_hash.find(handle) != self->parent->blockfs->dir_by_hash.end())
        {
            if (args->new_attributes.size.set_it)
            {
@ -228,7 +208,7 @@ static int nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    LOOKUP3args *args = (LOOKUP3args*)rop->request;
@ -255,8 +235,8 @@ static int nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
                return 0;
            }
        }
-        auto dir_it = self->parent->dir_info.find(full_name);
-        if (dir_it != self->parent->dir_info.end())
+        auto dir_it = self->parent->blockfs->dir_info.find(full_name);
+        if (dir_it != self->parent->blockfs->dir_info.end())
        {
            *reply = (LOOKUP3res){
                .status = NFS3_OK,
@ -277,7 +257,7 @@ static int nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_access_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_access_proc(void *opaque, rpc_op_t *rop)
 {
    //nfs_client_t *self = (nfs_client_t*)opaque;
    ACCESS3args *args = (ACCESS3args*)rop->request;
@ -292,7 +272,7 @@ static int nfs3_access_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
 {
    //nfs_client_t *self = (nfs_client_t*)opaque;
    //READLINK3args *args = (READLINK3args*)rop->request;
@ -303,14 +283,14 @@ static int nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_read_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_read_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    READ3args *args = (READ3args*)rop->request;
    READ3res *reply = (READ3res*)rop->reply;
    std::string handle = args->file;
-    auto ino_it = self->parent->inode_by_hash.find(handle);
-    if (ino_it == self->parent->inode_by_hash.end())
+    auto ino_it = self->parent->blockfs->inode_by_hash.find(handle);
+    if (ino_it == self->parent->blockfs->inode_by_hash.end())
    {
        *reply = (READ3res){ .status = NFS3ERR_NOENT };
        rpc_queue_reply(rop);
@ -367,14 +347,14 @@ static int nfs3_read_proc(void *opaque, rpc_op_t *rop)

 static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf);

-static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_write_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    WRITE3args *args = (WRITE3args*)rop->request;
    WRITE3res *reply = (WRITE3res*)rop->reply;
    std::string handle = args->file;
-    auto ino_it = self->parent->inode_by_hash.find(handle);
-    if (ino_it == self->parent->inode_by_hash.end())
+    auto ino_it = self->parent->blockfs->inode_by_hash.find(handle);
+    if (ino_it == self->parent->blockfs->inode_by_hash.end())
    {
        *reply = (WRITE3res){ .status = NFS3ERR_NOENT };
        rpc_queue_reply(rop);
@ -480,8 +460,8 @@ static void complete_extend_write(nfs_client_t *self, rpc_op_t *rop, inode_t ino

 static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size, int err)
 {
-    auto ext_it = self->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 });
-    while (ext_it != self->extend_writes.end() &&
+    auto ext_it = self->parent->blockfs->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 });
+    while (ext_it != self->parent->blockfs->extend_writes.end() &&
        ext_it->first.inode == inode &&
        ext_it->first.new_size <= new_size)
    {
@ -490,7 +470,7 @@ static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t n
        {
            complete_extend_write(self, ext_it->second.rop, inode, ext_it->second.write_res < 0
                ? ext_it->second.write_res : ext_it->second.resize_res);
-            self->extend_writes.erase(ext_it++);
+            self->parent->blockfs->extend_writes.erase(ext_it++);
        }
        else
            ext_it++;
@ -500,7 +480,7 @@ static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t n
 static void extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size)
 {
    // Send an extend request
-    auto & ext = self->extends[inode];
+    auto & ext = self->parent->blockfs->extends[inode];
    ext.cur_extend = new_size;
    auto inode_it = self->parent->cli->st_cli.inode_config.find(inode);
    if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
@ -514,7 +494,7 @@ static void extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size)
            { "force_size", true },
        }), [=](const cli_result_t & r)
        {
-            auto & ext = self->extends[inode];
+            auto & ext = self->parent->blockfs->extends[inode];
            if (r.err)
            {
                fprintf(stderr, "Error extending inode %lu to %lu bytes: %s\n", inode, new_size, r.text.c_str());
@ -548,7 +528,7 @@ static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend
    {
        auto inode = op->inode;
        int write_res = op->retval < 0 ? op->retval : (op->retval != op->len ? -ERANGE : 0);
-        if (ewr_it == self->extend_writes.end())
+        if (ewr_it == self->parent->blockfs->extend_writes.end())
        {
            complete_extend_write(self, rop, inode, write_res);
        }
@ -558,7 +538,7 @@ static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend
            if (ewr_it->second.resize_res <= 0)
            {
                complete_extend_write(self, rop, inode, write_res < 0 ? write_res : ewr_it->second.resize_res);
-                self->extend_writes.erase(ewr_it);
+                self->parent->blockfs->extend_writes.erase(ewr_it);
            }
        }
    };
@ -572,7 +552,7 @@ static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode,
    if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
        inode_it->second.size < new_size)
    {
-        auto ewr_it = self->extend_writes.emplace((extend_size_t){
+        auto ewr_it = self->parent->blockfs->extend_writes.emplace((extend_size_t){
            .inode = inode,
            .new_size = new_size,
        }, (extend_write_t){
@ -580,7 +560,7 @@ static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode,
            .resize_res = 1,
            .write_res = 1,
        });
-        auto & ext = self->extends[inode];
+        auto & ext = self->parent->blockfs->extends[inode];
        if (ext.cur_extend > 0)
        {
            // Already resizing, just wait
@ -595,11 +575,11 @@ static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode,
    }
    else
    {
-        nfs_do_write(self, self->extend_writes.end(), rop, inode, offset, count, buf);
+        nfs_do_write(self, self->parent->blockfs->extend_writes.end(), rop, inode, offset, count, buf);
    }
 }

-static int nfs3_create_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_create_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    CREATE3args *args = (CREATE3args*)rop->request;
@ -650,7 +630,7 @@ static int nfs3_create_proc(void *opaque, rpc_op_t *rop)
    return 1;
 }

-static int nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    MKDIR3args *args = (MKDIR3args*)rop->request;
@ -669,19 +649,19 @@ static int nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
        rpc_queue_reply(rop);
        return 0;
    }
-    auto dir_id_it = self->parent->dir_info.find(full_name);
-    if (dir_id_it != self->parent->dir_info.end())
+    auto dir_id_it = self->parent->blockfs->dir_info.find(full_name);
+    if (dir_id_it != self->parent->blockfs->dir_info.end())
    {
        *reply = (MKDIR3res){ .status = NFS3ERR_EXIST };
        rpc_queue_reply(rop);
        return 0;
    }
    // FIXME: Persist empty directories in some etcd keys, like /vitastor/dir/...
-    self->parent->dir_info[full_name] = (nfs_dir_t){
-        .id = self->parent->next_dir_id++,
+    self->parent->blockfs->dir_info[full_name] = (nfs_dir_t){
+        .id = self->parent->blockfs->next_dir_id++,
        .mod_rev = 0,
    };
-    self->parent->dir_by_hash["S"+base64_encode(sha256(full_name))] = full_name;
+    self->parent->blockfs->dir_by_hash["S"+base64_encode(sha256(full_name))] = full_name;
    *reply = (MKDIR3res){
        .status = NFS3_OK,
        .resok = (MKDIR3resok){
@ -700,7 +680,7 @@ static int nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
 {
 //    nfs_client_t *self = (nfs_client_t*)opaque;
 //    SYMLINK3args *args = (SYMLINK3args*)rop->request;
@ -711,7 +691,7 @@ static int nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
 {
 //    nfs_client_t *self = (nfs_client_t*)opaque;
 //    MKNOD3args *args = (MKNOD3args*)rop->request;
@ -722,7 +702,7 @@ static int nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
    return 0;
 }

-static int nfs3_remove_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_remove_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    REMOVE3res *reply = (REMOVE3res*)rop->reply;
@ -752,7 +732,7 @@ static int nfs3_remove_proc(void *opaque, rpc_op_t *rop)
    return 1;
 }

-static int nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    RMDIR3args *args = (RMDIR3args*)rop->request;
@ -764,8 +744,8 @@ static int nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
        rpc_queue_reply(rop);
        return 0;
    }
-    auto dir_it = self->parent->dir_info.find(full_name);
-    if (dir_it == self->parent->dir_info.end())
+    auto dir_it = self->parent->blockfs->dir_info.find(full_name);
+    if (dir_it == self->parent->blockfs->dir_info.end())
    {
        *reply = (RMDIR3res){ .status = NFS3ERR_NOENT };
        rpc_queue_reply(rop);
@ -781,8 +761,8 @@ static int nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
            return 0;
        }
    }
-    self->parent->dir_by_hash.erase("S"+base64_encode(sha256(full_name)));
-    self->parent->dir_info.erase(dir_it);
+    self->parent->blockfs->dir_by_hash.erase("S"+base64_encode(sha256(full_name)));
+    self->parent->blockfs->dir_info.erase(dir_it);
    *reply = (RMDIR3res){ .status = NFS3_OK };
    rpc_queue_reply(rop);
    return 0;
@ -811,12 +791,12 @@ static int continue_dir_rename(nfs_dir_rename_state *rename_st)
    if (!rename_st->items.size())
    {
        // old dir
-        auto old_info = self->parent->dir_info.at(rename_st->old_name);
-        self->parent->dir_info.erase(rename_st->old_name);
-        self->parent->dir_by_hash.erase("S"+base64_encode(sha256(rename_st->old_name)));
+        auto old_info = self->parent->blockfs->dir_info.at(rename_st->old_name);
+        self->parent->blockfs->dir_info.erase(rename_st->old_name);
+        self->parent->blockfs->dir_by_hash.erase("S"+base64_encode(sha256(rename_st->old_name)));
        // new dir
-        self->parent->dir_info[rename_st->new_name] = old_info;
-        self->parent->dir_by_hash["S"+base64_encode(sha256(rename_st->new_name))] = rename_st->new_name;
+        self->parent->blockfs->dir_info[rename_st->new_name] = old_info;
+        self->parent->blockfs->dir_by_hash["S"+base64_encode(sha256(rename_st->new_name))] = rename_st->new_name;
        RENAME3res *reply = (RENAME3res*)rename_st->rop->reply;
        *reply = (RENAME3res){
            .status = NFS3_OK,
@ -853,7 +833,7 @@ static int continue_dir_rename(nfs_dir_rename_state *rename_st)

 static void nfs_do_rename(nfs_client_t *self, rpc_op_t *rop, std::string old_name, std::string new_name);

-static int nfs3_rename_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_rename_proc(void *opaque, rpc_op_t *rop)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    RENAME3args *args = (RENAME3args*)rop->request;
@ -866,8 +846,8 @@ static int nfs3_rename_proc(void *opaque, rpc_op_t *rop)
        rpc_queue_reply(rop);
        return 0;
    }
-    bool old_is_dir = self->parent->dir_info.find(old_name) != self->parent->dir_info.end();
-    bool new_is_dir = self->parent->dir_info.find(new_name) != self->parent->dir_info.end();
+    bool old_is_dir = self->parent->blockfs->dir_info.find(old_name) != self->parent->blockfs->dir_info.end();
+    bool new_is_dir = self->parent->blockfs->dir_info.find(new_name) != self->parent->blockfs->dir_info.end();
    bool old_is_file = false, new_is_file = false;
    for (auto & ic: self->parent->cli->st_cli.inode_config)
    {
@ -948,7 +928,7 @@ static void nfs_do_rename(nfs_client_t *self, rpc_op_t *rop, std::string old_nam
    });
 }

-static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
+static int block_nfs3_link_proc(void *opaque, rpc_op_t *rop)
 {
    //nfs_client_t *self = (nfs_client_t*)opaque;
    //LINK3args *args = (LINK3args*)rop->request;
@ -962,7 +942,7 @@ static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
 static void fill_dir_entry(nfs_client_t *self, rpc_op_t *rop,
    std::map<std::string, nfs_dir_t>::iterator dir_id_it, struct entryplus3 *entry, bool is_plus)
 {
-    if (dir_id_it == self->parent->dir_info.end())
+    if (dir_id_it == self->parent->blockfs->dir_info.end())
    {
        return;
    }
@ -980,7 +960,7 @@ static void fill_dir_entry(nfs_client_t *self, rpc_op_t *rop,
    }
 }

-static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
+static void block_nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
 {
    nfs_client_t *self = (nfs_client_t*)opaque;
    READDIRPLUS3args plus_args;
@ -999,10 +979,10 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
    }
    std::string dirhash = args->dir;
    std::string dir;
-    if (dirhash != "roothandle")
+    if (dirhash != NFS_ROOT_HANDLE)
    {
-        auto dir_it = self->parent->dir_by_hash.find(dirhash);
-        if (dir_it != self->parent->dir_by_hash.end())
+        auto dir_it = self->parent->blockfs->dir_by_hash.find(dirhash);
+        if (dir_it != self->parent->blockfs->dir_by_hash.end())
            dir = dir_it->second;
    }
    std::string prefix = dir.size() ? dir+"/" : self->parent->name_prefix;
@ -1043,12 +1023,12 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
        }
        else
        {
-            // skip directories, they will be added from dir_info
+            // skip directories, they will be added from blockfs->dir_info
        }
    }
-    // Add directories from dir_info
-    for (auto dir_id_it = self->parent->dir_info.lower_bound(prefix);
-        dir_id_it != self->parent->dir_info.end(); dir_id_it++)
+    // Add directories from blockfs->dir_info
+    for (auto dir_id_it = self->parent->blockfs->dir_info.lower_bound(prefix);
+        dir_id_it != self->parent->blockfs->dir_info.end(); dir_id_it++)
    {
        if (prefix != "" && dir_id_it->first.substr(0, prefix.size()) != prefix)
            break;
@ -1061,12 +1041,12 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
    }
    // Add . and ..
    {
-        auto dir_id_it = self->parent->dir_info.find(dir);
+        auto dir_id_it = self->parent->blockfs->dir_info.find(dir);
        fill_dir_entry(self, rop, dir_id_it, &entries["."], is_plus);
        auto sl = dir.rfind("/");
        if (sl != std::string::npos)
        {
-            auto dir_id_it = self->parent->dir_info.find(dir.substr(0, sl));
+            auto dir_id_it = self->parent->blockfs->dir_info.find(dir.substr(0, sl));
            fill_dir_entry(self, rop, dir_id_it, &entries[".."], is_plus);
        }
    }
@ -1147,7 +1127,7 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
    {
        READDIRPLUS3res *reply = (READDIRPLUS3res*)rop->reply;
        *reply = { .status = NFS3_OK };
-        *(uint64_t*)(reply->resok.cookieverf) = self->parent->dir_info.at(dir).mod_rev;
+        *(uint64_t*)(reply->resok.cookieverf) = self->parent->blockfs->dir_info.at(dir).mod_rev;
        reply->resok.reply.entries = entries.size() ? &entries.begin()->second : NULL;
        reply->resok.reply.eof = eof;
    }
@ -1155,250 +1135,123 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
    {
        READDIR3res *reply = (READDIR3res*)rop->reply;
        *reply = { .status = NFS3_OK };
-        *(uint64_t*)(reply->resok.cookieverf) = self->parent->dir_info.at(dir).mod_rev;
+        *(uint64_t*)(reply->resok.cookieverf) = self->parent->blockfs->dir_info.at(dir).mod_rev;
        reply->resok.reply.entries = entries.size() ? (entry3*)&entries.begin()->second : NULL;
        reply->resok.reply.eof = eof;
    }
    rpc_queue_reply(rop);
 }

-static int nfs3_readdir_proc(void *opaque, rpc_op_t *rop)
+// NFS3 READDIR handler of the block-image backend.
+// Thin wrapper: forwards to block_nfs3_readdir_common() with is_plus=false
+// (the common function queues the reply itself) and returns 0.
+static int block_nfs3_readdir_proc(void *opaque, rpc_op_t *rop)
 {
-    nfs3_readdir_common(opaque, rop, false);
+    block_nfs3_readdir_common(opaque, rop, false);
     return 0;
 }

-static int nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop)
+// NFS3 READDIRPLUS handler of the block-image backend.
+// Thin wrapper: forwards to block_nfs3_readdir_common() with is_plus=true
+// (the common function queues the reply itself) and returns 0.
+static int block_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop)
 {
-    nfs3_readdir_common(opaque, rop, true);
+    block_nfs3_readdir_common(opaque, rop, true);
     return 0;
 }

-// Get file system statistics
-static int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
+// Initialise the block-FS lookup state and hook it up to inode metadata changes.
+// - registers the root directory "" with id 1 and the current time as mtime
+// - installs proxy->cli->st_cli.on_inode_change_hook (asserted to be unset before),
+//   which keeps dir_info, dir_by_hash, inode_by_hash and hash_by_inode up to date
+// NOTE(review): the hook captures `this` and `proxy` by pointer and is stored in st_cli,
+// so both presumably have to outlive the client - confirm against the owner of block_fs_state_t
+void block_fs_state_t::init(nfs_proxy_t *proxy)
 {
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    //FSSTAT3args *args = (FSSTAT3args*)rop->request;
-    FSSTAT3res *reply = (FSSTAT3res*)rop->reply;
-    uint64_t tbytes = 0, fbytes = 0;
-    auto pst_it = self->parent->pool_stats.find(self->parent->default_pool_id);
-    if (pst_it != self->parent->pool_stats.end())
-    {
-        auto ttb = pst_it->second["total_raw_tb"].number_value();
-        auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
-        tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
-        fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
-    }
-    *reply = (FSSTAT3res){
-        .status = NFS3_OK,
-        .resok = (FSSTAT3resok){
-            .obj_attributes = {
-                .attributes_follow = 1,
-                .attributes = get_dir_attributes(self, ""),
-            },
-            .tbytes = tbytes, // total bytes
-            .fbytes = fbytes, // free bytes
-            .abytes = fbytes, // available bytes
-            .tfiles = (size3)(1 << 31), // maximum total files
-            .ffiles = (size3)(1 << 31), // free files
-            .afiles = (size3)(1 << 31), // available files
-            .invarsec = 0,
-        },
+    // We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
+    // The root directory always exists and has the fixed id 1
+    dir_info[""] = (nfs_dir_t){
+        .id = 1,
+        .mod_rev = 0,
     };
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-static int nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    FSINFO3args *args = (FSINFO3args*)rop->request;
-    FSINFO3res *reply = (FSINFO3res*)rop->reply;
-    if (args->fsroot != "roothandle")
+    clock_gettime(CLOCK_REALTIME, &dir_info[""].mtime);
+    // Only one hook may be installed, so nobody else must have set it before us
+    assert(proxy->cli->st_cli.on_inode_change_hook == NULL);
+    proxy->cli->st_cli.on_inode_change_hook = [this, proxy](inode_t changed_inode, bool removed)
     {
-        // Example error
-        *reply = (FSINFO3res){ .status = NFS3ERR_INVAL };
+        // Ignore inodes which have no configuration entry
+        auto inode_cfg_it = proxy->cli->st_cli.inode_config.find(changed_inode);
+        if (inode_cfg_it == proxy->cli->st_cli.inode_config.end())
+        {
+            return;
+        }
+        auto & inode_cfg = inode_cfg_it->second;
+        std::string full_name = inode_cfg.name;
+        // Ignore inodes whose name does not start with the configured name prefix
+        if (proxy->name_prefix != "" && full_name.substr(0, proxy->name_prefix.size()) != proxy->name_prefix)
+        {
+            return;
+        }
+        // Calculate directory modification time and revision (used as "cookie verifier")
+        timespec now;
+        clock_gettime(CLOCK_REALTIME, &now);
+        dir_info[""].mod_rev = dir_info[""].mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_info[""].mod_rev;
+        dir_info[""].mtime = now;
+        // Walk over every '/' after the prefix: each leading path component is a directory
+        // NOTE(review): find() returns size_t; the loop relies on npos becoming negative when stored in int
+        int pos = full_name.find('/', proxy->name_prefix.size());
+        while (pos >= 0)
+        {
+            std::string dir = full_name.substr(0, pos);
+            auto & dinf = dir_info[dir];
+            // id == 0 means the entry was just default-constructed, i.e. the directory is new
+            if (!dinf.id)
+                dinf.id = next_dir_id++;
+            dinf.mod_rev = dinf.mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dinf.mod_rev;
+            dinf.mtime = now;
+            dir_by_hash["S"+base64_encode(sha256(dir))] = dir;
+            pos = full_name.find('/', pos+1);
+        }
+        // Alter inode_by_hash
+        if (removed)
+        {
+            // Drop both directions of the handle <=> inode mapping
+            auto ino_it = hash_by_inode.find(changed_inode);
+            if (ino_it != hash_by_inode.end())
+            {
+                inode_by_hash.erase(ino_it->second);
+                hash_by_inode.erase(ino_it);
+            }
         }
         else
         {
-        // Fill info
-        *reply = (FSINFO3res){
-            .status = NFS3_OK,
-            .resok = (FSINFO3resok){
-                .obj_attributes = {
-                    .attributes_follow = 1,
-                    .attributes = get_dir_attributes(self, ""),
-                },
-                .rtmax = 128*1024*1024,
-                .rtpref = 128*1024*1024,
-                .rtmult = 4096,
-                .wtmax = 128*1024*1024,
-                .wtpref = 128*1024*1024,
-                .wtmult = 4096,
-                .dtpref = 128,
-                .maxfilesize = 0x7fffffffffffffff,
-                .time_delta = {
-                    .seconds = 1,
-                    .nseconds = 0,
-                },
-                .properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS,
-            },
-        };
-    }
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-static int nfs3_pathconf_proc(void *opaque, rpc_op_t *rop)
-{
-    //nfs_client_t *self = (nfs_client_t*)opaque;
-    PATHCONF3args *args = (PATHCONF3args*)rop->request;
-    PATHCONF3res *reply = (PATHCONF3res*)rop->reply;
-    if (args->object != "roothandle")
+            // The handle is derived from the full inode name, so a rename changes it
+            std::string hash = "S"+base64_encode(sha256(full_name));
+            auto hbi_it = hash_by_inode.find(changed_inode);
+            if (hbi_it != hash_by_inode.end() && hbi_it->second != hash)
             {
-        // Example error
-        *reply = (PATHCONF3res){ .status = NFS3ERR_INVAL };
+                // inode had a different name, remove old hash=>inode pointer
+                inode_by_hash.erase(hbi_it->second);
             }
-    else
-    {
-        // Fill info
-        bool_t x = FALSE;
-        *reply = (PATHCONF3res){
-            .status = NFS3_OK,
-            .resok = (PATHCONF3resok){
-                .obj_attributes = {
-                    // Without at least one reference to a non-constant value (local variable or something else),
-                    // with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here
-                    // FIXME: get rid of this after raising compiler requirement
-                    .attributes_follow = x,
-                },
-                .linkmax = 0,
-                .name_max = 255,
-                .no_trunc = TRUE,
-                .chown_restricted = FALSE,
-                .case_insensitive = FALSE,
-                .case_preserving = TRUE,
-            },
-        };
+            inode_by_hash[hash] = changed_inode;
+            hash_by_inode[changed_inode] = hash;
         }
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-static int nfs3_commit_proc(void *opaque, rpc_op_t *rop)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    //COMMIT3args *args = (COMMIT3args*)rop->request;
-    cluster_op_t *op = new cluster_op_t;
-    // fsync. we don't know how to fsync a single inode, so just fsync everything
-    op->opcode = OSD_OP_SYNC;
-    op->callback = [self, rop](cluster_op_t *op)
-    {
-        COMMIT3res *reply = (COMMIT3res*)rop->reply;
-        *reply = (COMMIT3res){ .status = vitastor_nfs_map_err(op->retval) };
-        *(uint64_t*)reply->resok.verf = self->parent->server_id;
-        rpc_queue_reply(rop);
     };
-    self->parent->cli->execute(op);
-    return 1;
 }

-static int mount3_mnt_proc(void *opaque, rpc_op_t *rop)
-{
-    //nfs_client_t *self = (nfs_client_t*)opaque;
-    //nfs_dirpath *args = (nfs_dirpath*)rop->request;
-    nfs_mountres3 *reply = (nfs_mountres3*)rop->reply;
-    u_int flavor = RPC_AUTH_NONE;
-    reply->fhs_status = MNT3_OK;
-    reply->mountinfo.fhandle = xdr_copy_string(rop->xdrs, "roothandle");
-    reply->mountinfo.auth_flavors.auth_flavors_len = 1;
-    reply->mountinfo.auth_flavors.auth_flavors_val = (u_int*)xdr_copy_string(rop->xdrs, (char*)&flavor, sizeof(u_int)).data;
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-static int mount3_dump_proc(void *opaque, rpc_op_t *rop)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    nfs_mountlist *reply = (nfs_mountlist*)rop->reply;
-    *reply = (struct nfs_mountbody*)malloc_or_die(sizeof(struct nfs_mountbody));
-    xdr_add_malloc(rop->xdrs, *reply);
-    (*reply)->ml_hostname = xdr_copy_string(rop->xdrs, "127.0.0.1");
-    (*reply)->ml_directory = xdr_copy_string(rop->xdrs, self->parent->export_root);
-    (*reply)->ml_next = NULL;
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-static int mount3_umnt_proc(void *opaque, rpc_op_t *rop)
-{
-    //nfs_client_t *self = (nfs_client_t*)opaque;
-    //nfs_dirpath *arg = (nfs_dirpath*)rop->request;
-    // do nothing
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-static int mount3_umntall_proc(void *opaque, rpc_op_t *rop)
-{
-    // do nothing
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-static int mount3_export_proc(void *opaque, rpc_op_t *rop)
-{
-    nfs_client_t *self = (nfs_client_t*)opaque;
-    nfs_exports *reply = (nfs_exports*)rop->reply;
-    *reply = (struct nfs_exportnode*)calloc_or_die(1, sizeof(struct nfs_exportnode) + sizeof(struct nfs_groupnode));
-    xdr_add_malloc(rop->xdrs, *reply);
-    (*reply)->ex_dir = xdr_copy_string(rop->xdrs, self->parent->export_root);
-    (*reply)->ex_groups = (struct nfs_groupnode*)(reply+1);
-    (*reply)->ex_groups->gr_name = xdr_copy_string(rop->xdrs, "127.0.0.1");
-    (*reply)->ex_groups->gr_next = NULL;
-    (*reply)->ex_next = NULL;
-    rpc_queue_reply(rop);
-    return 0;
-}
-
-nfs_client_t::nfs_client_t()
+// Register the NFSv3 and MOUNTv3 RPC handlers of the block-image backend in self->proc_table.
+// File and directory operations use the block_nfs3_* handlers; NULL, FSSTAT, FSINFO, PATHCONF,
+// COMMIT and all MOUNT procedures use the non-prefixed handlers.
+// Every entry carries `self` in its last field (presumably the opaque pointer handed to the handler).
+void nfs_block_procs(nfs_client_t *self)
 {
     struct rpc_service_proc_t pt[] = {
-        {NFS_PROGRAM, NFS_V3, NFS3_NULL,        nfs3_null_proc,        NULL,                            0,                        NULL,                           0,                       this},
-        {NFS_PROGRAM, NFS_V3, NFS3_GETATTR,     nfs3_getattr_proc,     (xdrproc_t)xdr_GETATTR3args,     sizeof(GETATTR3args),     (xdrproc_t)xdr_GETATTR3res,     sizeof(GETATTR3res),     this},
-        {NFS_PROGRAM, NFS_V3, NFS3_SETATTR,     nfs3_setattr_proc,     (xdrproc_t)xdr_SETATTR3args,     sizeof(SETATTR3args),     (xdrproc_t)xdr_SETATTR3res,     sizeof(SETATTR3res),     this},
-        {NFS_PROGRAM, NFS_V3, NFS3_LOOKUP,      nfs3_lookup_proc,      (xdrproc_t)xdr_LOOKUP3args,      sizeof(LOOKUP3args),      (xdrproc_t)xdr_LOOKUP3res,      sizeof(LOOKUP3res),      this},
-        {NFS_PROGRAM, NFS_V3, NFS3_ACCESS,      nfs3_access_proc,      (xdrproc_t)xdr_ACCESS3args,      sizeof(ACCESS3args),      (xdrproc_t)xdr_ACCESS3res,      sizeof(ACCESS3res),      this},
-        {NFS_PROGRAM, NFS_V3, NFS3_READLINK,    nfs3_readlink_proc,    (xdrproc_t)xdr_READLINK3args,    sizeof(READLINK3args),    (xdrproc_t)xdr_READLINK3res,    sizeof(READLINK3res),    this},
-        {NFS_PROGRAM, NFS_V3, NFS3_READ,        nfs3_read_proc,        (xdrproc_t)xdr_READ3args,        sizeof(READ3args),        (xdrproc_t)xdr_READ3res,        sizeof(READ3res),        this},
-        {NFS_PROGRAM, NFS_V3, NFS3_WRITE,       nfs3_write_proc,       (xdrproc_t)xdr_WRITE3args,       sizeof(WRITE3args),       (xdrproc_t)xdr_WRITE3res,       sizeof(WRITE3res),       this},
-        {NFS_PROGRAM, NFS_V3, NFS3_CREATE,      nfs3_create_proc,      (xdrproc_t)xdr_CREATE3args,      sizeof(CREATE3args),      (xdrproc_t)xdr_CREATE3res,      sizeof(CREATE3res),      this},
-        {NFS_PROGRAM, NFS_V3, NFS3_MKDIR,       nfs3_mkdir_proc,       (xdrproc_t)xdr_MKDIR3args,       sizeof(MKDIR3args),       (xdrproc_t)xdr_MKDIR3res,       sizeof(MKDIR3res),       this},
-        {NFS_PROGRAM, NFS_V3, NFS3_SYMLINK,     nfs3_symlink_proc,     (xdrproc_t)xdr_SYMLINK3args,     sizeof(SYMLINK3args),     (xdrproc_t)xdr_SYMLINK3res,     sizeof(SYMLINK3res),     this},
-        {NFS_PROGRAM, NFS_V3, NFS3_MKNOD,       nfs3_mknod_proc,       (xdrproc_t)xdr_MKNOD3args,       sizeof(MKNOD3args),       (xdrproc_t)xdr_MKNOD3res,       sizeof(MKNOD3res),       this},
-        {NFS_PROGRAM, NFS_V3, NFS3_REMOVE,      nfs3_remove_proc,      (xdrproc_t)xdr_REMOVE3args,      sizeof(REMOVE3args),      (xdrproc_t)xdr_REMOVE3res,      sizeof(REMOVE3res),      this},
-        {NFS_PROGRAM, NFS_V3, NFS3_RMDIR,       nfs3_rmdir_proc,       (xdrproc_t)xdr_RMDIR3args,       sizeof(RMDIR3args),       (xdrproc_t)xdr_RMDIR3res,       sizeof(RMDIR3res),       this},
-        {NFS_PROGRAM, NFS_V3, NFS3_RENAME,      nfs3_rename_proc,      (xdrproc_t)xdr_RENAME3args,      sizeof(RENAME3args),      (xdrproc_t)xdr_RENAME3res,      sizeof(RENAME3res),      this},
-        {NFS_PROGRAM, NFS_V3, NFS3_LINK,        nfs3_link_proc,        (xdrproc_t)xdr_LINK3args,        sizeof(LINK3args),        (xdrproc_t)xdr_LINK3res,        sizeof(LINK3res),        this},
-        {NFS_PROGRAM, NFS_V3, NFS3_READDIR,     nfs3_readdir_proc,     (xdrproc_t)xdr_READDIR3args,     sizeof(READDIR3args),     (xdrproc_t)xdr_READDIR3res,     sizeof(READDIR3res),     this},
-        {NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS, nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), this},
-        {NFS_PROGRAM, NFS_V3, NFS3_FSSTAT,      nfs3_fsstat_proc,      (xdrproc_t)xdr_FSSTAT3args,      sizeof(FSSTAT3args),      (xdrproc_t)xdr_FSSTAT3res,      sizeof(FSSTAT3res),      this},
-        {NFS_PROGRAM, NFS_V3, NFS3_FSINFO,      nfs3_fsinfo_proc,      (xdrproc_t)xdr_FSINFO3args,      sizeof(FSINFO3args),      (xdrproc_t)xdr_FSINFO3res,      sizeof(FSINFO3res),      this},
-        {NFS_PROGRAM, NFS_V3, NFS3_PATHCONF,    nfs3_pathconf_proc,    (xdrproc_t)xdr_PATHCONF3args,    sizeof(PATHCONF3args),    (xdrproc_t)xdr_PATHCONF3res,    sizeof(PATHCONF3res),    this},
-        {NFS_PROGRAM, NFS_V3, NFS3_COMMIT,      nfs3_commit_proc,      (xdrproc_t)xdr_COMMIT3args,      sizeof(COMMIT3args),      (xdrproc_t)xdr_COMMIT3res,      sizeof(COMMIT3res),      this},
-        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL,    nfs3_null_proc,      NULL,                            0,                        NULL,                         0,                         this},
-        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT,     mount3_mnt_proc,     (xdrproc_t)xdr_nfs_dirpath,      sizeof(nfs_dirpath),      (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3),     this},
-        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP,    mount3_dump_proc,    NULL,                            0,                        (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist),     this},
-        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT,    mount3_umnt_proc,    (xdrproc_t)xdr_nfs_dirpath,      sizeof(nfs_dirpath),      NULL,                         0,                         this},
-        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc, NULL,                            0,                        NULL,                         0,                         this},
-        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT,  mount3_export_proc,  NULL,                            0,                        (xdrproc_t)xdr_nfs_exports,   sizeof(nfs_exports),       this},
+        // Columns: program, version, procedure, handler, request XDR codec and size, reply XDR codec and size, opaque
+        {NFS_PROGRAM, NFS_V3, NFS3_NULL,          nfs3_null_proc,              NULL,                            0,                        NULL,                           0,                       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_GETATTR,       block_nfs3_getattr_proc,     (xdrproc_t)xdr_GETATTR3args,     sizeof(GETATTR3args),     (xdrproc_t)xdr_GETATTR3res,     sizeof(GETATTR3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_SETATTR,       block_nfs3_setattr_proc,     (xdrproc_t)xdr_SETATTR3args,     sizeof(SETATTR3args),     (xdrproc_t)xdr_SETATTR3res,     sizeof(SETATTR3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_LOOKUP,        block_nfs3_lookup_proc,      (xdrproc_t)xdr_LOOKUP3args,      sizeof(LOOKUP3args),      (xdrproc_t)xdr_LOOKUP3res,      sizeof(LOOKUP3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_ACCESS,        block_nfs3_access_proc,      (xdrproc_t)xdr_ACCESS3args,      sizeof(ACCESS3args),      (xdrproc_t)xdr_ACCESS3res,      sizeof(ACCESS3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READLINK,      block_nfs3_readlink_proc,    (xdrproc_t)xdr_READLINK3args,    sizeof(READLINK3args),    (xdrproc_t)xdr_READLINK3res,    sizeof(READLINK3res),    self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READ,          block_nfs3_read_proc,        (xdrproc_t)xdr_READ3args,        sizeof(READ3args),        (xdrproc_t)xdr_READ3res,        sizeof(READ3res),        self},
+        {NFS_PROGRAM, NFS_V3, NFS3_WRITE,         block_nfs3_write_proc,       (xdrproc_t)xdr_WRITE3args,       sizeof(WRITE3args),       (xdrproc_t)xdr_WRITE3res,       sizeof(WRITE3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_CREATE,        block_nfs3_create_proc,      (xdrproc_t)xdr_CREATE3args,      sizeof(CREATE3args),      (xdrproc_t)xdr_CREATE3res,      sizeof(CREATE3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_MKDIR,         block_nfs3_mkdir_proc,       (xdrproc_t)xdr_MKDIR3args,       sizeof(MKDIR3args),       (xdrproc_t)xdr_MKDIR3res,       sizeof(MKDIR3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_SYMLINK,       block_nfs3_symlink_proc,     (xdrproc_t)xdr_SYMLINK3args,     sizeof(SYMLINK3args),     (xdrproc_t)xdr_SYMLINK3res,     sizeof(SYMLINK3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_MKNOD,         block_nfs3_mknod_proc,       (xdrproc_t)xdr_MKNOD3args,       sizeof(MKNOD3args),       (xdrproc_t)xdr_MKNOD3res,       sizeof(MKNOD3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_REMOVE,        block_nfs3_remove_proc,      (xdrproc_t)xdr_REMOVE3args,      sizeof(REMOVE3args),      (xdrproc_t)xdr_REMOVE3res,      sizeof(REMOVE3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_RMDIR,         block_nfs3_rmdir_proc,       (xdrproc_t)xdr_RMDIR3args,       sizeof(RMDIR3args),       (xdrproc_t)xdr_RMDIR3res,       sizeof(RMDIR3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_RENAME,        block_nfs3_rename_proc,      (xdrproc_t)xdr_RENAME3args,      sizeof(RENAME3args),      (xdrproc_t)xdr_RENAME3res,      sizeof(RENAME3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_LINK,          block_nfs3_link_proc,        (xdrproc_t)xdr_LINK3args,        sizeof(LINK3args),        (xdrproc_t)xdr_LINK3res,        sizeof(LINK3res),        self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READDIR,       block_nfs3_readdir_proc,     (xdrproc_t)xdr_READDIR3args,     sizeof(READDIR3args),     (xdrproc_t)xdr_READDIR3res,     sizeof(READDIR3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS,   block_nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), self},
+        {NFS_PROGRAM, NFS_V3, NFS3_FSSTAT,        nfs3_fsstat_proc,            (xdrproc_t)xdr_FSSTAT3args,      sizeof(FSSTAT3args),      (xdrproc_t)xdr_FSSTAT3res,      sizeof(FSSTAT3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_FSINFO,        nfs3_fsinfo_proc,            (xdrproc_t)xdr_FSINFO3args,      sizeof(FSINFO3args),      (xdrproc_t)xdr_FSINFO3res,      sizeof(FSINFO3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_PATHCONF,      nfs3_pathconf_proc,          (xdrproc_t)xdr_PATHCONF3args,    sizeof(PATHCONF3args),    (xdrproc_t)xdr_PATHCONF3res,    sizeof(PATHCONF3res),    self},
+        {NFS_PROGRAM, NFS_V3, NFS3_COMMIT,        nfs3_commit_proc,            (xdrproc_t)xdr_COMMIT3args,      sizeof(COMMIT3args),      (xdrproc_t)xdr_COMMIT3res,      sizeof(COMMIT3res),      self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL,    nfs3_null_proc,              NULL,                            0,                        NULL,                         0,                         self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT,     mount3_mnt_proc,             (xdrproc_t)xdr_nfs_dirpath,      sizeof(nfs_dirpath),      (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3),     self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP,    mount3_dump_proc,            NULL,                            0,                        (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist),     self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT,    mount3_umnt_proc,            (xdrproc_t)xdr_nfs_dirpath,      sizeof(nfs_dirpath),      NULL,                         0,                         self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc,         NULL,                            0,                        NULL,                         0,                         self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT,  mount3_export_proc,          NULL,                            0,                        (xdrproc_t)xdr_nfs_exports,   sizeof(nfs_exports),       self},
     };
+    // NOTE(review): `int i` is compared with an unsigned sizeof() expression (signed/unsigned mix)
     for (int i = 0; i < sizeof(pt)/sizeof(pt[0]); i++)
     {
-        proc_table.insert(pt[i]);
+        self->proc_table.insert(pt[i]);
     }
 }
-
-nfs_client_t::~nfs_client_t()
-{
-}
--- a/src/nfs_block.h
+++ b/src/nfs_block.h
@ -0,0 +1,57 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over Vitastor block images - header
+
+#pragma once
+
+// Per-directory bookkeeping of the block-image NFS backend.
+// Kept a plain aggregate: it is filled with designated initializers, so field order matters.
+struct nfs_dir_t
+{
+    // Numeric directory ID: 1 for the root, other directories get IDs from block_fs_state_t::next_dir_id
+    uint64_t id;
+    // Highest inode mod_revision seen inside the directory; served as READDIR "cookie verifier"
+    uint64_t mod_rev;
+    // Wall-clock time at which the last change inside the directory was noticed
+    timespec mtime;
+};
+
+// Key of block_fs_state_t::extend_writes: an inode together with a size.
+// new_size is presumably the size the inode has to be extended to - TODO confirm
+struct extend_size_t
+{
+    inode_t inode;
+    uint64_t new_size;
+};
+
+// Strict weak ordering for extend_size_t: by inode first, then by new_size
+inline bool operator < (const extend_size_t &a, const extend_size_t &b)
+{
+    if (a.inode < b.inode)
+        return true;
+    if (a.inode == b.inode)
+        return a.new_size < b.new_size;
+    return false;
+}
+
+// Value of block_fs_state_t::extend_writes: an RPC operation with the state of its two sub-steps
+struct extend_write_t
+{
+    // RPC operation this entry belongs to
+    rpc_op_t *rop;
+    // State of the resize and of the write step
+    int resize_res, write_res; // 1 = started, 0 = completed OK, -errno = completed with error
+};
+
+// Per-inode extend state, stored in block_fs_state_t::extends.
+// NOTE(review): presumably cur_extend is the size of the extend in flight and
+// next_extend the size queued after it, with 0 meaning "none" - confirm against the users
+struct extend_inode_t
+{
+    uint64_t cur_extend = 0, next_extend = 0;
+};
+
+// State of the block-image based NFS backend: lookup tables between NFS file handles,
+// directory names and inode IDs, plus bookkeeping of inode extends.
+// File handle format: "S"+base64(sha256(full name with prefix)), or "roothandle" for the mount root
+struct block_fs_state_t
+{
+    // filehandle = "S"+base64(sha256(full name with prefix)) or "roothandle" for mount root)
+    // ID assigned to the next newly seen directory (the root directory gets ID 1 in init())
+    uint64_t next_dir_id = 2;
+    // filehandle => dir with name_prefix
+    std::map<std::string, std::string> dir_by_hash;
+    // dir with name_prefix => dir info
+    std::map<std::string, nfs_dir_t> dir_info;
+    // filehandle => inode ID
+    std::map<std::string, inode_t> inode_by_hash;
+    // inode ID => filehandle
+    std::map<inode_t, std::string> hash_by_inode;
+
+    // inode extend requests in progress
+    std::map<inode_t, extend_inode_t> extends;
+    // operations attached to extends, ordered by (inode, new_size)
+    std::multimap<extend_size_t, extend_write_t> extend_writes;
+
+    // Registers the root directory and installs the inode change hook on proxy's cluster client
+    void init(nfs_proxy_t *proxy);
+};
+
+// Map an integer error code to an NFSv3 status
+// (presumably a negative errno such as a cluster operation retval - TODO confirm)
+nfsstat3 vitastor_nfs_map_err(int err);
--- a/src/nfs_common.h
+++ b/src/nfs_common.h
@ -0,0 +1,22 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy - common functions
+
+#pragma once
+
+#include "nfs/nfs.h"
+
+// Fill self->proc_table with the RPC handlers of the block-image backend
+void nfs_block_procs(nfs_client_t *self);
+// Presumably the VitastorKV counterpart of nfs_block_procs() - defined outside of this header
+void nfs_kv_procs(nfs_client_t *self);
+// Common NFSv3 procedure handlers. All take the client as `opaque`
+// (it is cast to nfs_client_t* by the handlers) and the RPC operation as `rop`
+int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop);
+int nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop);
+int nfs3_pathconf_proc(void *opaque, rpc_op_t *rop);
+int nfs3_access_proc(void *opaque, rpc_op_t *rop);
+int nfs3_null_proc(void *opaque, rpc_op_t *rop);
+int nfs3_commit_proc(void *opaque, rpc_op_t *rop);
+// Common MOUNTv3 procedure handlers
+int mount3_mnt_proc(void *opaque, rpc_op_t *rop);
+int mount3_dump_proc(void *opaque, rpc_op_t *rop);
+int mount3_umnt_proc(void *opaque, rpc_op_t *rop);
+int mount3_umntall_proc(void *opaque, rpc_op_t *rop);
+int mount3_export_proc(void *opaque, rpc_op_t *rop);
--- a/src/nfs_fsstat.cpp
+++ b/src/nfs_fsstat.cpp
@ -0,0 +1,124 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy - common FSSTAT, FSINFO, PATHCONF
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// Get file system statistics (NFS3 FSSTAT)
+//
+// Reports total/free/available bytes of the default pool and a fixed file count limit.
+// Pool statistics are given in terabytes of raw space; they are converted to usable
+// bytes as <raw TB> / raw_to_usable * 2^40. If the pool has no statistics (or they are
+// unusable) all byte counters are reported as 0.
+// opaque is the nfs_client_t, rop is the RPC operation to reply to. Always returns 0
+// after queueing the reply.
+int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_client_t *self = (nfs_client_t*)opaque;
+    //FSSTAT3args *args = (FSSTAT3args*)rop->request;
+    if (self->parent->trace)
+        fprintf(stderr, "[%d] FSSTAT\n", self->nfs_fd);
+    FSSTAT3res *reply = (FSSTAT3res*)rop->reply;
+    uint64_t tbytes = 0, fbytes = 0;
+    auto pst_it = self->parent->pool_stats.find(self->parent->default_pool_id);
+    if (pst_it != self->parent->pool_stats.end())
+    {
+        // One terabyte in bytes. The previous factor ((uint64_t)2<<40) is 2^41 and doubled all sizes
+        const uint64_t tb_bytes = (uint64_t)1 << 40;
+        auto ttb = pst_it->second["total_raw_tb"].number_value();
+        auto utb = pst_it->second["used_raw_tb"].number_value();
+        auto raw_to_usable = pst_it->second["raw_to_usable"].number_value();
+        // Never report negative free space: a negative double => uint64_t conversion is undefined
+        auto ftb = ttb > utb ? ttb - utb : 0;
+        // Skip missing/zero ratio: division by zero would yield infinity, which can't be converted either
+        if (raw_to_usable > 0 && ttb > 0)
+        {
+            tbytes = ttb / raw_to_usable * tb_bytes;
+            fbytes = ftb / raw_to_usable * tb_bytes;
+        }
+    }
+    *reply = (FSSTAT3res){
+        .status = NFS3_OK,
+        .resok = (FSSTAT3resok){
+            .obj_attributes = {
+                .attributes_follow = 0,
+                //.attributes = get_root_attributes(self),
+            },
+            .tbytes = tbytes, // total bytes
+            .fbytes = fbytes, // free bytes
+            .abytes = fbytes, // available bytes
+            .tfiles = (size3)1 << (63-POOL_ID_BITS), // maximum total files
+            .ffiles = (size3)1 << (63-POOL_ID_BITS), // free files
+            .afiles = (size3)1 << (63-POOL_ID_BITS), // available files
+            .invarsec = 0,
+        },
+    };
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// NFS3 FSINFO: report static file system capabilities.
+// Only the root handle is accepted; any other handle is answered with NFS3ERR_INVAL.
+// Always returns 0 after queueing the reply.
+int nfs3_fsinfo_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_client_t *self = (nfs_client_t*)opaque;
+    FSINFO3args *req = (FSINFO3args*)rop->request;
+    FSINFO3res *res = (FSINFO3res*)rop->reply;
+    if (self->parent->trace)
+    {
+        fprintf(stderr, "[%d] FSINFO %s\n", self->nfs_fd, std::string(req->fsroot).c_str());
+    }
+    if (req->fsroot != NFS_ROOT_HANDLE)
+    {
+        // Not the mount root - refuse
+        *res = (FSINFO3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        return 0;
+    }
+    // Mount root - fill in the limits
+    *res = (FSINFO3res){
+        .status = NFS3_OK,
+        .resok = (FSINFO3resok){
+            .obj_attributes = {
+                .attributes_follow = 0,
+                //.attributes = get_root_attributes(self),
+            },
+            .rtmax = 128*1024*1024,
+            .rtpref = 128*1024*1024,
+            .rtmult = 4096,
+            .wtmax = 128*1024*1024,
+            .wtpref = 128*1024*1024,
+            .wtmult = 4096,
+            .dtpref = 128,
+            .maxfilesize = 0x7fffffffffffffff,
+            .time_delta = {
+                .seconds = 1,
+                .nseconds = 0,
+            },
+            .properties = FSF3_SYMLINK | FSF3_HOMOGENEOUS,
+        },
+    };
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// NFS3 PATHCONF: report static pathname limits.
+// Only the root handle is accepted; any other handle is answered with NFS3ERR_INVAL.
+// Always returns 0 after queueing the reply.
+int nfs3_pathconf_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_client_t *self = (nfs_client_t*)opaque;
+    PATHCONF3args *req = (PATHCONF3args*)rop->request;
+    PATHCONF3res *res = (PATHCONF3res*)rop->reply;
+    if (self->parent->trace)
+    {
+        fprintf(stderr, "[%d] PATHCONF %s\n", self->nfs_fd, std::string(req->object).c_str());
+    }
+    if (req->object != NFS_ROOT_HANDLE)
+    {
+        // Not the mount root - refuse
+        *res = (PATHCONF3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        return 0;
+    }
+    // Mount root - fill in the limits
+    *res = (PATHCONF3res){
+        .status = NFS3_OK,
+        .resok = (PATHCONF3resok){
+            .obj_attributes = {
+                // Without at least one reference to a non-constant value (local variable or something else),
+                // with gcc 8 we get "internal compiler error: side-effects element in no-side-effects CONSTRUCTOR" here
+                // FIXME: get rid of this after raising compiler requirement
+                // NOTE(review): a constant 0 is used below, which does not match the workaround
+                // described above - confirm that gcc 8 still builds this
+                .attributes_follow = 0,
+                //.attributes = get_root_attributes(self),
+            },
+            .linkmax = 0,
+            .name_max = 255,
+            .no_trunc = TRUE,
+            .chown_restricted = FALSE,
+            .case_insensitive = FALSE,
+            .case_preserving = TRUE,
+        },
+    };
+    rpc_queue_reply(rop);
+    return 0;
+}
--- a/src/nfs_kv.cpp
+++ b/src/nfs_kv.cpp
@ -0,0 +1,174 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - common functions
+
+#include <sys/time.h>
+
+#include "str_util.h"
+#include "nfs_proxy.h"
+#include "nfs_common.h"
+#include "nfs_kv.h"
+
+// Parse a decimal "<seconds>[.<fraction>]" string into nfstime3.
+// The fraction is scaled to exactly 9 digits (nanoseconds): shorter fractions are
+// padded with zeros on the right, longer ones lose their extra digits.
+// A string without "." yields zero nanoseconds.
+nfstime3 nfstime_from_str(const std::string & s)
+{
+    // Value-initialize so that nseconds is never left indeterminate when there is no fraction
+    nfstime3 t = {};
+    auto p = s.find(".");
+    if (p != std::string::npos)
+    {
+        t.seconds = stoull_full(s.substr(0, p), 10);
+        // Scale in 64 bits: a fraction longer than 9 digits may not fit into
+        // the nseconds field until it is divided down
+        uint64_t nsec = stoull_full(s.substr(p+1), 10);
+        // p becomes the number of digits in the fraction
+        p = s.size()-p-1;
+        for (; p < 9; p++)
+            nsec *= 10;
+        for (; p > 9; p--)
+            nsec /= 10;
+        t.nseconds = nsec;
+    }
+    else
+        t.seconds = stoull_full(s, 10);
+    return t;
+}
+
+// Format an NFS timestamp as "<seconds>.<nanoseconds>", nanoseconds zero-padded to 9 digits
+std::string nfstime_to_str(nfstime3 t)
+{
+    char text[32];
+    snprintf(text, sizeof(text), "%u.%09u", t.seconds, t.nseconds);
+    return std::string(text);
+}
+
+// Translate the textual inode type into the matching NF3* file type constant.
+// An empty string counts as a regular file; unrecognized strings give -1.
+int kv_map_type(const std::string & type)
+{
+    if (type == "" || type == "file")
+        return NF3REG;
+    if (type == "dir")
+        return NF3DIR;
+    if (type == "blk")
+        return NF3BLK;
+    if (type == "chr")
+        return NF3CHR;
+    if (type == "link")
+        return NF3LNK;
+    if (type == "sock")
+        return NF3SOCK;
+    if (type == "fifo")
+        return NF3FIFO;
+    return -1;
+}
+
+// Build NFS file attributes (fattr3) for inode <ino> from its JSON attribute object.
+// Defaults used when an attribute is missing:
+// - type: regular file (also used for unrecognized type strings)
+// - mode: 0755 for directories, 0644 for everything else
+// - nlink: 1
+// - atime: same as mtime
+// Directories always report size and used space of 4096; ctime mirrors mtime.
+fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs)
+{
+    auto type = kv_map_type(attrs["type"].string_value());
+    auto mode = attrs["mode"].uint64_value();
+    auto nlink = attrs["nlink"].uint64_value();
+    nfstime3 mtime = nfstime_from_str(attrs["mtime"].string_value());
+    nfstime3 atime = attrs["atime"].is_null() ? mtime : nfstime_from_str(attrs["atime"].string_value());
+    // FIXME In theory we could store the binary structure itself instead of JSON
+    return (fattr3){
+        // kv_map_type() returns -1 for unknown types, never send that to the client as a file type
+        .type = (type <= 0 ? NF3REG : (ftype3)type),
+        .mode = (attrs["mode"].is_null() ? (type == NF3DIR ? 0755 : 0644) : (uint32_t)mode),
+        .nlink = (nlink == 0 ? 1 : (uint32_t)nlink),
+        .uid = (uint32_t)attrs["uid"].uint64_value(),
+        .gid = (uint32_t)attrs["gid"].uint64_value(),
+        .size = (type == NF3DIR ? 4096 : attrs["size"].uint64_value()),
+        .used = (type == NF3DIR ? 4096 : attrs["alloc"].uint64_value()),
+        // Device numbers are only meaningful for block and character devices
+        .rdev = (type == NF3BLK || type == NF3CHR
+            ? (specdata3){ (uint32_t)attrs["major"].uint64_value(), (uint32_t)attrs["minor"].uint64_value() }
+            : (specdata3){}),
+        .fsid = self->parent->fsid,
+        .fileid = ino,
+        .atime = atime,
+        .mtime = mtime,
+        .ctime = mtime,
+    };
+}
+
+// Build the database key of a directory entry:
+//   d <length> <hex dir_ino> / <filename>
+// where <length> is one character encoding the number of hex digits of dir_ino
+// ('1'..'9' for 1..9 digits, 'A'..'G' for 10..16 digits).
+std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename)
+{
+    char prefix[24] = { 0 };
+    // The buffer always fits "d-" + up to 16 hex digits + "/", so the
+    // returned value is the real length of the formatted prefix
+    int digits = snprintf(prefix, sizeof(prefix), "d-%jx/", dir_ino) - 3;
+    // Replace the "-" placeholder with the digit count
+    prefix[1] = (digits < 10 ? '0'+digits : 'A'+(digits-10));
+    std::string key(prefix);
+    key += filename;
+    return key;
+}
+
+// Extract the file name from a directory entry key of the form
+//   d <length> <hex dir_ino> / <filename>
+// i.e. everything after the first "/". A key without "/" is returned unchanged.
+std::string kv_direntry_filename(const std::string & key)
+{
+    size_t slash = key.find('/');
+    return slash == std::string::npos ? key : key.substr(slash+1);
+}
+
+// Build the database key of an inode: i <length> <hex ino>
+// <length> is one character encoding the number of hex digits of ino
+// ('1'..'9' for 1..9 digits, 'A'..'G' for 10..16 digits).
+std::string kv_inode_key(uint64_t ino)
+{
+    char buf[24] = { 0 };
+    // The buffer always fits "i-" + up to 16 hex digits, so this is the real length
+    int len = snprintf(buf, sizeof(buf), "i-%jx", ino);
+    int digits = len - 2;
+    // Replace the "-" placeholder with the digit count
+    buf[1] = (digits < 10 ? '0'+digits : 'A'+(digits-10));
+    return std::string(buf, len);
+}
+
+// Build a 9-byte file handle for an inode: "S" followed by the 8 raw bytes of ino
+std::string kv_fh(uint64_t ino)
+{
+    std::string fh(1, 'S');
+    fh.append((char*)&ino, 8);
+    return fh;
+}
+
+// Extract the inode number from a file handle. Recognized handles:
+// - "R" (1 byte): the root directory, inode 1
+// - "S" + 8 bytes: the inode number stored as raw bytes
+// - "I" + more than 16 bytes: the inode number is taken from the last 8 bytes
+// Returns 0 for any other handle.
+uint64_t kv_fh_inode(const std::string & fh)
+{
+    uint64_t ino = 0;
+    if (fh.size() == 1 && fh[0] == 'R')
+    {
+        return 1;
+    }
+    else if (fh.size() == 9 && fh[0] == 'S')
+    {
+        // Copy the bytes instead of dereferencing a casted pointer:
+        // string data is not guaranteed to be aligned for uint64_t
+        memcpy(&ino, fh.data()+1, sizeof(ino));
+    }
+    else if (fh.size() > 17 && fh[0] == 'I')
+    {
+        memcpy(&ino, fh.data()+fh.size()-sizeof(ino), sizeof(ino));
+    }
+    return ino;
+}
+
+// Check whether a file handle has one of the accepted shapes:
+// the root handle, "S" + 8 bytes, or "I" followed by more than 16 bytes
+bool kv_fh_valid(const std::string & fh)
+{
+    if (fh == NFS_ROOT_HANDLE)
+        return true;
+    if (fh.size() == 9 && fh[0] == 'S')
+        return true;
+    return fh.size() > 17 && fh[0] == 'I';
+}
+
+// Register all NFS v3 and MOUNT v3 procedures of the KV-based implementation
+// in the client's procedure table. Every entry gets the client itself as the opaque argument.
+void nfs_kv_procs(nfs_client_t *self)
+{
+    struct rpc_service_proc_t pt[] = {
+        {NFS_PROGRAM, NFS_V3, NFS3_NULL,        nfs3_null_proc,           NULL,                            0,                        NULL,                           0,                       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_GETATTR,     kv_nfs3_getattr_proc,     (xdrproc_t)xdr_GETATTR3args,     sizeof(GETATTR3args),     (xdrproc_t)xdr_GETATTR3res,     sizeof(GETATTR3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_SETATTR,     kv_nfs3_setattr_proc,     (xdrproc_t)xdr_SETATTR3args,     sizeof(SETATTR3args),     (xdrproc_t)xdr_SETATTR3res,     sizeof(SETATTR3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_LOOKUP,      kv_nfs3_lookup_proc,      (xdrproc_t)xdr_LOOKUP3args,      sizeof(LOOKUP3args),      (xdrproc_t)xdr_LOOKUP3res,      sizeof(LOOKUP3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_ACCESS,      nfs3_access_proc,         (xdrproc_t)xdr_ACCESS3args,      sizeof(ACCESS3args),      (xdrproc_t)xdr_ACCESS3res,      sizeof(ACCESS3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READLINK,    kv_nfs3_readlink_proc,    (xdrproc_t)xdr_READLINK3args,    sizeof(READLINK3args),    (xdrproc_t)xdr_READLINK3res,    sizeof(READLINK3res),    self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READ,        kv_nfs3_read_proc,        (xdrproc_t)xdr_READ3args,        sizeof(READ3args),        (xdrproc_t)xdr_READ3res,        sizeof(READ3res),        self},
+        {NFS_PROGRAM, NFS_V3, NFS3_WRITE,       kv_nfs3_write_proc,       (xdrproc_t)xdr_WRITE3args,       sizeof(WRITE3args),       (xdrproc_t)xdr_WRITE3res,       sizeof(WRITE3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_CREATE,      kv_nfs3_create_proc,      (xdrproc_t)xdr_CREATE3args,      sizeof(CREATE3args),      (xdrproc_t)xdr_CREATE3res,      sizeof(CREATE3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_MKDIR,       kv_nfs3_mkdir_proc,       (xdrproc_t)xdr_MKDIR3args,       sizeof(MKDIR3args),       (xdrproc_t)xdr_MKDIR3res,       sizeof(MKDIR3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_SYMLINK,     kv_nfs3_symlink_proc,     (xdrproc_t)xdr_SYMLINK3args,     sizeof(SYMLINK3args),     (xdrproc_t)xdr_SYMLINK3res,     sizeof(SYMLINK3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_MKNOD,       kv_nfs3_mknod_proc,       (xdrproc_t)xdr_MKNOD3args,       sizeof(MKNOD3args),       (xdrproc_t)xdr_MKNOD3res,       sizeof(MKNOD3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_REMOVE,      kv_nfs3_remove_proc,      (xdrproc_t)xdr_REMOVE3args,      sizeof(REMOVE3args),      (xdrproc_t)xdr_REMOVE3res,      sizeof(REMOVE3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_RMDIR,       kv_nfs3_rmdir_proc,       (xdrproc_t)xdr_RMDIR3args,       sizeof(RMDIR3args),       (xdrproc_t)xdr_RMDIR3res,       sizeof(RMDIR3res),       self},
+        {NFS_PROGRAM, NFS_V3, NFS3_RENAME,      kv_nfs3_rename_proc,      (xdrproc_t)xdr_RENAME3args,      sizeof(RENAME3args),      (xdrproc_t)xdr_RENAME3res,      sizeof(RENAME3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_LINK,        kv_nfs3_link_proc,        (xdrproc_t)xdr_LINK3args,        sizeof(LINK3args),        (xdrproc_t)xdr_LINK3res,        sizeof(LINK3res),        self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READDIR,     kv_nfs3_readdir_proc,     (xdrproc_t)xdr_READDIR3args,     sizeof(READDIR3args),     (xdrproc_t)xdr_READDIR3res,     sizeof(READDIR3res),     self},
+        {NFS_PROGRAM, NFS_V3, NFS3_READDIRPLUS, kv_nfs3_readdirplus_proc, (xdrproc_t)xdr_READDIRPLUS3args, sizeof(READDIRPLUS3args), (xdrproc_t)xdr_READDIRPLUS3res, sizeof(READDIRPLUS3res), self},
+        {NFS_PROGRAM, NFS_V3, NFS3_FSSTAT,      nfs3_fsstat_proc,         (xdrproc_t)xdr_FSSTAT3args,      sizeof(FSSTAT3args),      (xdrproc_t)xdr_FSSTAT3res,      sizeof(FSSTAT3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_FSINFO,      nfs3_fsinfo_proc,         (xdrproc_t)xdr_FSINFO3args,      sizeof(FSINFO3args),      (xdrproc_t)xdr_FSINFO3res,      sizeof(FSINFO3res),      self},
+        {NFS_PROGRAM, NFS_V3, NFS3_PATHCONF,    nfs3_pathconf_proc,       (xdrproc_t)xdr_PATHCONF3args,    sizeof(PATHCONF3args),    (xdrproc_t)xdr_PATHCONF3res,    sizeof(PATHCONF3res),    self},
+        {NFS_PROGRAM, NFS_V3, NFS3_COMMIT,      nfs3_commit_proc,         (xdrproc_t)xdr_COMMIT3args,      sizeof(COMMIT3args),      (xdrproc_t)xdr_COMMIT3res,      sizeof(COMMIT3res),      self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_NULL,    nfs3_null_proc,         NULL,                            0,                        NULL,                         0,                         self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_MNT,     mount3_mnt_proc,        (xdrproc_t)xdr_nfs_dirpath,      sizeof(nfs_dirpath),      (xdrproc_t)xdr_nfs_mountres3, sizeof(nfs_mountres3),     self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_DUMP,    mount3_dump_proc,       NULL,                            0,                        (xdrproc_t)xdr_nfs_mountlist, sizeof(nfs_mountlist),     self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNT,    mount3_umnt_proc,       (xdrproc_t)xdr_nfs_dirpath,      sizeof(nfs_dirpath),      NULL,                         0,                         self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_UMNTALL, mount3_umntall_proc,    NULL,                            0,                        NULL,                         0,                         self},
+        {MOUNT_PROGRAM, MOUNT_V3, MOUNT3_EXPORT,  mount3_export_proc,     NULL,                            0,                        (xdrproc_t)xdr_nfs_exports,   sizeof(nfs_exports),       self},
+    };
+    // Copy every descriptor into the per-client table
+    for (auto & proc: pt)
+    {
+        self->proc_table.insert(proc);
+    }
+}
--- a/src/nfs_kv.h
+++ b/src/nfs_kv.h
@ -0,0 +1,95 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - header
+
+#pragma once
+
+#include "nfs/nfs.h"
+
+// Inode number of the filesystem root directory
+#define KV_ROOT_INODE 1
+// Database key under which the inode ID allocation counter is stored
+#define KV_NEXT_ID_KEY "id"
+// NOTE(review): presumably the value expected in shared_file_header_t::magic - confirm against its users
+#define SHARED_FILE_MAGIC_V1 0x711A5158A6EDF17E
+
+struct nfs_kv_write_state;
+
+// Key type of kv_fs_state_t::list_cookies
+struct list_cookie_t
+{
+    uint64_t dir_ino;
+    uint64_t cookieverf;
+    uint64_t cookie;
+};
+
+// Lexicographic ordering by (dir_ino, cookieverf, cookie), needed to use list_cookie_t as a std::map key
+inline bool operator < (const list_cookie_t & a, const list_cookie_t & b)
+{
+    if (a.dir_ino != b.dir_ino)
+        return a.dir_ino < b.dir_ino;
+    if (a.cookieverf != b.cookieverf)
+        return a.cookieverf < b.cookieverf;
+    return a.cookie < b.cookie;
+}
+
+// Value type of kv_fs_state_t::list_cookies
+struct list_cookie_val_t
+{
+    // NOTE(review): presumably the database key at which the listing should continue - confirm against READDIR code
+    std::string key;
+};
+
+// Element of kv_fs_state_t::allocating_shared.
+// NOTE(review): looks like a queued write waiting for space in a shared inode - confirm against the write code.
+// Members have no default initializers, so every user has to fill all of them.
+struct shared_alloc_queue_t
+{
+    nfs_kv_write_state *st;
+    int state;
+    uint64_t size;
+};
+
+// Value type of kv_fs_state_t::extends (one entry per inode_t key).
+// All counters start at zero and the waiter list starts empty.
+struct kv_inode_extend_t
+{
+    int refcnt = 0;
+    uint64_t cur_extend = 0;
+    uint64_t next_extend = 0;
+    uint64_t done_extend = 0;
+    // Callbacks without arguments stored for later invocation
+    std::vector<std::function<void()>> waiters;
+};
+
+// State of the KV-based filesystem, reached by the handlers through parent->kvfs
+struct kv_fs_state_t
+{
+    std::map<list_cookie_t, list_cookie_val_t> list_cookies;
+    // Inode ID allocator: the next ID to hand out and the last ID of the reserved range
+    uint64_t fs_next_id = 0;
+    uint64_t fs_allocated_id = 0;
+    // IDs given back by failed creations
+    std::vector<uint64_t> unallocated_ids;
+    std::vector<shared_alloc_queue_t> allocating_shared;
+    uint64_t cur_shared_inode = 0;
+    uint64_t cur_shared_offset = 0;
+    std::map<inode_t, kv_inode_extend_t> extends;
+};
+
+// Three 64-bit fields, all zero by default.
+// NOTE(review): presumably the header stored in front of each small file packed into a shared inode,
+// with magic expected to be SHARED_FILE_MAGIC_V1 - confirm against the read/write code.
+struct shared_file_header_t
+{
+    uint64_t magic = 0;
+    uint64_t inode = 0;
+    uint64_t size = 0;
+};
+
+// Convert an errno-style error code into an NFS status
+nfsstat3 vitastor_nfs_map_err(int err);
+// Timestamp <-> "<seconds>.<nanoseconds>" string conversion
+nfstime3 nfstime_from_str(const std::string & s);
+std::string nfstime_to_str(nfstime3 t);
+// Map the textual inode type to an NF3* constant, -1 if unknown
+int kv_map_type(const std::string & type);
+// Build NFS attributes of inode <ino> from its JSON attribute object
+fattr3 get_kv_attributes(nfs_client_t *self, uint64_t ino, json11::Json attrs);
+// Database key construction / parsing
+std::string kv_direntry_key(uint64_t dir_ino, const std::string & filename);
+std::string kv_direntry_filename(const std::string & key);
+std::string kv_inode_key(uint64_t ino);
+// File handle <-> inode number conversion; kv_fh_inode() returns 0 for unrecognized handles
+std::string kv_fh(uint64_t ino);
+uint64_t kv_fh_inode(const std::string & fh);
+bool kv_fh_valid(const std::string & fh);
+// Allocate a new inode number and pass it (or a negative error code) to the callback
+void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t new_id)> cb);
+// Read and parse an inode; the callback gets the result code, the raw value and the parsed JSON
+void kv_read_inode(nfs_client_t *self, uint64_t ino,
+    std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
+    bool allow_cache = false);
+uint64_t align_shared_size(nfs_client_t *self, uint64_t size);
+
+// NFS v3 procedure handlers of the KV-based implementation
+int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_readlink_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_create_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_symlink_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_mknod_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_remove_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_rename_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_readdir_proc(void *opaque, rpc_op_t *rop);
+int kv_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop);
--- a/src/nfs_kv_create.cpp
+++ b/src/nfs_kv_create.cpp
@ -0,0 +1,324 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - CREATE, MKDIR, SYMLINK, MKNOD
+
+#include <sys/time.h>
+
+#include "str_util.h"
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// Allocate a new inode number and report it through cb(res, new_id).
+// res is 0 on success or a negative errno (-ENOSPC when fs_inode_count is exhausted).
+//
+// Numbers are reserved in batches of id_alloc_batch_size: the counter stored under
+// KV_NEXT_ID_KEY is advanced with a compare-and-set and the reserved range
+// (fs_next_id..fs_allocated_id) is then handed out locally without touching the database.
+void allocate_new_id(nfs_client_t *self, std::function<void(int res, uint64_t new_id)> cb)
+{
+    // Fast path: take the next number from the already reserved range.
+    // fs_next_id == 0 means that nothing has been reserved yet; 0 is not a usable
+    // inode number (file handle parsing uses it as "invalid"), so never hand it out.
+    if (self->parent->kvfs->fs_next_id > 0 &&
+        self->parent->kvfs->fs_next_id <= self->parent->kvfs->fs_allocated_id)
+    {
+        cb(0, self->parent->kvfs->fs_next_id++);
+        return;
+    }
+    else if (self->parent->kvfs->fs_next_id > self->parent->fs_inode_count)
+    {
+        cb(-ENOSPC, 0);
+        return;
+    }
+    self->parent->db->get(KV_NEXT_ID_KEY, [=](int res, const std::string & prev_str)
+    {
+        // A missing counter is fine - it just means that nothing was allocated before
+        if (res < 0 && res != -ENOENT)
+        {
+            cb(res, 0);
+            return;
+        }
+        // The value really stored in the database, the CAS below has to compare against it
+        uint64_t stored_val = stoull_full(prev_str);
+        if (stored_val >= self->parent->fs_inode_count)
+        {
+            cb(-ENOSPC, 0);
+            return;
+        }
+        // Inode 1 is the root directory, so allocation starts after it
+        uint64_t prev_val = stored_val < 1 ? 1 : stored_val;
+        uint64_t new_val = prev_val + self->parent->id_alloc_batch_size;
+        if (new_val >= self->parent->fs_inode_count)
+        {
+            new_val = self->parent->fs_inode_count;
+        }
+        self->parent->db->set(KV_NEXT_ID_KEY, std::to_string(new_val), [=](int res)
+        {
+            if (res == -EAGAIN)
+            {
+                // CAS failure - retry
+                allocate_new_id(self, cb);
+            }
+            else if (res < 0)
+            {
+                cb(res, 0);
+            }
+            else
+            {
+                // prev_val+1 is returned right now, the rest of the batch is kept for later calls
+                self->parent->kvfs->fs_next_id = prev_val+2;
+                self->parent->kvfs->fs_allocated_id = new_val;
+                cb(0, prev_val+1);
+            }
+        }, [stored_val](int res, const std::string & value)
+        {
+            // FIXME: Allow to modify value from CAS callback? ("update" query)
+            // Compare with the unclamped value: comparing with the clamped one would
+            // never match a stored "0" and the allocation would retry forever
+            return res < 0 || stoull_full(value) == stored_val;
+        });
+    });
+}
+
+// State of one CREATE / MKDIR / SYMLINK / MKNOD operation.
+// Allocated by the procedure handler and deleted when the reply is sent.
+struct kv_create_state
+{
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Set for CREATE in NFS_EXCLUSIVE mode; verf is the verifier sent by the client in that case
+    bool exclusive = false;
+    uint64_t verf = 0;
+    // Parent directory inode and name of the new entry
+    uint64_t dir_ino = 0;
+    std::string filename;
+    // Inode number of the created object
+    uint64_t new_id = 0;
+    // Attributes collected by the handler; moved into attrs when the creation starts
+    json11::Json::object attrobj;
+    json11::Json attrs;
+    // Serialized directory entry, also used to remove exactly this entry on rollback
+    std::string direntry_text;
+    // Inode of an already existing entry with the same verifier (exclusive mode only)
+    uint64_t dup_ino = 0;
+    // Completion callback, receives 0 or a negative errno
+    std::function<void(int res)> cb;
+};
+
+// Create a new filesystem object described by <st> and call st->cb(res) when done.
+// Steps:
+// 1) allocate a new inode number
+// 2) write the directory entry, but only if it doesn't exist yet (CAS)
+// 3) write the inode, but only if it doesn't exist yet (CAS)
+// 4) if the inode can't be written, remove the directory entry again
+// The allocated number is returned to unallocated_ids when the creation fails.
+// st->cb is always moved into a local variable before the call because the callback deletes st.
+static void kv_do_create(kv_create_state *st)
+{
+    if (st->self->parent->trace)
+        fprintf(stderr, "[%d] CREATE %ju/%s ATTRS %s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str(), json11::Json(st->attrobj).dump().c_str());
+    // Reject invalid parent handles (inode 0) like LOOKUP and LINK do,
+    // as well as empty names and names containing a path separator
+    if (!st->dir_ino || st->filename == "" || st->filename.find("/") != std::string::npos)
+    {
+        auto cb = std::move(st->cb);
+        cb(-EINVAL);
+        return;
+    }
+    // Generate inode ID
+    allocate_new_id(st->self, [st](int res, uint64_t new_id)
+    {
+        if (res < 0)
+        {
+            auto cb = std::move(st->cb);
+            cb(res);
+            return;
+        }
+        st->new_id = new_id;
+        auto direntry = json11::Json::object{ { "ino", st->new_id } };
+        if (st->attrobj["type"].string_value() == "dir")
+        {
+            direntry["type"] = "dir";
+        }
+        st->attrs = std::move(st->attrobj);
+        st->direntry_text = json11::Json(direntry).dump();
+        // Set direntry
+        st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename), st->direntry_text, [st](int res)
+        {
+            if (res < 0)
+            {
+                // The new inode number is not used, give it back
+                st->self->parent->kvfs->unallocated_ids.push_back(st->new_id);
+                if (res == -EAGAIN)
+                {
+                    // The entry already exists
+                    if (st->dup_ino)
+                    {
+                        // Repeated exclusive create with the same verifier - report the existing inode
+                        st->new_id = st->dup_ino;
+                        res = 0;
+                    }
+                    else
+                        res = -EEXIST;
+                }
+                else
+                    fprintf(stderr, "create %ju/%s failed: %s (code %d)\n", st->dir_ino, st->filename.c_str(), strerror(-res), res);
+                auto cb = std::move(st->cb);
+                cb(res);
+            }
+            else
+            {
+                st->self->parent->db->set(kv_inode_key(st->new_id), st->attrs.dump(), [st](int res)
+                {
+                    if (res == -EAGAIN)
+                    {
+                        res = -EEXIST;
+                    }
+                    if (res < 0)
+                    {
+                        // Roll back: remove the directory entry, but only if it's still the one written above
+                        st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st, res](int del_res)
+                        {
+                            if (!del_res)
+                            {
+                                st->self->parent->kvfs->unallocated_ids.push_back(st->new_id);
+                            }
+                            auto cb = std::move(st->cb);
+                            cb(res);
+                        }, [st](int res, const std::string & value)
+                        {
+                            return res != -ENOENT && value == st->direntry_text;
+                        });
+                    }
+                    else
+                    {
+                        auto cb = std::move(st->cb);
+                        cb(0);
+                    }
+                }, [st](int res, const std::string & value)
+                {
+                    // CAS compare - the inode must not exist yet
+                    return res == -ENOENT;
+                });
+            }
+        }, [st](int res, const std::string & value)
+        {
+            // CAS compare - check that the key doesn't exist
+            if (res == 0)
+            {
+                std::string err;
+                auto direntry = json11::Json::parse(value, err);
+                if (err != "")
+                {
+                    fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, overwriting\n",
+                        kv_direntry_key(st->dir_ino, st->filename).c_str(), value.c_str(), err.c_str());
+                    return true;
+                }
+                // NOTE(review): the direntry written above doesn't contain "verf" (it is only put
+                // into the inode attributes by CREATE), so this probably never matches a
+                // non-zero verifier - confirm where the verifier is meant to be stored
+                if (st->exclusive && direntry["verf"].uint64_value() == st->verf)
+                {
+                    st->dup_ino = direntry["ino"].uint64_value();
+                    return false;
+                }
+                return false;
+            }
+            return true;
+        });
+    });
+}
+
+// Copy the attributes the client asked to set from sattr3 into the JSON attribute object.
+// Attributes with an unset set_it flag are left untouched; times are stored as strings.
+// NOTE(review): atime/mtime are taken from the request whenever set_it is non-zero -
+// confirm that "set to server time" requests don't need the current time instead.
+static void kv_create_setattr(json11::Json::object & attrobj, sattr3 & sattr)
+{
+    if (sattr.mode.set_it)
+    {
+        attrobj["mode"] = (uint64_t)sattr.mode.mode;
+    }
+    if (sattr.uid.set_it)
+    {
+        attrobj["uid"] = (uint64_t)sattr.uid.uid;
+    }
+    if (sattr.gid.set_it)
+    {
+        attrobj["gid"] = (uint64_t)sattr.gid.gid;
+    }
+    if (sattr.atime.set_it)
+    {
+        attrobj["atime"] = nfstime_to_str(sattr.atime.atime);
+    }
+    if (sattr.mtime.set_it)
+    {
+        attrobj["mtime"] = nfstime_to_str(sattr.mtime.mtime);
+    }
+}
+
+// Send the reply of a creation operation and free its state.
+// T is the reply type of the procedure and Tok its "resok" part.
+// On success the reply carries the handle and the attributes of the new object,
+// on failure only the NFS status mapped from the error code.
+template<class T, class Tok> static void kv_create_reply(kv_create_state *st, int res)
+{
+    T *reply = (T*)st->rop->reply;
+    if (res >= 0)
+    {
+        *reply = (T){
+            .status = NFS3_OK,
+            .resok = (Tok){
+                .obj = {
+                    .handle_follows = 1,
+                    .handle = xdr_copy_string(st->rop->xdrs, kv_fh(st->new_id)),
+                },
+                .obj_attributes = {
+                    .attributes_follow = 1,
+                    .attributes = get_kv_attributes(st->self, st->new_id, st->attrs),
+                },
+            },
+        };
+    }
+    else
+    {
+        *reply = (T){ .status = vitastor_nfs_map_err(-res) };
+    }
+    rpc_queue_reply(st->rop);
+    // The operation is finished, nothing refers to the state anymore
+    delete st;
+}
+
+// CREATE handler: creates a regular file.
+// In NFS_EXCLUSIVE mode only the verifier is stored; in NFS_UNCHECKED mode the
+// requested attributes (and the size, if given) are stored.
+// Returns 1; the reply is sent from the completion callback.
+int kv_nfs3_create_proc(void *opaque, rpc_op_t *rop)
+{
+    auto args = (CREATE3args*)rop->request;
+    kv_create_state *st = new kv_create_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->dir_ino = kv_fh_inode(args->where.dir);
+    st->filename = args->where.name;
+    st->exclusive = args->how.mode == NFS_EXCLUSIVE;
+    if (st->exclusive)
+    {
+        // Remember the verifier both in the state and in the stored attributes
+        st->verf = *(uint64_t*)&args->how.verf;
+        st->attrobj["verf"] = st->verf;
+    }
+    else if (args->how.mode == NFS_UNCHECKED)
+    {
+        kv_create_setattr(st->attrobj, args->how.obj_attributes);
+        if (args->how.obj_attributes.size.set_it)
+        {
+            st->attrobj["size"] = (uint64_t)args->how.obj_attributes.size.size;
+            st->attrobj["empty"] = true;
+        }
+    }
+    st->cb = [st](int res) { kv_create_reply<CREATE3res, CREATE3resok>(st, res); };
+    kv_do_create(st);
+    return 1;
+}
+
+// MKDIR handler: creates a directory which remembers its parent inode in "parent_ino".
+// Returns 1; the reply is sent from the completion callback.
+int kv_nfs3_mkdir_proc(void *opaque, rpc_op_t *rop)
+{
+    auto args = (MKDIR3args*)rop->request;
+    kv_create_state *st = new kv_create_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->dir_ino = kv_fh_inode(args->where.dir);
+    st->filename = args->where.name;
+    // Fixed attributes first, then the ones requested by the client
+    st->attrobj["type"] = "dir";
+    st->attrobj["parent_ino"] = st->dir_ino;
+    kv_create_setattr(st->attrobj, args->attributes);
+    st->cb = [st](int res) { kv_create_reply<MKDIR3res, MKDIR3resok>(st, res); };
+    kv_do_create(st);
+    return 1;
+}
+
+// SYMLINK handler: creates an object of type "link" with the target stored in "symlink".
+// Returns 1; the reply is sent from the completion callback.
+int kv_nfs3_symlink_proc(void *opaque, rpc_op_t *rop)
+{
+    auto args = (SYMLINK3args*)rop->request;
+    kv_create_state *st = new kv_create_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->dir_ino = kv_fh_inode(args->where.dir);
+    st->filename = args->where.name;
+    // Fixed attributes first, then the ones requested by the client
+    st->attrobj["type"] = "link";
+    st->attrobj["symlink"] = (std::string)args->symlink.symlink_data;
+    kv_create_setattr(st->attrobj, args->symlink.symlink_attributes);
+    st->cb = [st](int res) { kv_create_reply<SYMLINK3res, SYMLINK3resok>(st, res); };
+    kv_do_create(st);
+    return 1;
+}
+
+// MKNOD handler: creates a character/block device, a socket or a FIFO.
+// Any other requested type is answered with NFS3ERR_INVAL right away (return value 0).
+// Otherwise returns 1 and the reply is sent from the completion callback.
+int kv_nfs3_mknod_proc(void *opaque, rpc_op_t *rop)
+{
+    auto args = (MKNOD3args*)rop->request;
+    const bool is_device = (args->what.type == NF3CHR || args->what.type == NF3BLK);
+    const bool is_sock_or_fifo = (args->what.type == NF3SOCK || args->what.type == NF3FIFO);
+    if (!is_device && !is_sock_or_fifo)
+    {
+        *(MKNOD3res*)rop->reply = (MKNOD3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        return 0;
+    }
+    kv_create_state *st = new kv_create_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->dir_ino = kv_fh_inode(args->where.dir);
+    st->filename = args->where.name;
+    if (is_device)
+    {
+        // Devices additionally carry their major/minor numbers
+        st->attrobj["type"] = (args->what.type == NF3CHR ? "chr" : "blk");
+        st->attrobj["major"] = (uint64_t)args->what.chr_device.spec.specdata1;
+        st->attrobj["minor"] = (uint64_t)args->what.chr_device.spec.specdata2;
+        kv_create_setattr(st->attrobj, args->what.chr_device.dev_attributes);
+    }
+    else
+    {
+        st->attrobj["type"] = (args->what.type == NF3SOCK ? "sock" : "fifo");
+        kv_create_setattr(st->attrobj, args->what.sock_attributes);
+    }
+    st->cb = [st](int res) { kv_create_reply<MKNOD3res, MKNOD3resok>(st, res); };
+    kv_do_create(st);
+    return 1;
+}
--- a/src/nfs_kv_getattr.cpp
+++ b/src/nfs_kv_getattr.cpp
@ -0,0 +1,76 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - GETATTR
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// Attributes are always stored in the inode
+//
+// Read inode <ino> from the database and pass the result to cb(res, value, ientry):
+// - a missing root inode is reported as success with an empty value and { "type": "dir" }
+// - read errors are passed through with an empty value and a null JSON
+// - unparsable JSON is reported as -EIO together with the raw value
+void kv_read_inode(nfs_client_t *self, uint64_t ino,
+    std::function<void(int res, const std::string & value, json11::Json ientry)> cb,
+    bool allow_cache)
+{
+    self->parent->db->get(kv_inode_key(ino), [=](int res, const std::string & value)
+    {
+        if (res == -ENOENT && ino == KV_ROOT_INODE)
+        {
+            // Allow root inode to not exist
+            cb(0, "", json11::Json(json11::Json::object{ { "type", "dir" } }));
+            return;
+        }
+        if (res < 0)
+        {
+            // "Not found" is an expected outcome, don't log it
+            if (res != -ENOENT)
+            {
+                fprintf(stderr, "Error reading inode %s: %s (code %d)\n", kv_inode_key(ino).c_str(), strerror(-res), res);
+            }
+            cb(res, "", json11::Json());
+            return;
+        }
+        std::string parse_err;
+        auto ientry = json11::Json::parse(value, parse_err);
+        if (parse_err != "")
+        {
+            fprintf(stderr, "Invalid JSON in inode %s = %s: %s\n", kv_inode_key(ino).c_str(), value.c_str(), parse_err.c_str());
+            res = -EIO;
+        }
+        cb(res, value, ientry);
+    }, allow_cache);
+}
+
+// GETATTR handler: returns the attributes of the inode referenced by the file handle.
+// An invalid handle is answered with NFS3ERR_INVAL right away (return value 0).
+// Otherwise returns 1 and the reply is sent after the inode is read.
+int kv_nfs3_getattr_proc(void *opaque, rpc_op_t *rop)
+{
+    auto self = (nfs_client_t*)opaque;
+    auto args = (GETATTR3args*)rop->request;
+    auto reply = (GETATTR3res*)rop->reply;
+    std::string fh = args->object;
+    auto ino = kv_fh_inode(fh);
+    if (self->parent->trace)
+    {
+        fprintf(stderr, "[%d] GETATTR %ju\n", self->nfs_fd, ino);
+    }
+    if (!kv_fh_valid(fh))
+    {
+        *reply = (GETATTR3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        return 0;
+    }
+    kv_read_inode(self, ino, [=](int res, const std::string & value, json11::Json attrs)
+    {
+        if (res >= 0)
+        {
+            *reply = (GETATTR3res){
+                .status = NFS3_OK,
+                .resok = (GETATTR3resok){
+                    .obj_attributes = get_kv_attributes(self, ino, attrs),
+                },
+            };
+        }
+        else
+        {
+            *reply = (GETATTR3res){ .status = vitastor_nfs_map_err(-res) };
+        }
+        rpc_queue_reply(rop);
+    });
+    return 1;
+}
--- a/src/nfs_kv_link.cpp
+++ b/src/nfs_kv_link.cpp
@ -0,0 +1,189 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - LINK
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// State of one LINK operation, shared between the steps of nfs_kv_continue_link()
+struct nfs_kv_link_state
+{
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Inode being linked
+    uint64_t ino = 0;
+    // Directory and name of the new entry
+    uint64_t dir_ino = 0;
+    std::string filename;
+    // Inode as read from the database: raw text (used for the CAS comparison) and parsed JSON.
+    // ientry is replaced with the updated inode before it's written back.
+    std::string ientry_text;
+    json11::Json ientry;
+    // Set when the inode update has to be repeated after a CAS failure
+    bool retrying = false;
+    // Number of reads still in flight
+    int wait = 0;
+    // Results of the last operations
+    int res = 0, res2 = 0;
+    // Completion callback, receives the final result
+    std::function<void(int)> cb;
+};
+
+// Resumable implementation of LINK. The function is entered with the number of the step
+// to continue from: 0 starts the operation, and every asynchronous callback re-enters it
+// with the next state. st->cb is called with the final result exactly once.
+static void nfs_kv_continue_link(nfs_kv_link_state *st, int state)
+{
+    // 1) Read the source inode
+    // 2) If it's a directory - fail with -EISDIR
+    // 3) Create the new direntry with the same inode reference
+    // 4) Update the inode entry with refcount++
+    // 5) Retry update if CAS failed but the inode exists
+    // 6) Otherwise fail and remove the new direntry
+    // Yeah we may leave a bad direntry if we crash
+    // But the other option is to possibly leave an inode with too big refcount
+    if (state == 0)      {}
+    else if (state == 1) goto resume_1;
+    else if (state == 2) goto resume_2;
+    else if (state == 3) goto resume_3;
+    else if (state == 4) goto resume_4;
+    else
+    {
+        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_link()");
+        abort();
+    }
+resume_0:
+    // Check that the source inode exists and is not a directory
+    // The target directory is only checked during the first pass, so a retry waits for 1 read
+    st->wait = st->retrying ? 1 : 2;
+    st->res2 = 0;
+    kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs)
+    {
+        st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? -EISDIR : 0) : res;
+        st->ientry_text = value;
+        st->ientry = attrs;
+        if (!--st->wait)
+            nfs_kv_continue_link(st, 1);
+    });
+    if (!st->retrying)
+    {
+        // Check that the new directory exists
+        kv_read_inode(st->self, st->dir_ino, [st](int res, const std::string & value, json11::Json attrs)
+        {
+            st->res2 = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res;
+            if (!--st->wait)
+                nfs_kv_continue_link(st, 1);
+        });
+    }
+    return;
+resume_1:
+    if (st->res < 0 || st->res2 < 0)
+    {
+        if (st->retrying)
+        {
+            // Our direntry was already written during the first pass,
+            // so it has to be removed instead of just returning the error
+            if (st->res >= 0)
+                st->res = st->res2;
+            goto rollback_direntry;
+        }
+        auto cb = std::move(st->cb);
+        cb(st->res < 0 ? st->res : st->res2);
+        return;
+    }
+    // Write the new direntry
+    if (!st->retrying)
+    {
+        st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename),
+            json11::Json(json11::Json::object{ { "ino", st->ino } }).dump(), [st](int res)
+        {
+            st->res = res;
+            nfs_kv_continue_link(st, 2);
+        }, [st](int res, const std::string & old_value)
+        {
+            // Only create the entry if it doesn't exist
+            return res == -ENOENT;
+        });
+        return;
+resume_2:
+        if (st->res < 0)
+        {
+            auto cb = std::move(st->cb);
+            cb(st->res);
+            return;
+        }
+    }
+    // Increase inode refcount
+    {
+        auto new_ientry = st->ientry.object_items();
+        auto nlink = new_ientry["nlink"].uint64_value();
+        // A missing link count means 1 link
+        new_ientry["nlink"] = nlink ? nlink+1 : 2;
+        st->ientry = new_ientry;
+    }
+    st->self->parent->db->set(kv_inode_key(st->ino), st->ientry.dump(), [st](int res)
+    {
+        st->res = res;
+        nfs_kv_continue_link(st, 3);
+    }, [st](int res, const std::string & old_value)
+    {
+        // Only update the inode if it's still exactly the one we read
+        st->res2 = res;
+        return res == 0 && old_value == st->ientry_text;
+    });
+    return;
+resume_3:
+    if (st->res2 == -ENOENT)
+    {
+        st->res = -ENOENT;
+    }
+    if (st->res == -EAGAIN)
+    {
+        // Re-read inode and retry
+        st->retrying = true;
+        goto resume_0;
+    }
+rollback_direntry:
+    if (st->res < 0)
+    {
+        // Maybe inode was deleted in the meantime, delete our direntry
+        st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res)
+        {
+            st->res2 = res;
+            nfs_kv_continue_link(st, 4);
+        });
+        return;
+resume_4:
+        if (st->res2 < 0)
+        {
+            fprintf(stderr, "Warning: failed to delete new linked direntry %ju/%s: %s (code %d)\n",
+                st->dir_ino, st->filename.c_str(), strerror(-st->res2), st->res2);
+        }
+    }
+    auto cb = std::move(st->cb);
+    cb(st->res);
+}
+
+// FIXME: We'll need some tests for the FS
+// LINK handler: creates a new directory entry pointing to an existing non-directory inode.
+// Invalid handles and empty names are answered with NFS3ERR_INVAL right away (return value 0).
+// Otherwise returns 1 and the reply is sent when nfs_kv_continue_link() finishes.
+int kv_nfs3_link_proc(void *opaque, rpc_op_t *rop)
+{
+    auto st = new nfs_kv_link_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    LINK3args *args = (LINK3args*)rop->request;
+    st->ino = kv_fh_inode(args->file);
+    st->dir_ino = kv_fh_inode(args->link.dir);
+    st->filename = args->link.name;
+    if (st->self->parent->trace)
+        fprintf(stderr, "[%d] LINK %ju -> %ju/%s\n", st->self->nfs_fd, st->ino, st->dir_ino, st->filename.c_str());
+    if (!st->ino || !st->dir_ino || st->filename == "")
+    {
+        LINK3res *reply = (LINK3res*)rop->reply;
+        *reply = (LINK3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        delete st;
+        return 0;
+    }
+    st->cb = [st](int res)
+    {
+        LINK3res *reply = (LINK3res*)st->rop->reply;
+        if (res < 0)
+        {
+            // res is a negative errno; pass it negated like the other handlers do
+            *reply = (LINK3res){ .status = vitastor_nfs_map_err(-res) };
+        }
+        else
+        {
+            *reply = (LINK3res){
+                .status = NFS3_OK,
+                .resok = (LINK3resok){
+                    .file_attributes = (post_op_attr){
+                        .attributes_follow = 1,
+                        // st->ientry already contains the increased link count here
+                        .attributes = get_kv_attributes(st->self, st->ino, st->ientry),
+                    },
+                },
+            };
+        }
+        rpc_queue_reply(st->rop);
+        delete st;
+    };
+    nfs_kv_continue_link(st, 0);
+    return 1;
+}
--- a/src/nfs_kv_lookup.cpp
+++ b/src/nfs_kv_lookup.cpp
@ -0,0 +1,104 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - LOOKUP, READLINK
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// NFS3 LOOKUP handler: reads the direntry of args->what.name inside directory
+// args->what.dir, then reads the inode it refers to and replies with its
+// file handle and attributes.
+// Returns 0 when the reply was queued immediately, 1 when it will be queued
+// from the database callbacks.
+int kv_nfs3_lookup_proc(void *opaque, rpc_op_t *rop)
+{
+    auto client = (nfs_client_t*)opaque;
+    auto lookup_args = (LOOKUP3args*)rop->request;
+    auto result = (LOOKUP3res*)rop->reply;
+    inode_t dir_ino = kv_fh_inode(lookup_args->what.dir);
+    std::string filename = lookup_args->what.name;
+    if (client->parent->trace)
+    {
+        fprintf(stderr, "[%d] LOOKUP %ju/%s\n", client->nfs_fd, dir_ino, filename.c_str());
+    }
+    if (!dir_ino || filename == "")
+    {
+        *result = (LOOKUP3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        return 0;
+    }
+    auto direntry_key = kv_direntry_key(dir_ino, filename);
+    client->parent->db->get(direntry_key, [client, result, rop, direntry_key](int get_res, const std::string & direntry_text)
+    {
+        if (get_res < 0)
+        {
+            *result = (LOOKUP3res){ .status = vitastor_nfs_map_err(-get_res) };
+            rpc_queue_reply(rop);
+            return;
+        }
+        std::string parse_err;
+        auto direntry = json11::Json::parse(direntry_text, parse_err);
+        if (parse_err != "")
+        {
+            fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n", direntry_key.c_str(), direntry_text.c_str(), parse_err.c_str());
+            *result = (LOOKUP3res){ .status = NFS3ERR_IO };
+            rpc_queue_reply(rop);
+            return;
+        }
+        uint64_t ino = direntry["ino"].uint64_value();
+        kv_read_inode(client, ino, [client, result, rop, ino](int read_res, const std::string & ientry_text, json11::Json ientry)
+        {
+            if (read_res < 0)
+            {
+                // A direntry whose inode is missing is reported as an I/O error, not as "no such file"
+                int mapped = read_res == -ENOENT ? -EIO : read_res;
+                *result = (LOOKUP3res){ .status = vitastor_nfs_map_err(mapped) };
+                rpc_queue_reply(rop);
+                return;
+            }
+            *result = (LOOKUP3res){
+                .status = NFS3_OK,
+                .resok = (LOOKUP3resok){
+                    .object = xdr_copy_string(rop->xdrs, kv_fh(ino)),
+                    .obj_attributes = {
+                        .attributes_follow = 1,
+                        .attributes = get_kv_attributes(client, ino, ientry),
+                    },
+                },
+            };
+            rpc_queue_reply(rop);
+        });
+    });
+    return 1;
+}
+
+// NFS3 READLINK handler: reads the inode behind args->symlink and replies with
+// its "symlink" attribute when the inode type is "link".
+// Returns 0 when the reply was queued immediately, 1 when it will be queued
+// from the inode read callback.
+int kv_nfs3_readlink_proc(void *opaque, rpc_op_t *rop)
+{
+    auto client = (nfs_client_t*)opaque;
+    auto rl_args = (READLINK3args*)rop->request;
+    if (client->parent->trace)
+    {
+        fprintf(stderr, "[%d] READLINK %ju\n", client->nfs_fd, kv_fh_inode(rl_args->symlink));
+    }
+    auto result = (READLINK3res*)rop->reply;
+    // Reject an invalid file handle and the root entry
+    const bool bad_handle = !kv_fh_valid(rl_args->symlink) || rl_args->symlink == NFS_ROOT_HANDLE;
+    if (bad_handle)
+    {
+        *result = (READLINK3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        return 0;
+    }
+    kv_read_inode(client, kv_fh_inode(rl_args->symlink), [result, rop](int read_res, const std::string & ientry_text, json11::Json ientry)
+    {
+        if (read_res >= 0 && ientry["type"] == "link")
+        {
+            *result = (READLINK3res){
+                .status = NFS3_OK,
+                .resok = (READLINK3resok){
+                    .data = xdr_copy_string(rop->xdrs, ientry["symlink"].string_value()),
+                },
+            };
+        }
+        else if (read_res < 0)
+        {
+            *result = (READLINK3res){ .status = vitastor_nfs_map_err(-read_res) };
+        }
+        else
+        {
+            // The inode exists but is not a symbolic link
+            *result = (READLINK3res){ .status = NFS3ERR_INVAL };
+        }
+        rpc_queue_reply(rop);
+    });
+    return 1;
+}
--- a/src/nfs_kv_read.cpp
+++ b/src/nfs_kv_read.cpp
@ -0,0 +1,162 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - READ
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// Per-request context of one READ, shared between kv_nfs3_read_proc()
+// and the nfs_kv_continue_read() state machine.
+struct nfs_kv_read_state
+{
+    // Request
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Passed to kv_read_inode(); cleared before a retry
+    bool allow_cache = true;
+    inode_t ino = 0;
+    // Requested byte range
+    uint64_t offset = 0;
+    uint64_t size = 0;
+    // Completion callback, receives the final result code
+    std::function<void(int)> cb;
+    // Intermediate state
+    int res = 0;
+    json11::Json ientry;
+    // Size and offset of the aligned range that is actually read
+    uint64_t aligned_size = 0;
+    uint64_t aligned_offset = 0;
+    // Buffer of aligned_size bytes allocated with malloc_or_die()
+    uint8_t *aligned_buf = NULL;
+    cluster_op_t *op = NULL;
+    // Points inside aligned_buf at the first requested byte
+    uint8_t *buf = NULL;
+};
+
+// Resumable READ state machine. Called with state 0 to start and re-entered with
+// states 1..3 from asynchronous callbacks; each state jumps to its resume_N label.
+//
+// Flow:
+// - if the read starts below shared_inode_threshold, the inode entry is read first and,
+//   when it has a non-zero "shared_ino", the data is read from that shared inode at
+//   "shared_offset" and validated against the shared_file_header_t in front of it;
+// - otherwise the data is read from the file's own inode at an aligned offset.
+//
+// On completion st->cb is called with 0 on success or a negative error code.
+// On success st->aligned_buf owns the data and st->buf points at the first requested
+// byte; on failure the buffer is released here and st->aligned_buf is NULL.
+static void nfs_kv_continue_read(nfs_kv_read_state *st, int state)
+{
+    if (state == 0)      {}
+    else if (state == 1) goto resume_1;
+    else if (state == 2) goto resume_2;
+    else if (state == 3) goto resume_3;
+    else
+    {
+        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_read()");
+        abort();
+    }
+    if (st->offset + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold)
+    {
+        kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs)
+        {
+            st->res = res;
+            st->ientry = attrs;
+            nfs_kv_continue_read(st, 1);
+        }, st->allow_cache);
+        return;
+resume_1:
+        // Only regular files can be read
+        if (st->res < 0 || kv_map_type(st->ientry["type"].string_value()) != NF3REG)
+        {
+            auto cb = std::move(st->cb);
+            cb(st->res < 0 ? st->res : -EINVAL);
+            return;
+        }
+        if (st->ientry["shared_ino"].uint64_value() != 0)
+        {
+            // The file is stored inside a shared inode: read header + data in one request
+            st->aligned_size = align_shared_size(st->self, st->offset+st->size);
+            st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
+            st->buf = st->aligned_buf + sizeof(shared_file_header_t) + st->offset;
+            st->op = new cluster_op_t;
+            st->op->opcode = OSD_OP_READ;
+            st->op->inode = st->self->parent->fs_base_inode + st->ientry["shared_ino"].uint64_value();
+            st->op->offset = st->ientry["shared_offset"].uint64_value();
+            if (st->offset+st->size > st->ientry["size"].uint64_value())
+            {
+                // Read only what the file occupies and zero-fill the rest of the buffer
+                st->op->len = align_shared_size(st->self, st->ientry["size"].uint64_value());
+                memset(st->aligned_buf+st->op->len, 0, st->aligned_size-st->op->len);
+            }
+            else
+                st->op->len = st->aligned_size;
+            st->op->iov.push_back(st->aligned_buf, st->op->len);
+            st->op->callback = [st](cluster_op_t *op)
+            {
+                st->res = op->retval == op->len ? 0 : op->retval;
+                delete op;
+                nfs_kv_continue_read(st, 2);
+            };
+            st->self->parent->cli->execute(st->op);
+            return;
+resume_2:
+            if (st->res < 0)
+            {
+                // The completion callback only takes ownership of the buffer on success
+                free(st->aligned_buf);
+                st->aligned_buf = NULL;
+                auto cb = std::move(st->cb);
+                cb(st->res);
+                return;
+            }
+            auto hdr = ((shared_file_header_t*)st->aligned_buf);
+            if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino ||
+                align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value()))
+            {
+                // Got unrelated data - retry from the beginning
+                free(st->aligned_buf);
+                st->aligned_buf = NULL;
+                st->allow_cache = false;
+                nfs_kv_continue_read(st, 0);
+                return;
+            }
+            auto cb = std::move(st->cb);
+            cb(0);
+            return;
+        }
+    }
+    // Regular (non-shared) file: read the aligned range covering [offset, offset+size)
+    // NOTE(review): the upper bound adds a full pool_alignment instead of pool_alignment-1,
+    // so one extra block is read when offset+size is already aligned - confirm if intended
+    st->aligned_offset = (st->offset & ~(st->self->parent->pool_alignment-1));
+    st->aligned_size = ((st->offset + st->size + st->self->parent->pool_alignment) &
+        ~(st->self->parent->pool_alignment-1)) - st->aligned_offset;
+    st->aligned_buf = (uint8_t*)malloc_or_die(st->aligned_size);
+    st->buf = st->aligned_buf + st->offset - st->aligned_offset;
+    st->op = new cluster_op_t;
+    st->op->opcode = OSD_OP_READ;
+    st->op->inode = st->self->parent->fs_base_inode + st->ino;
+    st->op->offset = st->aligned_offset;
+    st->op->len = st->aligned_size;
+    st->op->iov.push_back(st->aligned_buf, st->aligned_size);
+    st->op->callback = [st](cluster_op_t *op)
+    {
+        // A full read is reported as 0 because the completion callback treats only 0 as success;
+        // a non-negative result that differs from the requested length is an I/O error
+        st->res = op->retval == op->len ? 0 : (op->retval < 0 ? op->retval : -EIO);
+        delete op;
+        nfs_kv_continue_read(st, 3);
+    };
+    st->self->parent->cli->execute(st->op);
+    return;
+resume_3:
+    if (st->res < 0)
+    {
+        // The completion callback only takes ownership of the buffer on success
+        free(st->aligned_buf);
+        st->aligned_buf = NULL;
+    }
+    auto cb = std::move(st->cb);
+    cb(st->res);
+    return;
+}
+
+// NFS3 READ handler: validates the request, then runs nfs_kv_continue_read().
+// Returns 0 when the reply was queued immediately (invalid arguments), 1 when it
+// will be queued from the completion callback.
+int kv_nfs3_read_proc(void *opaque, rpc_op_t *rop)
+{
+    auto read_args = (READ3args*)rop->request;
+    auto ino = kv_fh_inode(read_args->file);
+    if (!ino || read_args->count > MAX_REQUEST_SIZE)
+    {
+        auto result = (READ3res*)rop->reply;
+        *result = (READ3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        return 0;
+    }
+    nfs_kv_read_state *st = new nfs_kv_read_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->ino = ino;
+    st->offset = read_args->offset;
+    st->size = read_args->count;
+    // Completion callback: builds the reply, queues it and releases the state
+    st->cb = [st](int res)
+    {
+        auto result = (READ3res*)st->rop->reply;
+        *result = (READ3res){ .status = vitastor_nfs_map_err(res) };
+        if (res == 0)
+        {
+            // Register the data buffer with the XDR context of this request
+            xdr_add_malloc(st->rop->xdrs, st->aligned_buf);
+            auto & ok = result->resok;
+            ok.data.data = (char*)st->buf;
+            ok.data.size = st->size;
+            ok.count = st->size;
+            ok.eof = 0;
+        }
+        rpc_queue_reply(st->rop);
+        delete st;
+    };
+    nfs_kv_continue_read(st, 0);
+    return 1;
+}
--- a/src/nfs_kv_readdir.cpp
+++ b/src/nfs_kv_readdir.cpp
@ -0,0 +1,371 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - READDIR, READDIRPLUS
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// Rounds len up to the next multiple of 4
+static unsigned len_pad4(unsigned len)
+{
+    return (len + 3u) & ~3u;
+}
+
+// Per-request context of one READDIR / READDIRPLUS, shared between
+// nfs3_readdir_common(), nfs_kv_continue_readdir() and kv_getattr_next().
+struct nfs_kv_readdir_state
+{
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Request:
+    // true for READDIRPLUS (entries carry attributes and file handles)
+    bool is_plus = false;
+    uint64_t cookie = 0;
+    uint64_t cookieverf = 0;
+    uint64_t dir_ino = 0;
+    // Maximum reply size requested by the client
+    uint64_t maxcount = 0;
+    std::function<void(int)> cb;
+    // State:
+    int res = 0;
+    // Key prefix of the directory's entries and the key to start listing from
+    std::string prefix, start;
+    // Database listing handle; NULL when no listing is open
+    void *list_handle = NULL;
+    uint64_t parent_ino = 0;
+    std::string ientry_text, parent_ientry_text;
+    json11::Json ientry, parent_ientry;
+    // Key and value most recently returned by the listing
+    std::string cur_key, cur_value;
+    // Running estimate of the encoded reply size
+    int reply_size = 0;
+    // Number of listed entries still to skip; derived from the 64-bit cookie
+    uint64_t to_skip = 0;
+    // Cookie assigned to the next returned entry
+    uint64_t offset = 0;
+    // Attribute reads in flight and index of the next entry to read attributes for
+    int getattr_running = 0, getattr_cur = 0;
+    // Result:
+    bool eof = false;
+    //uint64_t cookieverf = 0; // same field
+    std::vector<entryplus3> entries;
+};
+
+static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state);
+
+// READDIRPLUS only: starts attribute reads for entries that do not have one yet,
+// keeping at most readdir_getattr_parallel reads in flight. When the last read
+// finishes and the listing is already closed, resumes nfs_kv_continue_readdir()
+// at state 4.
+static void kv_getattr_next(nfs_kv_readdir_state *st)
+{
+    if (!st->is_plus)
+    {
+        return;
+    }
+    while (st->getattr_cur < st->entries.size() && st->getattr_running < st->self->parent->readdir_getattr_parallel)
+    {
+        const auto slot = st->getattr_cur++;
+        st->getattr_running++;
+        kv_read_inode(st->self, st->entries[slot].fileid, [st, slot](int read_res, const std::string & ientry_text, json11::Json ientry)
+        {
+            if (read_res == 0)
+            {
+                // FIXME: maybe do not read parent attributes and leave them to a GETATTR?
+                post_op_attr found = (post_op_attr){
+                    .attributes_follow = 1,
+                    .attributes = get_kv_attributes(st->self, st->entries[slot].fileid, ientry),
+                };
+                st->entries[slot].name_attributes = found;
+            }
+            st->getattr_running -= 1;
+            // Refill the pipeline before checking for completion
+            kv_getattr_next(st);
+            const bool all_done = st->getattr_running == 0 && !st->list_handle;
+            if (all_done)
+            {
+                nfs_kv_continue_readdir(st, 4);
+            }
+        });
+    }
+}
+
+// Resumable state machine that builds a READDIR / READDIRPLUS reply.
+// Called with state 0 to start and re-entered with states 1..4 from asynchronous
+// callbacks; each state jumps to its resume_N label.
+//
+// Steps:
+// 1) for cookie <= 1, read the directory inode (and its parent) and add "." / "..";
+// 2) pick the listing start key from the saved cookie, or skip entries by count
+//    when the cookie is unknown;
+// 3) list direntries under the directory prefix until the reply is full or the
+//    prefix ends, starting attribute reads for READDIRPLUS;
+// 4) wait for outstanding attribute reads, link the entries and call st->cb.
+//
+// Cookie numbering: the first listed entry gets cookie 2 and every following entry
+// gets the previous cookie + 1, so a request cookie C means "C-1 listed entries
+// were already returned".
+static void nfs_kv_continue_readdir(nfs_kv_readdir_state *st, int state)
+{
+    if (state == 0)      {}
+    else if (state == 1) goto resume_1;
+    else if (state == 2) goto resume_2;
+    else if (state == 3) goto resume_3;
+    else if (state == 4) goto resume_4;
+    else
+    {
+        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_readdir()");
+        abort();
+    }
+    // Limit results based on maximum reply size
+    // Sadly we have to calculate reply size by hand
+    // reply without entries is 4+4+(dir_attributes ? sizeof(fattr3) : 0)+8+4 bytes
+    st->reply_size = 20;
+    if (st->reply_size > st->maxcount)
+    {
+        // Error, too small max reply size
+        auto cb = std::move(st->cb);
+        cb(-NFS3ERR_TOOSMALL);
+        return;
+    }
+    // Add . and ..
+    // NOTE(review): neither "." nor ".." gets an explicit cookie (both stay 0) - confirm this is intended
+    if (st->cookie <= 1)
+    {
+        kv_read_inode(st->self, st->dir_ino, [st](int res, const std::string & value, json11::Json ientry)
+        {
+            st->res = res;
+            st->ientry_text = value;
+            st->ientry = ientry;
+            nfs_kv_continue_readdir(st, 1);
+        });
+        return;
+resume_1:
+        if (st->res < 0)
+        {
+            auto cb = std::move(st->cb);
+            cb(st->res);
+            return;
+        }
+        if (st->cookie == 0)
+        {
+            auto fh = kv_fh(st->dir_ino);
+            auto entry_size = 20 + 4/*len_pad4(".")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
+            if (st->reply_size + entry_size > st->maxcount)
+            {
+                auto cb = std::move(st->cb);
+                cb(-NFS3ERR_TOOSMALL);
+                return;
+            }
+            entryplus3 dot = {};
+            dot.name = xdr_copy_string(st->rop->xdrs, ".");
+            dot.fileid = st->dir_ino;
+            dot.name_attributes = (post_op_attr){
+                .attributes_follow = 1,
+                .attributes = get_kv_attributes(st->self, st->dir_ino, st->ientry),
+            };
+            dot.name_handle = (post_op_fh3){
+                .handle_follows = 1,
+                .handle = xdr_copy_string(st->rop->xdrs, fh),
+            };
+            st->entries.push_back(dot);
+            st->reply_size += entry_size;
+        }
+        st->parent_ino = st->ientry["parent_ino"].uint64_value();
+        if (st->parent_ino)
+        {
+            kv_read_inode(st->self, st->ientry["parent_ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry)
+            {
+                st->res = res;
+                st->parent_ientry_text = value;
+                st->parent_ientry = ientry;
+                nfs_kv_continue_readdir(st, 2);
+            });
+            return;
+resume_2:
+            if (st->res < 0)
+            {
+                auto cb = std::move(st->cb);
+                cb(st->res);
+                return;
+            }
+        }
+        auto fh = kv_fh(st->parent_ino);
+        auto entry_size = 20 + 4/*len_pad4("..")*/ + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
+        if (st->reply_size + entry_size > st->maxcount)
+        {
+            st->eof = false;
+            auto cb = std::move(st->cb);
+            cb(0);
+            return;
+        }
+        entryplus3 dotdot = {};
+        dotdot.name = xdr_copy_string(st->rop->xdrs, "..");
+        // NOTE(review): fileid is the directory itself while attributes and handle refer to the parent - confirm
+        dotdot.fileid = st->dir_ino;
+        dotdot.name_attributes = (post_op_attr){
+            // FIXME: maybe do not read parent attributes and leave them to a GETATTR?
+            .attributes_follow = 1,
+            .attributes = get_kv_attributes(st->self,
+                st->parent_ino ? st->parent_ino : st->dir_ino,
+                st->parent_ino ? st->parent_ientry : st->ientry),
+        };
+        dotdot.name_handle = (post_op_fh3){
+            .handle_follows = 1,
+            .handle = xdr_copy_string(st->rop->xdrs, fh),
+        };
+        st->entries.push_back(dotdot);
+        st->reply_size += entry_size;
+    }
+    st->prefix = kv_direntry_key(st->dir_ino, "");
+    st->eof = true;
+    st->start = st->prefix;
+    if (st->cookie > 1)
+    {
+        auto lc_it = st->self->parent->kvfs->list_cookies.find((list_cookie_t){ st->dir_ino, st->cookieverf, st->cookie });
+        if (lc_it != st->self->parent->kvfs->list_cookies.end())
+        {
+            // Saved cookies hold the bare file name, so the directory prefix has to be
+            // put back to get the database key to resume from
+            st->start = st->prefix + std::string(lc_it->second.key);
+            // The entry the cookie refers to was already returned: skip it
+            // and continue numbering after it
+            st->to_skip = 1;
+            st->offset = st->cookie+1;
+        }
+        else
+        {
+            // Unknown cookie: list from the beginning of the directory and skip the
+            // cookie-1 entries (cookies 2..cookie) that were already returned
+            st->to_skip = st->cookie-1;
+            st->offset = st->cookie+1;
+            st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62));
+        }
+    }
+    else
+    {
+        st->to_skip = 0;
+        st->offset = 2;
+        st->cookieverf = ((uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62));
+    }
+    {
+        // Forget cookies saved for this verifier once the client has moved past them
+        auto lc_it = st->self->parent->kvfs->list_cookies.lower_bound((list_cookie_t){ st->dir_ino, st->cookieverf, 0 });
+        if (lc_it != st->self->parent->kvfs->list_cookies.end() &&
+            lc_it->first.dir_ino == st->dir_ino &&
+            lc_it->first.cookieverf == st->cookieverf &&
+            lc_it->first.cookie < st->cookie)
+        {
+            auto lc_start = lc_it;
+            while (lc_it != st->self->parent->kvfs->list_cookies.end() && lc_it->first.cookieverf == st->cookieverf)
+            {
+                lc_it++;
+            }
+            st->self->parent->kvfs->list_cookies.erase(lc_start, lc_it);
+        }
+    }
+    // "." and ".." already carry attributes, attribute reads start after them
+    st->getattr_cur = st->entries.size();
+    st->list_handle = st->self->parent->db->list_start(st->start);
+    st->self->parent->db->list_next(st->list_handle, [=](int res, const std::string & key, const std::string & value)
+    {
+        st->res = res;
+        st->cur_key = key;
+        st->cur_value = value;
+        nfs_kv_continue_readdir(st, 3);
+    });
+    return;
+    // The loop is only entered through resume_3; every iteration requests exactly
+    // one more item at its top and returns until the listing callback fires
+    while (st->list_handle)
+    {
+        st->self->parent->db->list_next(st->list_handle, NULL);
+        return;
+resume_3:
+        // Stop at the end of the listing or at the first key outside of the directory prefix
+        if (st->res == -ENOENT || st->cur_key.size() < st->prefix.size() || st->cur_key.substr(0, st->prefix.size()) != st->prefix)
+        {
+            st->self->parent->db->list_close(st->list_handle);
+            st->list_handle = NULL;
+            break;
+        }
+        if (st->to_skip > 0)
+        {
+            st->to_skip--;
+            continue;
+        }
+        std::string err;
+        auto direntry = json11::Json::parse(st->cur_value, err);
+        if (err != "")
+        {
+            fprintf(stderr, "readdir: direntry %s contains invalid JSON: %s, skipping\n",
+                st->cur_key.c_str(), st->cur_value.c_str());
+            continue;
+        }
+        auto ino = direntry["ino"].uint64_value();
+        auto name = kv_direntry_filename(st->cur_key);
+        auto fh = kv_fh(ino);
+        // 1 entry3 is (8+4+(filename_len+3)/4*4+8) bytes
+        // 1 entryplus3 is (8+4+(filename_len+3)/4*4+8
+        //   + 4+(name_attributes ? (sizeof(fattr3) = 84) : 0)
+        //   + 4+(name_handle ? 4+(handle_len+3)/4*4 : 0)) bytes
+        auto entry_size = 20 + len_pad4(name.size()) + (st->is_plus ? 8 + 88 + len_pad4(fh.size()) : 0);
+        if (st->reply_size + entry_size > st->maxcount)
+        {
+            // Reply is full, the client has to come back for the rest
+            st->eof = false;
+            st->self->parent->db->list_close(st->list_handle);
+            st->list_handle = NULL;
+            break;
+        }
+        st->reply_size += entry_size;
+        auto idx = st->entries.size();
+        st->entries.push_back((entryplus3){});
+        auto entry = &st->entries[idx];
+        entry->name = xdr_copy_string(st->rop->xdrs, name);
+        entry->fileid = ino;
+        entry->cookie = st->offset++;
+        // Remember the name behind this cookie so that the next request can resume from it
+        st->self->parent->kvfs->list_cookies[(list_cookie_t){ st->dir_ino, st->cookieverf, entry->cookie }] = { .key = entry->name };
+        if (st->is_plus)
+        {
+            entry->name_handle = (post_op_fh3){
+                .handle_follows = 1,
+                .handle = xdr_copy_string(st->rop->xdrs, fh),
+            };
+            kv_getattr_next(st);
+        }
+    }
+resume_4:
+    // kv_getattr_next() re-enters here when the last attribute read finishes
+    while (st->getattr_running > 0)
+    {
+        return;
+    }
+    // Chain the entries into the linked list expected by the XDR encoder
+    void *prev = NULL;
+    for (int i = 0; i < st->entries.size(); i++)
+    {
+        entryplus3 *entry = &st->entries[i];
+        if (prev)
+        {
+            if (st->is_plus)
+                ((entryplus3*)prev)->nextentry = entry;
+            else
+                ((entry3*)prev)->nextentry = (entry3*)entry;
+        }
+        prev = entry;
+    }
+    // Send reply
+    auto cb = std::move(st->cb);
+    cb(0);
+}
+
+// Shared entry point of READDIR and READDIRPLUS: copies the request arguments
+// into a new nfs_kv_readdir_state, installs the completion callback that fills
+// and queues the reply, and starts nfs_kv_continue_readdir().
+static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
+{
+    nfs_kv_readdir_state *st = new nfs_kv_readdir_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->is_plus = is_plus;
+    if (!st->is_plus)
+    {
+        auto plain_args = (READDIR3args*)rop->request;
+        st->dir_ino = kv_fh_inode(plain_args->dir);
+        st->cookie = plain_args->cookie;
+        st->cookieverf = *((uint64_t*)plain_args->cookieverf);
+        st->maxcount = plain_args->count;
+    }
+    else
+    {
+        auto plus_args = (READDIRPLUS3args*)rop->request;
+        st->dir_ino = kv_fh_inode(plus_args->dir);
+        st->cookie = plus_args->cookie;
+        st->cookieverf = *((uint64_t*)plus_args->cookieverf);
+        st->maxcount = plus_args->maxcount;
+    }
+    if (st->self->parent->trace)
+    {
+        fprintf(stderr, "[%d] READDIR %ju VERF %jx OFFSET %ju LIMIT %ju\n", st->self->nfs_fd, st->dir_ino, st->cookieverf, st->cookie, st->maxcount);
+    }
+    // Completion callback: the two reply types differ only in the entry type
+    st->cb = [st](int res)
+    {
+        const auto status = vitastor_nfs_map_err(res);
+        entryplus3 *first = st->entries.size() ? &st->entries[0] : NULL;
+        if (!st->is_plus)
+        {
+            auto out = (READDIR3res*)st->rop->reply;
+            *out = (READDIR3res){ .status = status };
+            *(uint64_t*)(out->resok.cookieverf) = st->cookieverf;
+            out->resok.reply.entries = (entry3*)first;
+            out->resok.reply.eof = st->eof;
+        }
+        else
+        {
+            auto out = (READDIRPLUS3res*)st->rop->reply;
+            *out = (READDIRPLUS3res){ .status = status };
+            *(uint64_t*)(out->resok.cookieverf) = st->cookieverf;
+            out->resok.reply.entries = first;
+            out->resok.reply.eof = st->eof;
+        }
+        rpc_queue_reply(st->rop);
+        delete st;
+    };
+    nfs_kv_continue_readdir(st, 0);
+}
+
+// NFS3 READDIR handler: plain listing without per-entry attributes and handles
+int kv_nfs3_readdir_proc(void *opaque, rpc_op_t *rop)
+{
+    const bool with_attrs = false;
+    nfs3_readdir_common(opaque, rop, with_attrs);
+    return 0;
+}
+
+// NFS3 READDIRPLUS handler: listing with per-entry attributes and file handles
+int kv_nfs3_readdirplus_proc(void *opaque, rpc_op_t *rop)
+{
+    const bool with_attrs = true;
+    nfs3_readdir_common(opaque, rop, with_attrs);
+    return 0;
+}
--- a/src/nfs_kv_remove.cpp
+++ b/src/nfs_kv_remove.cpp
@ -0,0 +1,313 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - REMOVE, RMDIR
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+#include "cli.h"
+
+// Per-request context of one REMOVE / RMDIR, used by nfs_kv_continue_delete()
+struct kv_del_state
+{
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Directory and name of the entry being removed
+    uint64_t dir_ino = 0;
+    std::string filename;
+    // Inode number taken from the direntry
+    uint64_t ino = 0;
+    // Database listing handle used for the RMDIR emptiness check
+    void *list_handle = NULL;
+    std::string prefix;
+    std::string list_key;
+    // Raw values as read from the database, compared again by the CAS callbacks
+    std::string direntry_text;
+    std::string ientry_text;
+    json11::Json direntry;
+    json11::Json ientry;
+    // Result of kv_map_type() for the inode
+    int type = 0;
+    // true for RMDIR, false for REMOVE
+    bool is_rmdir = false;
+    // NOTE(review): not referenced by the code visible here
+    bool rm_data = false;
+    int res = 0;
+    int res2 = 0;
+    // Completion callback, receives the final result code
+    std::function<void(int)> cb;
+};
+
+// Resumable state machine shared by REMOVE and RMDIR (st->is_rmdir selects which).
+// Called with state 0 to start and re-entered with states 1..7 from asynchronous
+// callbacks; each state jumps to its resume_N label.
+// On completion st->cb is called with 0 on success or a negative error code.
+static void nfs_kv_continue_delete(kv_del_state *st, int state)
+{
+    // Overall algorithm:
+    // 1) Get inode attributes and check that it's not a directory (REMOVE)
+    // 2) Get inode attributes and check that it is a directory (RMDIR)
+    // 3) Delete direntry with CAS
+    // 4) Check that the directory didn't contain files (RMDIR) and restore it if it did
+    // 5) Reduce inode refcount by 1 or delete inode
+    // 6) If regular file and inode is deleted: delete data
+    if (state == 0)      {}
+    else if (state == 1) goto resume_1;
+    else if (state == 2) goto resume_2;
+    else if (state == 3) goto resume_3;
+    else if (state == 4) goto resume_4;
+    else if (state == 5) goto resume_5;
+    else if (state == 6) goto resume_6;
+    else if (state == 7) goto resume_7;
+    else
+    {
+        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_delete()");
+        abort();
+    }
+    st->self->parent->db->get(kv_direntry_key(st->dir_ino, st->filename), [st](int res, const std::string & value)
+    {
+        st->res = res;
+        st->direntry_text = value;
+        nfs_kv_continue_delete(st, 1);
+    });
+    return;
+resume_1:
+    if (st->res < 0)
+    {
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    {
+        std::string err;
+        st->direntry = json11::Json::parse(st->direntry_text, err);
+        // This step is repeated after a CAS failure, so forget the inode found by the previous attempt
+        st->ino = 0;
+        if (err != "")
+        {
+            fprintf(stderr, "Invalid JSON in direntry %s = %s: %s, deleting\n",
+                kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), err.c_str());
+            // Just delete direntry and skip inode
+        }
+        else
+        {
+            st->ino = st->direntry["ino"].uint64_value();
+        }
+    }
+    // Without an inode number there is nothing to read or type-check:
+    // go straight to deleting the broken direntry
+    if (st->ino)
+    {
+        // Get inode
+        st->self->parent->db->get(kv_inode_key(st->ino), [st](int res, const std::string & value)
+        {
+            st->res = res;
+            st->ientry_text = value;
+            nfs_kv_continue_delete(st, 2);
+        });
+        return;
+resume_2:
+        if (st->res < 0)
+        {
+            fprintf(stderr, "error reading inode %s: %s (code %d)\n",
+                kv_inode_key(st->ino).c_str(), strerror(-st->res), st->res);
+            auto cb = std::move(st->cb);
+            cb(st->res);
+            return;
+        }
+        {
+            std::string err;
+            st->ientry = json11::Json::parse(st->ientry_text, err);
+            if (err != "")
+            {
+                fprintf(stderr, "Invalid JSON in inode %s = %s: %s, treating as a regular file\n",
+                    kv_inode_key(st->ino).c_str(), st->ientry_text.c_str(), err.c_str());
+            }
+        }
+        // (1-2) Check type
+        st->type = kv_map_type(st->ientry["type"].string_value());
+        if (st->type == -1 || st->is_rmdir != (st->type == NF3DIR))
+        {
+            auto cb = std::move(st->cb);
+            cb(st->is_rmdir ? -ENOTDIR : -EISDIR);
+            return;
+        }
+    }
+    // (3) Delete direntry with CAS
+    st->self->parent->db->del(kv_direntry_key(st->dir_ino, st->filename), [st](int res)
+    {
+        st->res = res;
+        nfs_kv_continue_delete(st, 3);
+    }, [st](int res, const std::string & value)
+    {
+        // Only delete the direntry if it still has the value read above
+        return value == st->direntry_text;
+    });
+    return;
+resume_3:
+    if (st->res == -EAGAIN)
+    {
+        // CAS failure, restart from the beginning
+        nfs_kv_continue_delete(st, 0);
+        return;
+    }
+    else if (st->res < 0 && st->res != -ENOENT)
+    {
+        fprintf(stderr, "failed to remove direntry %s: %s (code %d)\n",
+            kv_direntry_key(st->dir_ino, st->filename).c_str(), strerror(-st->res), st->res);
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    if (!st->ino)
+    {
+        // direntry contained invalid JSON and was deleted, finish
+        auto cb = std::move(st->cb);
+        cb(0);
+        return;
+    }
+    if (st->is_rmdir)
+    {
+        // (4) Check if directory actually is not empty
+        st->list_handle = st->self->parent->db->list_start(kv_direntry_key(st->ino, ""));
+        st->self->parent->db->list_next(st->list_handle, [st](int res, const std::string & key, const std::string & value)
+        {
+            st->res = res;
+            st->list_key = key;
+            st->self->parent->db->list_close(st->list_handle);
+            nfs_kv_continue_delete(st, 4);
+        });
+        return;
+resume_4:
+        st->prefix = kv_direntry_key(st->ino, "");
+        // The directory is empty when the listing ends or returns a key outside of its prefix.
+        // Any other listing error is not proof of emptiness and is handled as a failure below
+        if (st->res == -ENOENT || (st->res >= 0 &&
+            (st->list_key.size() < st->prefix.size() || st->list_key.substr(0, st->prefix.size()) != st->prefix)))
+        {
+            // OK, directory is empty
+        }
+        else
+        {
+            // Not OK, restore direntry: write the saved value back, but only if nobody
+            // created an entry with the same name in the meantime
+            st->self->parent->db->set(kv_direntry_key(st->dir_ino, st->filename), st->direntry_text, [st](int res)
+            {
+                st->res2 = res;
+                nfs_kv_continue_delete(st, 5);
+            }, [st](int res, const std::string & value)
+            {
+                return res == -ENOENT;
+            });
+            return;
+resume_5:
+            if (st->res2 < 0)
+            {
+                fprintf(stderr, "failed to restore direntry %s (%s): %s (code %d)",
+                    kv_direntry_key(st->dir_ino, st->filename).c_str(), st->direntry_text.c_str(), strerror(-st->res2), st->res2);
+                fprintf(stderr, " - inode %ju may be left as garbage\n", st->ino);
+            }
+            if (st->res < 0)
+            {
+                fprintf(stderr, "failed to list entries from %s: %s (code %d)\n",
+                    kv_direntry_key(st->ino, "").c_str(), strerror(-st->res), st->res);
+            }
+            auto cb = std::move(st->cb);
+            cb(st->res < 0 ? st->res : -ENOTEMPTY);
+            return;
+        }
+    }
+    // (5) Reduce inode refcount by 1 or delete inode
+    if (st->ientry["nlink"].uint64_value() > 1)
+    {
+        auto copy = st->ientry.object_items();
+        copy["nlink"] = st->ientry["nlink"].uint64_value()-1;
+        st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(copy).dump(), [st](int res)
+        {
+            st->res = res;
+            nfs_kv_continue_delete(st, 6);
+        }, [st](int res, const std::string & old_value)
+        {
+            return old_value == st->ientry_text;
+        });
+    }
+    else
+    {
+        st->self->parent->db->del(kv_inode_key(st->ino), [st](int res)
+        {
+            st->res = res;
+            nfs_kv_continue_delete(st, 6);
+        }, [st](int res, const std::string & old_value)
+        {
+            return old_value == st->ientry_text;
+        });
+    }
+    return;
+resume_6:
+    if (st->res < 0)
+    {
+        // Assume EAGAIN is OK, maybe someone created a hard link in the meantime
+        auto cb = std::move(st->cb);
+        cb(st->res == -EAGAIN ? 0 : st->res);
+        return;
+    }
+    // (6) If regular file and inode is deleted: delete data
+    // Files packed into a shared inode ("shared_ino", the key the read path uses)
+    // have no data inode of their own to remove
+    if ((!st->type || st->type == NF3REG) && st->ientry["nlink"].uint64_value() <= 1 &&
+        !st->ientry["shared_ino"].uint64_value())
+    {
+        // Remove data
+        st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
+            { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) },
+            { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) },
+        }), [st](const cli_result_t & r)
+        {
+            if (r.err)
+            {
+                fprintf(stderr, "Failed to remove inode %jx data: %s (code %d)\n",
+                    st->ino, r.text.c_str(), r.err);
+            }
+            st->res = r.err;
+            nfs_kv_continue_delete(st, 7);
+        });
+        return;
+resume_7:
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    auto cb = std::move(st->cb);
+    cb(0);
+}
+
+// NFS3 REMOVE handler: removes the entry args->object.name from directory
+// args->object.dir through nfs_kv_continue_delete().
+// Returns 0 when the reply was queued immediately (invalid directory handle),
+// 1 when it will be queued from the completion callback.
+int kv_nfs3_remove_proc(void *opaque, rpc_op_t *rop)
+{
+    auto rm_args = (REMOVE3args*)rop->request;
+    auto st = new kv_del_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->dir_ino = kv_fh_inode(rm_args->object.dir);
+    st->filename = rm_args->object.name;
+    if (st->self->parent->trace)
+    {
+        fprintf(stderr, "[%d] REMOVE %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
+    }
+    if (st->dir_ino)
+    {
+        // Completion callback: fills the reply, queues it and releases the state
+        st->cb = [st](int res)
+        {
+            auto out = (REMOVE3res*)st->rop->reply;
+            *out = (REMOVE3res){ .status = vitastor_nfs_map_err(res) };
+            rpc_queue_reply(st->rop);
+            delete st;
+        };
+        nfs_kv_continue_delete(st, 0);
+        return 1;
+    }
+    auto out = (REMOVE3res*)rop->reply;
+    *out = (REMOVE3res){ .status = NFS3ERR_INVAL };
+    rpc_queue_reply(rop);
+    delete st;
+    return 0;
+}
+
+// NFS3 RMDIR handler: removes the directory args->object.name from directory
+// args->object.dir through nfs_kv_continue_delete() in RMDIR mode.
+// Returns 0 when the reply was queued immediately (invalid directory handle),
+// 1 when it will be queued from the completion callback.
+int kv_nfs3_rmdir_proc(void *opaque, rpc_op_t *rop)
+{
+    auto rmdir_args = (RMDIR3args*)rop->request;
+    auto st = new kv_del_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    st->dir_ino = kv_fh_inode(rmdir_args->object.dir);
+    st->filename = rmdir_args->object.name;
+    st->is_rmdir = true;
+    if (st->self->parent->trace)
+    {
+        fprintf(stderr, "[%d] RMDIR %ju/%s\n", st->self->nfs_fd, st->dir_ino, st->filename.c_str());
+    }
+    if (st->dir_ino)
+    {
+        // Completion callback: fills the reply, queues it and releases the state
+        st->cb = [st](int res)
+        {
+            auto out = (RMDIR3res*)st->rop->reply;
+            *out = (RMDIR3res){ .status = vitastor_nfs_map_err(res) };
+            rpc_queue_reply(st->rop);
+            delete st;
+        };
+        nfs_kv_continue_delete(st, 0);
+        return 1;
+    }
+    auto out = (RMDIR3res*)rop->reply;
+    *out = (RMDIR3res){ .status = NFS3ERR_INVAL };
+    rpc_queue_reply(rop);
+    delete st;
+    return 0;
+}
--- a/src/nfs_kv_rename.cpp
+++ b/src/nfs_kv_rename.cpp
@ -0,0 +1,206 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - RENAME
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// Per-request state of one RENAME, kept alive across the asynchronous steps
+// of nfs_kv_continue_rename()
+struct nfs_kv_rename_state
+{
+    // Client and RPC operation being served
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Source directory inode and destination directory inode
+    uint64_t old_dir_ino = 0;
+    uint64_t new_dir_ino = 0;
+    // Source entry name and destination entry name
+    std::string old_name;
+    std::string new_name;
+    // Raw text of the source direntry as it was when it was deleted
+    std::string old_direntry_text;
+    // Raw text of the moved inode entry, used as the CAS reference value
+    std::string ientry_text;
+    // Parsed forms of the two texts above
+    json11::Json direntry;
+    json11::Json ientry;
+    // res: result of the latest step; res2: result to report after a restore attempt
+    int res = 0;
+    int res2 = 0;
+    // Completion callback, invoked once with the final result
+    std::function<void(int)> cb;
+};
+
+// Resumable RENAME state machine. Called with state 0 to start and re-entered
+// from asynchronous callbacks with the number of the step to resume at.
+// The final result is delivered through st->cb (negative errno or 0).
+static void nfs_kv_continue_rename(nfs_kv_rename_state *st, int state)
+{
+    // Simplified algorithm (non-atomic and without ENOTDIR/EISDIR):
+    // 1) Check if the target directory exists
+    // 2) Delete & save (using CAS) the source direntry
+    // 3) When a directory moves to another parent, update its parent_ino
+    // 4) Write the target direntry using CAS, fail if it already exists
+    // 5) Restore the source direntry on any failure after step 2
+    // Atomic version would require something like a journal
+    if (state == 0)      {}
+    else if (state == 1) goto resume_1;
+    else if (state == 2) goto resume_2;
+    else if (state == 3) goto resume_3;
+    else if (state == 4) goto resume_4;
+    else if (state == 5) goto resume_5;
+    else if (state == 6) goto resume_6;
+    else
+    {
+        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_rename()");
+        abort();
+    }
+    kv_read_inode(st->self, st->new_dir_ino, [st](int res, const std::string & value, json11::Json attrs)
+    {
+        st->res = res == 0 ? (attrs["type"].string_value() == "dir" ? 0 : -ENOTDIR) : res;
+        nfs_kv_continue_rename(st, 1);
+    });
+    return;
+resume_1:
+    if (st->res < 0)
+    {
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    // Read & delete the old direntry
+    st->self->parent->db->del(kv_direntry_key(st->old_dir_ino, st->old_name), [st](int res)
+    {
+        st->res = res;
+        nfs_kv_continue_rename(st, 2);
+    }, [st](int res, const std::string & old_value)
+    {
+        // Remember the deleted value so that it can be written to the new place or restored
+        st->res = res;
+        st->old_direntry_text = old_value;
+        return true;
+    });
+    return;
+resume_2:
+    if (st->res < 0)
+    {
+        // Nothing was deleted, so there is nothing to restore
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    {
+        std::string err;
+        st->direntry = json11::Json::parse(st->old_direntry_text, err);
+        if (err != "")
+        {
+            fprintf(stderr, "Invalid JSON in direntry %s = %s: %s\n",
+                kv_direntry_key(st->old_dir_ino, st->old_name).c_str(),
+                st->old_direntry_text.c_str(), err.c_str());
+        }
+    }
+    if (st->direntry["type"].string_value() == "dir" &&
+        st->direntry["ino"].uint64_value() != 0 &&
+        st->new_dir_ino != st->old_dir_ino)
+    {
+        // Read & check inode
+        kv_read_inode(st->self, st->direntry["ino"].uint64_value(), [st](int res, const std::string & value, json11::Json ientry)
+        {
+            st->res = res;
+            st->ientry_text = value;
+            st->ientry = ientry;
+            nfs_kv_continue_rename(st, 3);
+        });
+        return;
+resume_3:
+        if (st->res < 0)
+        {
+            // The source direntry is already deleted at this point:
+            // put it back instead of leaving the entry unreachable
+            goto restore_source;
+        }
+        // Change parent reference
+        {
+            auto ientry_new = st->ientry.object_items();
+            ientry_new["parent_ino"] = st->new_dir_ino;
+            st->self->parent->db->set(kv_inode_key(st->direntry["ino"].uint64_value()), json11::Json(ientry_new).dump(), [st](int res)
+            {
+                st->res = res;
+                nfs_kv_continue_rename(st, 4);
+            }, [st](int res, const std::string & old_value)
+            {
+                return old_value == st->ientry_text;
+            });
+        }
+        return;
+resume_4:
+        if (st->res < 0)
+        {
+            // Same as above: the inode update failed after the source direntry was removed
+            goto restore_source;
+        }
+    }
+    st->self->parent->db->set(kv_direntry_key(st->new_dir_ino, st->new_name), st->old_direntry_text, [st](int res)
+    {
+        st->res = res;
+        nfs_kv_continue_rename(st, 5);
+    }, [st](int res, const std::string & old_value)
+    {
+        return res == -ENOENT;
+    });
+    return;
+resume_5:
+    if (st->res < 0)
+    {
+        // CAS refusal here means that the target name is already taken
+        if (st->res == -EAGAIN)
+            st->res = -EEXIST;
+restore_source:
+        // Keep the original error in res2 and try to re-create the source direntry
+        st->res2 = st->res;
+        st->self->parent->db->set(kv_direntry_key(st->old_dir_ino, st->old_name), st->old_direntry_text, [st](int res)
+        {
+            st->res = res;
+            nfs_kv_continue_rename(st, 6);
+        }, [st](int res, const std::string & old_value)
+        {
+            return res == -ENOENT;
+        });
+        return;
+resume_6:
+        if (st->res < 0)
+        {
+            if (st->res == -EAGAIN)
+                st->res = -EEXIST;
+            fprintf(stderr, "error restoring %s = %s after failed rename: %s (code %d)\n",
+                kv_direntry_key(st->old_dir_ino, st->old_name).c_str(), st->old_direntry_text.c_str(),
+                strerror(-st->res), st->res);
+        }
+        // Report the error which caused the rollback, not the result of the rollback
+        auto cb = std::move(st->cb);
+        cb(st->res2);
+        return;
+    }
+    auto cb = std::move(st->cb);
+    cb(st->res);
+}
+
+// NFS3 RENAME handler: decodes the request into a nfs_kv_rename_state and
+// starts nfs_kv_continue_rename(). Returns 0 when the reply was queued right
+// here and 1 when it is queued later from the completion callback.
+int kv_nfs3_rename_proc(void *opaque, rpc_op_t *rop)
+{
+    auto request = (RENAME3args*)rop->request;
+    auto rn = new nfs_kv_rename_state;
+    rn->self = (nfs_client_t*)opaque;
+    rn->rop = rop;
+    rn->old_dir_ino = kv_fh_inode(request->from.dir);
+    rn->new_dir_ino = kv_fh_inode(request->to.dir);
+    rn->old_name = request->from.name;
+    rn->new_name = request->to.name;
+    if (rn->self->parent->trace)
+    {
+        fprintf(stderr, "[%d] RENAME %ju/%s -> %ju/%s\n", rn->self->nfs_fd, rn->old_dir_ino, rn->old_name.c_str(), rn->new_dir_ino, rn->new_name.c_str());
+    }
+    // Two cases are answered without touching the database:
+    // unusable arguments, and a rename of an entry onto itself
+    const bool bad_args = !rn->old_dir_ino || !rn->new_dir_ino || rn->old_name == "" || rn->new_name == "";
+    const bool same_entry = rn->old_dir_ino == rn->new_dir_ino && rn->old_name == rn->new_name;
+    if (bad_args || same_entry)
+    {
+        auto result = (RENAME3res*)rop->reply;
+        if (bad_args)
+            *result = (RENAME3res){ .status = NFS3ERR_INVAL };
+        else
+            *result = (RENAME3res){ .status = NFS3_OK };
+        rpc_queue_reply(rop);
+        delete rn;
+        return 0;
+    }
+    rn->cb = [rn](int res)
+    {
+        // Translate the result into an NFS status, send it and drop the state
+        auto result = (RENAME3res*)rn->rop->reply;
+        *result = (RENAME3res){ .status = vitastor_nfs_map_err(res) };
+        rpc_queue_reply(rn->rop);
+        delete rn;
+    };
+    nfs_kv_continue_rename(rn, 0);
+    return 1;
+}
--- a/src/nfs_kv_setattr.cpp
+++ b/src/nfs_kv_setattr.cpp
@ -0,0 +1,183 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - SETATTR
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+#include "cli.h"
+
+// Per-request state of one SETATTR, kept alive across the asynchronous steps
+// of nfs_kv_continue_setattr()
+struct nfs_kv_setattr_state
+{
+    // Client and RPC operation being served
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Inode being modified
+    uint64_t ino = 0;
+    // Size stored in the inode before the change and size requested by the client
+    uint64_t old_size = 0;
+    uint64_t new_size = 0;
+    // Attributes requested by the client, keyed by attribute name
+    json11::Json::object set_attrs;
+    // res: result of the latest step; cas_res: result seen by the CAS predicate
+    int res = 0;
+    int cas_res = 0;
+    // Inode entry as read from the database: raw text and parsed form
+    std::string ientry_text;
+    json11::Json ientry;
+    // Inode entry with set_attrs applied on top
+    json11::Json::object new_attrs;
+    // Completion callback, invoked once with the final result
+    std::function<void(int)> cb;
+};
+
+// Resumable SETATTR state machine: read the inode, merge the requested
+// attributes into it, write it back with compare-and-set (retrying from the
+// read when the inode changed in between) and remove the data beyond the new
+// size when the file shrinks. The result is delivered through st->cb.
+static void nfs_kv_continue_setattr(nfs_kv_setattr_state *st, int state)
+{
+    if (state == 0)      {}
+    else if (state == 1) goto resume_1;
+    else if (state == 2) goto resume_2;
+    else if (state == 3) goto resume_3;
+    else
+    {
+        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_setattr()");
+        abort();
+    }
+resume_0:
+    kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs)
+    {
+        st->res = res;
+        st->ientry_text = value;
+        st->ientry = attrs;
+        nfs_kv_continue_setattr(st, 1);
+    });
+    return;
+resume_1:
+    if (st->res < 0)
+    {
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    {
+        // Use find() instead of operator[]: set_attrs is a std::map, so operator[]
+        // would insert a null "size" which the loop below would copy into the inode
+        auto size_it = st->set_attrs.find("size");
+        bool set_size = size_it != st->set_attrs.end() && !size_it->second.is_null();
+        // Symlinks can't be modified at all; size can only be set on regular files
+        if (st->ientry["type"].string_value() == "link" ||
+            (st->ientry["type"].string_value() != "file" &&
+            st->ientry["type"].string_value() != "" &&
+            set_size))
+        {
+            auto cb = std::move(st->cb);
+            cb(-EINVAL);
+            return;
+        }
+    }
+    // Now we can update it
+    st->new_attrs = st->ientry.object_items();
+    st->old_size = st->ientry["size"].uint64_value();
+    for (auto & kv: st->set_attrs)
+    {
+        if (kv.first == "size")
+        {
+            st->new_size = kv.second.uint64_value();
+        }
+        st->new_attrs[kv.first] = kv.second;
+    }
+    st->self->parent->db->set(kv_inode_key(st->ino), json11::Json(st->new_attrs).dump(), [st](int res)
+    {
+        st->res = res;
+        nfs_kv_continue_setattr(st, 2);
+    }, [st](int res, const std::string & cas_value)
+    {
+        // Write only if the inode still has the value we read.
+        // A missing entry is acceptable only for the root inode
+        st->cas_res = res;
+        return (res == 0 || (res == -ENOENT && st->ino == KV_ROOT_INODE)) && cas_value == st->ientry_text;
+    });
+    return;
+resume_2:
+    // The inode disappeared between the read and the write. The root inode is
+    // excluded because the CAS predicate above deliberately accepts it being absent
+    if (st->cas_res == -ENOENT && st->ino != KV_ROOT_INODE)
+    {
+        st->res = -ENOENT;
+    }
+    if (st->res == -EAGAIN)
+    {
+        // Retry
+        fprintf(stderr, "CAS failure during setattr, retrying\n");
+        goto resume_0;
+    }
+    if (st->res < 0)
+    {
+        fprintf(stderr, "Failed to update inode %ju: %s (code %d)\n", st->ino, strerror(-st->res), st->res);
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    {
+        auto size_it = st->set_attrs.find("size");
+        if (size_it != st->set_attrs.end() && !size_it->second.is_null() &&
+            st->ientry["size"].uint64_value() > size_it->second.uint64_value())
+        {
+            // Delete extra data when downsizing
+            st->self->parent->cmd->loop_and_wait(st->self->parent->cmd->start_rm_data(json11::Json::object {
+                { "inode", INODE_NO_POOL(st->self->parent->fs_base_inode + st->ino) },
+                { "pool", (uint64_t)INODE_POOL(st->self->parent->fs_base_inode + st->ino) },
+                { "min_offset", size_it->second.uint64_value() },
+            }), [st](const cli_result_t & r)
+            {
+                if (r.err)
+                {
+                    fprintf(stderr, "Failed to truncate inode %ju: %s (code %d)\n",
+                        st->ino, r.text.c_str(), r.err);
+                }
+                st->res = r.err;
+                nfs_kv_continue_setattr(st, 3);
+            });
+            return;
+        }
+    }
+resume_3:
+    // The metadata is already updated here; a truncation error is only logged above
+    auto cb = std::move(st->cb);
+    cb(0);
+}
+
+// NFS3 SETATTR handler: collects the attributes the client asked to change
+// into st->set_attrs and starts nfs_kv_continue_setattr(). Returns 0 when the
+// reply was queued right here and 1 when it is queued later from the
+// completion callback.
+int kv_nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_kv_setattr_state *st = new nfs_kv_setattr_state;
+    st->self = (nfs_client_t*)opaque;
+    st->rop = rop;
+    auto args = (SETATTR3args*)rop->request;
+    auto reply = (SETATTR3res*)rop->reply;
+    std::string fh = args->object;
+    if (!kv_fh_valid(fh))
+    {
+        *reply = (SETATTR3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        delete st;
+        return 0;
+    }
+    st->ino = kv_fh_inode(fh);
+    // Only attributes flagged with set_it are changed
+    if (args->new_attributes.size.set_it)
+        st->set_attrs["size"] = args->new_attributes.size.size;
+    if (args->new_attributes.mode.set_it)
+        st->set_attrs["mode"] = (uint64_t)args->new_attributes.mode.mode;
+    if (args->new_attributes.uid.set_it)
+        st->set_attrs["uid"] = (uint64_t)args->new_attributes.uid.uid;
+    if (args->new_attributes.gid.set_it)
+        st->set_attrs["gid"] = (uint64_t)args->new_attributes.gid.gid;
+    if (args->new_attributes.atime.set_it)
+        st->set_attrs["atime"] = nfstime_to_str(args->new_attributes.atime.atime);
+    if (args->new_attributes.mtime.set_it)
+        st->set_attrs["mtime"] = nfstime_to_str(args->new_attributes.mtime.mtime);
+    // Log requests only in trace mode, in the same format as the other handlers
+    if (st->self->parent->trace)
+        fprintf(stderr, "[%d] SETATTR %ju ATTRS %s\n", st->self->nfs_fd, st->ino, json11::Json(st->set_attrs).dump().c_str());
+    st->cb = [st](int res)
+    {
+        auto reply = (SETATTR3res*)st->rop->reply;
+        if (res < 0)
+        {
+            *reply = (SETATTR3res){
+                .status = vitastor_nfs_map_err(res),
+            };
+        }
+        else
+        {
+            // Return the attributes as they are after the update
+            *reply = (SETATTR3res){
+                .status = NFS3_OK,
+                .resok = (SETATTR3resok){
+                    .obj_wcc = (wcc_data){
+                        .after = (post_op_attr){
+                            .attributes_follow = 1,
+                            .attributes = get_kv_attributes(st->self, st->ino, st->new_attrs),
+                        },
+                    },
+                },
+            };
+        }
+        rpc_queue_reply(st->rop);
+        delete st;
+    };
+    nfs_kv_continue_setattr(st, 0);
+    return 1;
+}
--- a/src/nfs_kv_write.cpp
+++ b/src/nfs_kv_write.cpp
@ -0,0 +1,796 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy over VitastorKV database - WRITE
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs_kv.h"
+
+// State of one partial read-modify-write executed by nfs_do_rmw().
+// Two of these are embedded in nfs_kv_write_state (rmw[2]).
+struct nfs_rmw_t
+{
+    // Write request this read-modify-write belongs to
+    nfs_kv_write_state *st = NULL;
+    // State passed to nfs_kv_continue_write() once no operations are pending
+    int continue_state = 0;
+    // Inode number; nfs_do_rmw() adds fs_base_inode to it
+    uint64_t ino = 0;
+    // Offset of the new data inside the inode
+    uint64_t offset = 0;
+    // New data to be written
+    uint8_t *buf = NULL;
+    // Length of the new data in bytes
+    uint64_t size = 0;
+    // Buffer of pool_alignment bytes for the old contents; allocated and freed by nfs_do_rmw()
+    uint8_t *part_buf = NULL;
+};
+
+// Per-request state of one WRITE, kept alive across the asynchronous steps
+// of nfs_kv_continue_write() and its helpers
+struct nfs_kv_write_state
+{
+    // Client and RPC operation being served
+    nfs_client_t *self = NULL;
+    rpc_op_t *rop = NULL;
+    // Request parameters: target inode, byte range, stability flag and data
+    uint64_t ino = 0;
+    uint64_t offset = 0;
+    uint64_t size = 0;
+    bool stable = false;
+    uint8_t *buf = NULL;
+    // Completion callback, invoked once with the final result
+    std::function<void(int res)> cb;
+    // state
+    // Passed to kv_read_inode(); cleared before a restart caused by mismatching shared data
+    bool allow_cache = true;
+    // res: result of the latest step; res2: secondary result used by the inode extension CAS
+    int res = 0;
+    int res2 = 0;
+    // Number of cluster operations still in flight
+    int waiting = 0;
+    // Inode entry as read from the database: raw text and parsed form
+    std::string ientry_text;
+    json11::Json ientry;
+    // File size after this write
+    uint64_t new_size = 0;
+    // Assembled buffer for shared inode writes; owned by this structure
+    uint64_t aligned_size = 0;
+    uint8_t *aligned_buf = NULL;
+    // Place allocated for the file inside a shared inode
+    uint64_t shared_inode = 0;
+    uint64_t shared_offset = 0;
+    // Result of get_immediate_commit() for the target inode
+    bool was_immediate = false;
+    // Read-modify-write states for the unaligned head [0] and tail [1]
+    nfs_rmw_t rmw[2];
+    // Size extension tracking structure; not released by this destructor
+    kv_inode_extend_t *ext = NULL;
+
+    ~nfs_kv_write_state()
+    {
+        // free(NULL) is a no-op, so no separate check is required
+        free(aligned_buf);
+        aligned_buf = NULL;
+    }
+};
+
+static void nfs_kv_continue_write(nfs_kv_write_state *st, int state);
+
+// Wakes up every write queued in kvfs->allocating_shared.
+// On success (res == 0) each waiter gets its own place in the current shared
+// inode; on failure only the error code is passed on. Every waiter is then
+// resumed through nfs_kv_continue_write() with the state it was queued with.
+static void finish_allocate_shared(nfs_client_t *self, int res)
+{
+    auto kvfs = self->parent->kvfs;
+    auto align = self->parent->pool_alignment;
+    // Detach the queue first: resumed writes may queue themselves again
+    std::vector<shared_alloc_queue_t> waiting;
+    waiting.swap(kvfs->allocating_shared);
+    for (auto & w: waiting)
+    {
+        w.st->res = res;
+        if (res == 0)
+        {
+            w.st->shared_inode = kvfs->cur_shared_inode;
+            w.st->shared_offset = kvfs->cur_shared_offset;
+            // The stored object is the file header followed by the data, so the header
+            // has to be included in the reserved space. Otherwise a file whose size is
+            // close to a multiple of the alignment would overlap the next allocation
+            kvfs->cur_shared_offset += (w.size + sizeof(shared_file_header_t) + align-1) & ~(align-1);
+        }
+        nfs_kv_continue_write(w.st, w.state);
+    }
+}
+
+// Reserves <size> bytes in a shared inode for the write <st> and resumes it
+// with nfs_kv_continue_write(st, state). The result is passed in st->res and
+// the place in st->shared_inode / st->shared_offset.
+// The continuation may run before this function returns, so the caller must
+// not touch <st> afterwards.
+static void allocate_shared_inode(nfs_kv_write_state *st, int state, uint64_t size)
+{
+    auto kvfs = st->self->parent->kvfs;
+    kvfs->allocating_shared.push_back({ st, state, size });
+    if (kvfs->allocating_shared.size() > 1)
+    {
+        // Somebody is already creating a shared inode, it will wake us up
+        return;
+    }
+    if (kvfs->cur_shared_inode != 0)
+    {
+        // A shared inode already exists - take space from it right away.
+        // Without this branch the write would never be resumed
+        finish_allocate_shared(st->self, 0);
+        return;
+    }
+    allocate_new_id(st->self, [st](int res, uint64_t new_id)
+    {
+        if (res < 0)
+        {
+            finish_allocate_shared(st->self, res);
+            return;
+        }
+        st->self->parent->kvfs->cur_shared_inode = new_id;
+        st->self->parent->kvfs->cur_shared_offset = 0;
+        // Register the new inode; the write is refused if the key already exists
+        st->self->parent->db->set(
+            kv_inode_key(new_id), json11::Json(json11::Json::object{ { "type", "shared" } }).dump(),
+            [st](int res)
+            {
+                if (res < 0)
+                {
+                    st->self->parent->kvfs->cur_shared_inode = 0;
+                }
+                finish_allocate_shared(st->self, res);
+            },
+            [](int res, const std::string & old_value)
+            {
+                return res == -ENOENT;
+            }
+        );
+    });
+}
+
+// Space taken by a file of <size> bytes inside a shared inode:
+// the file header plus the data, rounded up to the pool alignment
+uint64_t align_shared_size(nfs_client_t *self, uint64_t size)
+{
+    auto unit = self->parent->pool_alignment;
+    auto with_header = size + sizeof(shared_file_header_t);
+    return (with_header + unit-1) & ~(unit-1);
+}
+
+// Starts one cluster write of <size> bytes from <buf> to inode <ino> (relative
+// to fs_base_inode) at <offset>. The operation is counted in st->waiting and
+// the write is resumed at <state> when the counter drops to zero.
+static void nfs_do_write(uint64_t ino, uint64_t offset, uint8_t *buf, uint64_t size, nfs_kv_write_state *st, int state)
+{
+    auto wr = new cluster_op_t;
+    wr->opcode = OSD_OP_WRITE;
+    wr->inode = st->self->parent->fs_base_inode + ino;
+    wr->offset = offset;
+    wr->len = size;
+    wr->iov.push_back(buf, size);
+    wr->callback = [st, state](cluster_op_t *done)
+    {
+        // Anything except a full-length result is an error; successes leave st->res as is
+        if (done->retval != done->len)
+        {
+            st->res = done->retval >= 0 ? -EIO : done->retval;
+        }
+        delete done;
+        if (--st->waiting == 0)
+        {
+            nfs_kv_continue_write(st, state);
+        }
+    };
+    st->waiting++;
+    st->self->parent->cli->execute(wr);
+}
+
+// Writes the whole assembled buffer to the place allocated in the shared inode
+static void nfs_do_shared_write(nfs_kv_write_state *st, int state)
+{
+    uint64_t target_ino = st->shared_inode;
+    uint64_t target_offset = st->shared_offset;
+    nfs_do_write(target_ino, target_offset, st->aligned_buf, st->aligned_size, st, state);
+}
+
+// Writes the assembled buffer without its leading file header
+// to the beginning of the file's own inode
+static void nfs_do_unshare_write(nfs_kv_write_state *st, int state)
+{
+    uint8_t *payload = st->aligned_buf + sizeof(shared_file_header_t);
+    uint64_t payload_size = st->aligned_size - sizeof(shared_file_header_t);
+    nfs_do_write(st->ino, 0, payload, payload_size, st, state);
+}
+
+// Read-modify-write of a range which is smaller than one alignment unit:
+// reads the unit that contains it, overlays the new data and writes the unit
+// back with a version check, repeating the cycle when the version check fails.
+// The range may touch the start of the unit, its end, both or neither.
+// The operation is counted in st->waiting; when the counter drops to zero the
+// write is resumed at rmw->continue_state. Errors are reported in st->res.
+static void nfs_do_rmw(nfs_rmw_t *rmw)
+{
+    auto parent = rmw->st->self->parent;
+    auto align = parent->pool_alignment;
+    // The range must be non-empty and must stay within one alignment unit and one block
+    assert(rmw->size > 0);
+    assert((rmw->offset/align) == ((rmw->offset+rmw->size-1)/align));
+    assert((rmw->offset/parent->pool_block_size) == ((rmw->offset+rmw->size-1)/parent->pool_block_size));
+    if (!rmw->part_buf)
+    {
+        // Kept across retries, released when the cycle ends
+        rmw->part_buf = (uint8_t*)malloc_or_die(align);
+    }
+    auto op = new cluster_op_t;
+    op->opcode = OSD_OP_READ;
+    op->inode = parent->fs_base_inode + rmw->ino;
+    op->offset = rmw->offset & ~(align-1);
+    op->len = align;
+    op->iov.push_back(rmw->part_buf, op->len);
+    rmw->st->waiting++;
+    op->callback = [rmw](cluster_op_t *rd_op)
+    {
+        if (rd_op->retval != rd_op->len)
+        {
+            free(rmw->part_buf);
+            rmw->part_buf = NULL;
+            rmw->st->res = rd_op->retval >= 0 ? -EIO : rd_op->retval;
+            rmw->st->waiting--;
+            if (!rmw->st->waiting)
+            {
+                nfs_kv_continue_write(rmw->st, rmw->continue_state);
+            }
+        }
+        else
+        {
+            auto parent = rmw->st->self->parent;
+            auto align = parent->pool_alignment;
+            // Old bytes to keep before and after the new data
+            uint64_t head = rmw->offset % align;
+            uint64_t tail = align - head - rmw->size;
+            auto op = new cluster_op_t;
+            op->opcode = OSD_OP_WRITE;
+            op->inode = rmw->st->self->parent->fs_base_inode + rmw->ino;
+            op->offset = rmw->offset & ~(align-1);
+            op->len = align;
+            // Fail with EAGAIN if somebody modified the unit after our read
+            op->version = rd_op->version+1;
+            if (head)
+            {
+                op->iov.push_back(rmw->part_buf, head);
+            }
+            op->iov.push_back(rmw->buf, rmw->size);
+            if (tail)
+            {
+                op->iov.push_back(rmw->part_buf + head + rmw->size, tail);
+            }
+            op->callback = [rmw](cluster_op_t *op)
+            {
+                if (op->retval == -EAGAIN)
+                {
+                    // CAS failure - retry
+                    rmw->st->waiting--;
+                    nfs_do_rmw(rmw);
+                }
+                else
+                {
+                    free(rmw->part_buf);
+                    rmw->part_buf = NULL;
+                    if (op->retval != op->len)
+                    {
+                        rmw->st->res = (op->retval >= 0 ? -EIO : op->retval);
+                    }
+                    rmw->st->waiting--;
+                    if (!rmw->st->waiting)
+                    {
+                        nfs_kv_continue_write(rmw->st, rmw->continue_state);
+                    }
+                }
+                delete op;
+            };
+            parent->cli->execute(op);
+        }
+        delete rd_op;
+    };
+    parent->cli->execute(op);
+}
+
+// Reads the current contents of a shared file (header, data and padding) from
+// its shared inode into st->aligned_buf and resumes the write at <state>.
+// st->res is set to 0 on success and to a negative error code otherwise.
+static void nfs_do_shared_read(nfs_kv_write_state *st, int state)
+{
+    auto op = new cluster_op_t;
+    op->opcode = OSD_OP_READ;
+    op->inode = st->self->parent->fs_base_inode + st->ientry["shared_ino"].uint64_value();
+    op->offset = st->ientry["shared_offset"].uint64_value();
+    op->len = align_shared_size(st->self, st->ientry["size"].uint64_value());
+    op->iov.push_back(st->aligned_buf, op->len);
+    op->callback = [st, state](cluster_op_t *op)
+    {
+        // A short read must not be taken for success: callers only check for res < 0
+        if (op->retval == op->len)
+            st->res = 0;
+        else
+            st->res = op->retval >= 0 ? -EIO : op->retval;
+        delete op;
+        nfs_kv_continue_write(st, state);
+    };
+    st->self->parent->cli->execute(op);
+}
+
+// Used for stable writes: issues an OSD_OP_SYNC and resumes the write at <state>
+// when it completes. The result of the sync operation is not inspected.
+static void nfs_do_fsync(nfs_kv_write_state *st, int state)
+{
+    auto sync_op = new cluster_op_t;
+    sync_op->opcode = OSD_OP_SYNC;
+    sync_op->callback = [st, state](cluster_op_t *done)
+    {
+        delete done;
+        nfs_kv_continue_write(st, state);
+    };
+    st->self->parent->cli->execute(sync_op);
+}
+
+// Assembles the new contents of a small file in st->aligned_buf:
+// file header, old data (read from the shared inode if there is any),
+// new data at st->offset and zero padding up to st->aligned_size.
+// <base_state> is the state of nfs_kv_continue_write() at which this function
+// is re-entered after the asynchronous read, <state> is the current state.
+// With <unshare> the data part is sized for a write into the file's own inode.
+// Returns true when the buffer is ready. Returns false when the caller must
+// return immediately: a read was started, the write was restarted, or the
+// request was already completed with an error.
+static bool nfs_do_shared_readmodify(nfs_kv_write_state *st, int base_state, int state, bool unshare)
+{
+    assert(state <= base_state);
+    if (state < base_state)       {}
+    else if (state == base_state) goto resume_0;
+    assert(!st->aligned_buf);
+    // For unsharing the data part alone has to be aligned because the header is not written.
+    // Parentheses are required: '+' binds tighter than '&'
+    st->aligned_size = unshare
+        ? sizeof(shared_file_header_t) + ((st->new_size + st->self->parent->pool_alignment-1) & ~(st->self->parent->pool_alignment-1))
+        : align_shared_size(st->self, st->new_size);
+    {
+        // The buffer also receives the old file image, so it must be able to hold it
+        uint64_t buf_size = st->aligned_size;
+        if (st->ientry["shared_ino"].uint64_value() != 0)
+        {
+            uint64_t old_len = align_shared_size(st->self, st->ientry["size"].uint64_value());
+            if (buf_size < old_len)
+                buf_size = old_len;
+        }
+        st->aligned_buf = (uint8_t*)malloc_or_die(buf_size);
+    }
+    memset(st->aligned_buf + sizeof(shared_file_header_t), 0, st->offset);
+    if (st->ientry["shared_ino"].uint64_value() != 0)
+    {
+        // Read old data if shared non-empty
+        nfs_do_shared_read(st, base_state);
+        return false;
+resume_0:
+        if (st->res < 0)
+        {
+            // The request is finished here, the caller must not continue with it
+            auto cb = std::move(st->cb);
+            cb(st->res);
+            return false;
+        }
+        auto hdr = ((shared_file_header_t*)st->aligned_buf);
+        if (hdr->magic != SHARED_FILE_MAGIC_V1 || hdr->inode != st->ino ||
+            align_shared_size(st->self, hdr->size) > align_shared_size(st->self, st->ientry["size"].uint64_value()))
+        {
+            // Got unrelated data - retry from the beginning.
+            // Release the buffer first, it is allocated again on the next pass
+            st->allow_cache = false;
+            free(st->aligned_buf);
+            st->aligned_buf = NULL;
+            nfs_kv_continue_write(st, 0);
+            return false;
+        }
+    }
+    *((shared_file_header_t*)st->aligned_buf) = {
+        .magic = SHARED_FILE_MAGIC_V1,
+        .inode = st->ino,
+        .size = st->new_size,
+    };
+    memcpy(st->aligned_buf + sizeof(shared_file_header_t) + st->offset, st->buf, st->size);
+    memset(st->aligned_buf + sizeof(shared_file_header_t) + st->offset + st->size, 0,
+        st->aligned_size - sizeof(shared_file_header_t) - st->offset - st->size);
+    return true;
+}
+
+// Writes st->buf (st->size bytes) to inode <ino> at <offset>, splitting the
+// request by the pool alignment: an unaligned head and an unaligned tail go
+// through read-modify-write (st->rmw[0] and st->rmw[1]), the aligned middle
+// is written directly. The write is resumed at <state> when all parts finish.
+static void nfs_do_align_write(nfs_kv_write_state *st, uint64_t ino, uint64_t offset, int state)
+{
+    auto alignment = st->self->parent->pool_alignment;
+    uint64_t end = offset + st->size;
+    uint8_t *good_buf = st->buf;
+    uint64_t good_offset = offset;
+    uint64_t good_size = st->size;
+    // Extra count to prevent completion before all parts are submitted
+    st->waiting++;
+    if (offset % alignment)
+    {
+        // Requires read-modify-write in the beginning
+        auto s = (alignment - (offset % alignment));
+        if (good_size > s)
+        {
+            good_buf += s;
+            good_offset += s;
+            good_size -= s;
+        }
+        else
+            good_size = 0;
+        s = s > st->size ? st->size : s;
+        st->rmw[0] = {
+            .st = st,
+            .continue_state = state,
+            .ino = ino,
+            .offset = offset,
+            .buf = st->buf,
+            .size = s,
+        };
+        nfs_do_rmw(&st->rmw[0]);
+    }
+    // The tail needs its own read-modify-write when the write ends inside an
+    // alignment unit that is not already covered by the head part above
+    if ((end % alignment) &&
+        (!(offset % alignment) || (end / alignment) > (offset / alignment)))
+    {
+        // Requires read-modify-write in the end
+        auto s = (end % alignment);
+        if (good_size > s)
+            good_size -= s;
+        else
+            good_size = 0;
+        st->rmw[1] = {
+            .st = st,
+            .continue_state = state,
+            .ino = ino,
+            .offset = end - s,
+            .buf = st->buf + st->size - s,
+            .size = s,
+        };
+        nfs_do_rmw(&st->rmw[1]);
+    }
+    if (good_size > 0)
+    {
+        // Normal write
+        nfs_do_write(ino, good_offset, good_buf, good_size, st, state);
+    }
+    st->waiting--;
+    if (!st->waiting)
+    {
+        nfs_kv_continue_write(st, state);
+    }
+}
+
+// Serialized inode entry for a file stored in its own inode: all markers of
+// empty and shared files are dropped and the size is set to st->ext->cur_extend
+static std::string new_normal_ientry(nfs_kv_write_state *st)
+{
+    static const char *const dropped[] = {
+        "empty", "shared_ino", "shared_offset", "shared_alloc", "shared_ver",
+    };
+    auto attrs = st->ientry.object_items();
+    for (auto key: dropped)
+    {
+        attrs.erase(key);
+    }
+    attrs["size"] = st->ext->cur_extend;
+    return json11::Json(attrs).dump();
+}
+
+// Serialized inode entry for a file placed into newly allocated shared space:
+// points to the allocated place, records its size and resets "shared_ver"
+static std::string new_moved_ientry(nfs_kv_write_state *st)
+{
+    auto attrs = st->ientry.object_items();
+    attrs.erase("empty");
+    attrs.erase("shared_ver");
+    attrs["size"] = st->new_size;
+    attrs["shared_ino"] = st->shared_inode;
+    attrs["shared_offset"] = st->shared_offset;
+    attrs["shared_alloc"] = st->aligned_size;
+    return json11::Json(attrs).dump();
+}
+
+// Serialized inode entry for a shared file rewritten in place:
+// the size is updated and "shared_ver" is incremented
+static std::string new_shared_ientry(nfs_kv_write_state *st)
+{
+    auto attrs = st->ientry.object_items();
+    uint64_t next_ver = attrs["shared_ver"].uint64_value() + 1;
+    attrs.erase("empty");
+    attrs["size"] = st->new_size;
+    attrs["shared_ver"] = next_ver;
+    return json11::Json(attrs).dump();
+}
+
+// Extends the size stored in the inode entry to st->ext->next_extend using
+// a compare-and-set write of the inode key.
+// Called with state != 1 to start the update and with state == 1 to process
+// its result. The completion of the database write resumes
+// nfs_kv_continue_write() at state 13; presumably that state calls this
+// function again with state 1 - the call site is not in this part of the file.
+static void nfs_kv_extend_inode(nfs_kv_write_state *st, int state)
+{
+    if (state == 1)
+    {
+        goto resume_1;
+    }
+    // Take the pending extension request and make it the one in progress
+    st->ext->cur_extend = st->ext->next_extend;
+    st->ext->next_extend = 0;
+    // res2 replaces the result when the CAS predicate below refuses the write (see resume_1)
+    st->res2 = -EAGAIN;
+    st->self->parent->db->set(kv_inode_key(st->ino), new_normal_ientry(st), [st](int res)
+    {
+        st->res = res;
+        nfs_kv_continue_write(st, 13);
+    }, [st](int res, const std::string & old_value)
+    {
+        // CAS predicate: decides whether the stored entry may be overwritten
+        if (res != 0)
+        {
+            return false;
+        }
+        if (old_value == st->ientry_text)
+        {
+            // Nothing changed since we read the entry
+            return true;
+        }
+        std::string err;
+        auto ientry = json11::Json::parse(old_value, err).object_items();
+        if (err != "")
+        {
+            // The stored entry is not valid JSON
+            st->res2 = -EINVAL;
+            return false;
+        }
+        else if (ientry.size() == st->ientry.object_items().size())
+        {
+            // Same number of keys: compare everything except "size"
+            for (auto & kv: st->ientry.object_items())
+            {
+                if (kv.first != "size" && ientry[kv.first] != kv.second)
+                {
+                    // Something except size changed
+                    return false;
+                }
+            }
+            // OK, only size changed
+            if (ientry["size"] >= st->new_size)
+            {
+                // Already extended
+                st->res2 = 0;
+                return false;
+            }
+            // size is different but can still be extended, other parameters don't differ
+            return true;
+        }
+        return false;
+    });
+    return;
+resume_1:
+    if (st->res == -EAGAIN)
+    {
+        // EAGAIN may be OK in fact (see above)
+        st->res = st->res2;
+    }
+    if (st->res == 0)
+    {
+        // Remember the size which is now known to be stored
+        st->ext->done_extend = st->ext->cur_extend;
+    }
+    st->ext->cur_extend = 0;
+    // Wake up other extenders anyway
+    auto waiters = std::move(st->ext->waiters);
+    for (auto & cb: waiters)
+    {
+        cb();
+    }
+}
+
+// Packing small files into "shared inodes". Insane algorithm...
+// Write:
+// - If (offset+size <= threshold):
+//   - Read inode from cache
+//   - If inode does not exist - stop with -ENOENT
+//   - If inode is not a regular file - stop with -EINVAL
+//   - If it's empty (size == 0 || empty == true):
+//     - If preset size is larger than threshold:
+//       - Write data into non-shared inode
+//       - In parallel: clear empty flag
+//         - If CAS failure: re-read inode and restart
+//     - Otherwise:
+//       - Allocate/take a shared inode
+//       - Allocate space in its end
+//       - Write data into shared inode
+//         - If CAS failure: allocate another shared inode and retry
+//       - Write shared inode reference, set size
+//         - If CAS failure: free allocated shared space, re-read inode and restart
+//   - If it's not empty:
+//     - If non-shared:
+//       - Write data into non-shared inode
+//       - In parallel: check if data fits into inode size and extend if it doesn't
+//         - If CAS failure: re-read inode and retry to extend the size
+//     - If shared:
+//       - Read whole file from shared inode
+//         - If the file header in data doesn't match: re-read inode and restart
+//       - If data doesn't fit into the same shared inode:
+//         - Allocate space in a new shared inode
+//         - Write data into the new shared inode
+//           - If CAS failure: allocate another shared inode and retry
+//         - Update inode metadata (set new size and new shared inode)
+//           - If CAS failure: free allocated shared space, re-read inode and restart
+//       - If it fits:
+//         - Write updated data into the shared inode
+//         - Update inode entry in any case to block parallel non-shared writes
+//           - If CAS failure: re-read inode and restart
+// - Otherwise:
+//   - Write data into non-shared inode
+//   - Read inode in parallel
+//     - If not a regular file:
+//       - Remove data
+//       - Stop with -EINVAL
+//     - If shared:
+//       - Read whole file from shared inode
+//       - Write data into non-shared inode
+//         - If CAS failure (block should not exist): restart
+//       - Update inode metadata (make non-shared, update size)
+//         - If CAS failure: restart
+//       - Zero out the shared inode header
+//         - If CAS failure: restart
+//     - Check if size fits
+//       - Extend if it doesn't
+// Read:
+// - If (offset+size <= threshold):
+//   - Read inode from cache
+//   - If empty: return zeroes
+//   - If shared:
+//     - Read the whole file from shared inode, or at least data and shared inode header
+//     - If the file header in data doesn't match: re-read inode and restart
+//   - If non-shared:
+//     - Read data from non-shared inode
+// - Otherwise:
+//   - Read data from non-shared inode
+
+// Resumable state machine performing one WRITE against the key/value filesystem.
+//
+// st    - per-request state (target inode, offset, size, buffer, result callback)
+// state - resume point: 0 starts from scratch, N > 0 jumps to label resume_N
+//
+// Every asynchronous helper is given the number of the state to resume at and this
+// function returns right after starting it (presumably the helper re-enters
+// nfs_kv_continue_write() with that number once it is done - the inode read below
+// does exactly that). The final result is reported through st->cb(res).
+// st->cb is always moved into a local before it is invoked because the callback
+// installed by kv_nfs3_write_proc() deletes st.
+//
+// Resume points:
+//   0  start / restart after -EAGAIN: read the inode entry
+//   1  inode entry is available
+//   2  shared inode space allocated
+//   3  re-entry point of the shared read-modify step
+//   4  data written into the shared inode
+//   5  inode entry compare-and-set done
+//   6  shared space overwritten with zeroes after a failed compare-and-set
+//   7  in-place shared write done
+//   8  fsync or inode entry update after the in-place shared write done
+//   9  re-entry point of the shared read step used for unsharing
+//   10 non-shared (or unshare) write done
+//   11 fsync after the non-shared write done
+//   12 woken up after waiting for another extend of the same inode
+//   13 resumed by nfs_kv_extend_inode() (exact meaning is defined there)
+static void nfs_kv_continue_write(nfs_kv_write_state *st, int state)
+{
+    if (state == 0)      {}
+    else if (state == 1) goto resume_1;
+    else if (state == 2) goto resume_2;
+    else if (state == 3) goto resume_3;
+    else if (state == 4) goto resume_4;
+    else if (state == 5) goto resume_5;
+    else if (state == 6) goto resume_6;
+    else if (state == 7) goto resume_7;
+    else if (state == 8) goto resume_8;
+    else if (state == 9) goto resume_9;
+    else if (state == 10) goto resume_10;
+    else if (state == 11) goto resume_11;
+    else if (state == 12) goto resume_12;
+    else if (state == 13) goto resume_13;
+    else
+    {
+        fprintf(stderr, "BUG: invalid state in nfs_kv_continue_write()");
+        abort();
+    }
+resume_0:
+    if (!st->size)
+    {
+        // Zero-length write: nothing to do
+        auto cb = std::move(st->cb);
+        cb(0);
+        return;
+    }
+    kv_read_inode(st->self, st->ino, [st](int res, const std::string & value, json11::Json attrs)
+    {
+        st->res = res;
+        // The raw text is kept for compare-and-set checks, the parsed form for field access
+        st->ientry_text = value;
+        st->ientry = attrs;
+        nfs_kv_continue_write(st, 1);
+    }, st->allow_cache);
+    return;
+resume_1:
+    // Only regular files (or entries without a type) may be written
+    if (st->res < 0 ||
+        st->ientry["type"].uint64_value() != 0 &&
+        st->ientry["type"].uint64_value() != NF3REG)
+    {
+        auto cb = std::move(st->cb);
+        cb(st->res == 0 ? -EINVAL : st->res);
+        return;
+    }
+    st->was_immediate = st->self->parent->cli->get_immediate_commit(st->self->parent->fs_base_inode + st->ino);
+    // new_size = max(current size, end of this write)
+    st->new_size = st->ientry["size"].uint64_value();
+    if (st->new_size < st->offset + st->size)
+    {
+        st->new_size = st->offset + st->size;
+    }
+    // Writes ending below the threshold (header included) are candidates for shared inodes
+    if (st->offset + st->size + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold)
+    {
+        if (st->ientry["size"].uint64_value() == 0 ||
+            st->ientry["empty"].bool_value() &&
+            st->ientry["size"].uint64_value() + sizeof(shared_file_header_t) < st->self->parent->shared_inode_threshold ||
+            st->ientry["shared_ino"].uint64_value() != 0 &&
+            st->ientry["size"].uint64_value() < st->offset+st->size &&
+            st->ientry["shared_alloc"].uint64_value() < align_shared_size(st->self, st->offset+st->size))
+        {
+            // Either empty, or shared and requires moving into a larger place (redirect-write)
+            allocate_shared_inode(st, 2, st->new_size);
+            return;
+resume_2:
+            if (st->res < 0)
+            {
+                auto cb = std::move(st->cb);
+                cb(st->res);
+                return;
+            }
+resume_3:
+            // A false return means the helper is not finished yet; it is given state 3 to come back here
+            if (!nfs_do_shared_readmodify(st, 3, state, false))
+                return;
+            nfs_do_shared_write(st, 4); // FIXME assemble from parts, do not copy?
+            return;
+resume_4:
+            if (st->res < 0)
+            {
+                auto cb = std::move(st->cb);
+                cb(st->res);
+                return;
+            }
+            // Replace the inode entry, but only if it still equals the text we have read
+            st->self->parent->db->set(kv_inode_key(st->ino), new_moved_ientry(st), [st](int res)
+            {
+                st->res = res;
+                nfs_kv_continue_write(st, 5);
+            }, [st](int res, const std::string & old_value)
+            {
+                return res == 0 && old_value == st->ientry_text;
+            });
+            return;
+resume_5:
+            if (st->res < 0)
+            {
+                // The entry update failed: remember why, then write zeroes through the same
+                // shared write helper (presumably over the space filled in state 3-4)
+                st->res2 = st->res;
+                memset(st->aligned_buf, 0, st->aligned_size);
+                nfs_do_shared_write(st, 6);
+                return;
+resume_6:
+                free(st->aligned_buf);
+                st->aligned_buf = NULL;
+                if (st->res2 == -EAGAIN)
+                {
+                    // Somebody else changed the inode entry - start over
+                    goto resume_0;
+                }
+                else
+                {
+                    auto cb = std::move(st->cb);
+                    cb(st->res2);
+                    return;
+                }
+            }
+            auto cb = std::move(st->cb);
+            cb(0);
+            return;
+        }
+        else if (st->ientry["shared_ino"].uint64_value() > 0)
+        {
+            // Non-empty, shared, can be updated in-place
+            nfs_do_align_write(st, st->ientry["shared_ino"].uint64_value(),
+                st->ientry["shared_offset"].uint64_value() + sizeof(shared_file_header_t) + st->offset, 7);
+            return;
+resume_7:
+            // NOTE(review): when fsync is started here it resumes at state 8, so the inode
+            // entry update below is skipped for stable writes - confirm this is intended.
+            // NOTE(review): a failed write (st->res < 0) still reaches db->set() below and
+            // its result overwrites st->res - confirm.
+            if (st->res == 0 && st->stable && !st->was_immediate)
+            {
+                nfs_do_fsync(st, 8);
+                return;
+            }
+            // We always have to change inode entry on shared writes
+            st->self->parent->db->set(kv_inode_key(st->ino), new_shared_ientry(st), [st](int res)
+            {
+                st->res = res;
+                nfs_kv_continue_write(st, 8);
+            }, [st](int res, const std::string & old_value)
+            {
+                return res == 0 && old_value == st->ientry_text;
+            });
+            return;
+resume_8:
+            if (st->res == -EAGAIN)
+            {
+                goto resume_0;
+            }
+            auto cb = std::move(st->cb);
+            cb(st->res);
+            return;
+        }
+        // Fall through for non-shared
+    }
+    // Non-shared write
+    if (st->ientry["shared_ino"].uint64_value() != 0)
+    {
+        // Unshare
+resume_9:
+        if (!nfs_do_shared_readmodify(st, 9, state, true))
+            return;
+        nfs_do_unshare_write(st, 10);
+        return;
+    }
+    else
+    {
+        // Just write
+        nfs_do_align_write(st, st->ino, st->offset, 10);
+        // Wait for the write: the helper is told to resume at state 10, exactly like the
+        // in-place shared write above. Falling through would run the completion code on a
+        // stale st->res and then run it a second time when the write finishes.
+        return;
+    }
+resume_10:
+    if (st->res == 0 && st->stable && !st->was_immediate)
+    {
+        nfs_do_fsync(st, 11);
+        return;
+    }
+resume_11:
+    if (st->res < 0)
+    {
+        auto cb = std::move(st->cb);
+        cb(st->res);
+        return;
+    }
+    // The inode entry has to change if the file was empty, grew, or was shared until now
+    if (st->ientry["empty"].bool_value() ||
+        st->ientry["size"].uint64_value() < st->new_size ||
+        st->ientry["shared_ino"].uint64_value() != 0)
+    {
+        // Per-inode extend tracker shared by all writers; refcnt keeps it in the map while in use
+        st->ext = &st->self->parent->kvfs->extends[st->ino];
+        st->ext->refcnt++;
+resume_12:
+        if (st->ext->next_extend < st->new_size)
+        {
+            // Aggregate inode extension requests
+            st->ext->next_extend = st->new_size;
+        }
+        if (st->ext->cur_extend > 0)
+        {
+            // Wait for current extend which is already in progress
+            st->ext->waiters.push_back([st](){ nfs_kv_continue_write(st, 12); });
+            return;
+        }
+        if (st->ext->done_extend < st->new_size)
+        {
+            nfs_kv_extend_inode(st, 0);
+            return;
+resume_13:
+            nfs_kv_extend_inode(st, 1);
+        }
+        st->ext->refcnt--;
+        assert(st->ext->refcnt >= 0);
+        if (st->ext->refcnt == 0)
+        {
+            // Last user gone - drop the tracker
+            st->self->parent->kvfs->extends.erase(st->ino);
+        }
+    }
+    if (st->res == -EAGAIN)
+    {
+        // Restart
+        goto resume_0;
+    }
+    auto cb = std::move(st->cb);
+    cb(st->res);
+}
+
+// RPC handler for WRITE in key/value filesystem mode.
+//
+// opaque - the nfs_client_t of the connection
+// rop    - RPC operation holding WRITE3args (request) and WRITE3res (reply)
+//
+// Returns 0 when the reply was queued right away (invalid arguments) and 1 when
+// the write was handed to nfs_kv_continue_write(), in which case the completion
+// callback queues the reply and releases the per-request state.
+int kv_nfs3_write_proc(void *opaque, rpc_op_t *rop)
+{
+    WRITE3args *req = (WRITE3args*)rop->request;
+    nfs_kv_write_state *wr = new nfs_kv_write_state;
+    wr->rop = rop;
+    wr->self = (nfs_client_t*)opaque;
+    wr->ino = kv_fh_inode(req->file);
+    wr->offset = req->offset;
+    // Take the smaller of the announced count and the payload length
+    wr->size = (req->data.size < req->count ? req->data.size : req->count);
+    const bool acceptable = wr->ino && wr->size <= MAX_REQUEST_SIZE;
+    if (!acceptable)
+    {
+        WRITE3res *refusal = (WRITE3res*)rop->reply;
+        *refusal = (WRITE3res){ .status = NFS3ERR_INVAL };
+        rpc_queue_reply(rop);
+        delete wr;
+        return 0;
+    }
+    wr->stable = (req->stable != UNSTABLE);
+    wr->buf = (uint8_t*)req->data.data;
+    // Completion: translate the result, fill the success fields, reply, free the state
+    wr->cb = [wr](int res)
+    {
+        WRITE3res *out = (WRITE3res*)wr->rop->reply;
+        *out = (WRITE3res){ .status = vitastor_nfs_map_err(res) };
+        if (res == 0)
+        {
+            out->resok.count = (unsigned)wr->size;
+            // Report FILE_SYNC if the client asked for it or the write was committed immediately
+            out->resok.committed = (wr->stable || wr->was_immediate) ? FILE_SYNC : UNSTABLE;
+            *(uint64_t*)out->resok.verf = wr->self->parent->server_id;
+        }
+        rpc_queue_reply(wr->rop);
+        delete wr;
+    };
+    nfs_kv_continue_write(wr, 0);
+    return 1;
+}
--- a/src/nfs_mount.cpp
+++ b/src/nfs_mount.cpp
@ -0,0 +1,119 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+//
+// NFS proxy - common NULL, ACCESS, COMMIT, DUMP, EXPORT, MNT, UMNT, UMNTALL
+
+#include <sys/time.h>
+
+#include "nfs_proxy.h"
+#include "nfs/nfs.h"
+
+// Translate an errno-style result into an NFSv3 status code.
+//
+// err - 0 for success, otherwise an errno value. Both sign conventions are
+//       accepted: callers pass negated codes (for example the KV write path
+//       reports -EINVAL), which previously never matched the positive
+//       constants and were all reported as NFS3ERR_IO.
+//
+// Returns NFS3_OK for 0, a specific status for EINVAL/ENOENT/ENOSPC/EEXIST
+// and NFS3ERR_IO for every other error.
+nfsstat3 vitastor_nfs_map_err(int err)
+{
+    const int code = err < 0 ? -err : err;
+    switch (code)
+    {
+        case 0:
+            return NFS3_OK;
+        case EINVAL:
+            return NFS3ERR_INVAL;
+        case ENOENT:
+            return NFS3ERR_NOENT;
+        case ENOSPC:
+            return NFS3ERR_NOSPC;
+        case EEXIST:
+            return NFS3ERR_EXIST;
+        default:
+            // EIO and anything we have no dedicated status for
+            return NFS3ERR_IO;
+    }
+}
+
+// Handler for the NULL procedure: queues the reply without filling anything in.
+// Returns 0 because the reply is queued before returning.
+int nfs3_null_proc(void *opaque, rpc_op_t *rop)
+{
+    (void)opaque; // connection state is not used here
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// Handler for ACCESS: performs no permission checks and answers NFS3_OK with
+// exactly the access mask the client asked about.
+// Returns 0 because the reply is queued before returning.
+int nfs3_access_proc(void *opaque, rpc_op_t *rop)
+{
+    (void)opaque; // connection state is not used here
+    ACCESS3args *request = (ACCESS3args*)rop->request;
+    ACCESS3res *result = (ACCESS3res*)rop->reply;
+    *result = (ACCESS3res){
+        .status = NFS3_OK,
+        .resok = (ACCESS3resok){ .access = request->access },
+    };
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// Handler for COMMIT: issues a cluster-wide OSD_OP_SYNC and replies when it completes.
+// The request arguments are not inspected - there is no way to fsync a single
+// inode, so everything is synced.
+// Returns 1 because the reply is queued later, from the completion callback.
+int nfs3_commit_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_client_t *client = (nfs_client_t*)opaque;
+    // NOTE(review): nothing visible here frees this operation - confirm whether the
+    // cluster client releases it or whether the callback should delete it
+    cluster_op_t *sync_op = new cluster_op_t;
+    sync_op->opcode = OSD_OP_SYNC;
+    sync_op->callback = [client, rop](cluster_op_t *done)
+    {
+        COMMIT3res *out = (COMMIT3res*)rop->reply;
+        *out = (COMMIT3res){ .status = vitastor_nfs_map_err(done->retval) };
+        // The write verifier is the per-process server ID
+        *(uint64_t*)out->resok.verf = client->parent->server_id;
+        rpc_queue_reply(rop);
+    };
+    client->parent->cli->execute(sync_op);
+    return 1;
+}
+
+// Handler for MOUNT MNT: always succeeds, hands out the root file handle and
+// advertises a single authentication flavor (RPC_AUTH_NONE). The requested
+// path is not inspected.
+// Returns 0 because the reply is queued before returning.
+int mount3_mnt_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_client_t *client = (nfs_client_t*)opaque;
+    if (client->parent->trace)
+    {
+        fprintf(stderr, "[%d] MNT\n", client->nfs_fd);
+    }
+    nfs_mountres3 *out = (nfs_mountres3*)rop->reply;
+    out->fhs_status = MNT3_OK;
+    out->mountinfo.fhandle = xdr_copy_string(rop->xdrs, NFS_ROOT_HANDLE);
+    // The flavor list is a one-element array copied through the XDR context
+    u_int auth = RPC_AUTH_NONE;
+    out->mountinfo.auth_flavors.auth_flavors_val = (u_int*)xdr_copy_string(rop->xdrs, (char*)&auth, sizeof(u_int)).data;
+    out->mountinfo.auth_flavors.auth_flavors_len = 1;
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// Handler for MOUNT DUMP: answers with a one-element mount list that names
+// host "127.0.0.1" and the configured export root.
+// Returns 0 because the reply is queued before returning.
+int mount3_dump_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_client_t *client = (nfs_client_t*)opaque;
+    if (client->parent->trace)
+    {
+        fprintf(stderr, "[%d] DUMP\n", client->nfs_fd);
+    }
+    nfs_mountlist *out = (nfs_mountlist*)rop->reply;
+    struct nfs_mountbody *entry = (struct nfs_mountbody*)malloc_or_die(sizeof(struct nfs_mountbody));
+    *out = entry;
+    // Register the allocation with the XDR context of this operation
+    xdr_add_malloc(rop->xdrs, entry);
+    entry->ml_hostname = xdr_copy_string(rop->xdrs, "127.0.0.1");
+    entry->ml_directory = xdr_copy_string(rop->xdrs, client->parent->export_root);
+    entry->ml_next = NULL;
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// Handler for MOUNT UMNT: nothing is done for an unmount, the reply is just queued.
+// Returns 0 because the reply is queued before returning.
+int mount3_umnt_proc(void *opaque, rpc_op_t *rop)
+{
+    (void)opaque; // neither the connection nor the path argument is used
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// Handler for MOUNT UMNTALL: nothing is done, the reply is just queued.
+// Returns 0 because the reply is queued before returning.
+int mount3_umntall_proc(void *opaque, rpc_op_t *rop)
+{
+    (void)opaque; // connection state is not used here
+    rpc_queue_reply(rop);
+    return 0;
+}
+
+// Handler for MOUNT EXPORT: answers with a single export (the configured export
+// root) that has a single group entry, "127.0.0.1".
+// Returns 0 because the reply is queued before returning.
+int mount3_export_proc(void *opaque, rpc_op_t *rop)
+{
+    nfs_client_t *self = (nfs_client_t*)opaque;
+    nfs_exports *reply = (nfs_exports*)rop->reply;
+    // One zeroed allocation holds the export node immediately followed by its group node
+    struct nfs_exportnode *node = (struct nfs_exportnode*)calloc_or_die(1, sizeof(struct nfs_exportnode) + sizeof(struct nfs_groupnode));
+    *reply = node;
+    // Register the allocation with the XDR context of this operation
+    xdr_add_malloc(rop->xdrs, node);
+    node->ex_dir = xdr_copy_string(rop->xdrs, self->parent->export_root);
+    // The group node lives right after the export node inside the allocation.
+    // It must be derived from the node itself: <reply> is only a pointer to the
+    // list head, so reply+1 would point into the RPC reply buffer instead.
+    node->ex_groups = (struct nfs_groupnode*)(node + 1);
+    node->ex_groups->gr_name = xdr_copy_string(rop->xdrs, "127.0.0.1");
+    node->ex_groups->gr_next = NULL;
+    node->ex_next = NULL;
+    rpc_queue_reply(rop);
+    return 0;
+}
--- a/src/nfs_proxy.cpp
+++ b/src/nfs_proxy.cpp
@ -21,6 +21,9 @@
 #include "addr_util.h"
 #include "str_util.h"
 #include "nfs_proxy.h"
+#include "nfs_kv.h"
+#include "nfs_block.h"
+#include "nfs_common.h"
 #include "http_client.h"
 #include "cli.h"

@ -31,6 +34,8 @@ const char *exe_name = NULL;

 nfs_proxy_t::~nfs_proxy_t()
 {
+    if (db)
+        delete db;
    if (cmd)
        delete cmd;
    if (cli)
@ -57,6 +62,7 @@ json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
                "\n"
                "USAGE:\n"
                "  %s [STANDARD OPTIONS] [OTHER OPTIONS]\n"
+                "  --fs <META>       mount VitastorFS with metadata in image <META>\n"
                "  --subdir <DIR>    export images prefixed <DIR>/ (default empty - export all images)\n"
                "  --portmap 0       do not listen on port 111 (portmap/rpcbind, requires root)\n"
                "  --bind <IP>       bind service to <IP> address (default 0.0.0.0)\n"
@ -92,6 +98,7 @@ void nfs_proxy_t::run(json11::Json cfg)
    srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
    server_id = (uint64_t)lrand48() | ((uint64_t)lrand48() << 31) | ((uint64_t)lrand48() << 62);
    // Parse options
+    trace = cfg["log_level"].uint64_value() > 5;
    bind_address = cfg["bind"].string_value();
    if (bind_address == "")
        bind_address = "0.0.0.0";
@ -131,67 +138,12 @@ void nfs_proxy_t::run(json11::Json cfg)
    cmd->ringloop = ringloop;
    cmd->epmgr = epmgr;
    cmd->cli = cli;
-    // We need inode name hashes for NFS handles to remain stateless and <= 64 bytes long
-    dir_info[""] = (nfs_dir_t){
-        .id = 1,
-        .mod_rev = 0,
-    };
-    clock_gettime(CLOCK_REALTIME, &dir_info[""].mtime);
    watch_stats();
-    assert(cli->st_cli.on_inode_change_hook == NULL);
-    cli->st_cli.on_inode_change_hook = [this](inode_t changed_inode, bool removed)
+    if (!fs_kv_inode)
    {
-        auto inode_cfg_it = cli->st_cli.inode_config.find(changed_inode);
-        if (inode_cfg_it == cli->st_cli.inode_config.end())
-        {
-            return;
+        blockfs = new block_fs_state_t();
+        blockfs->init(this);
    }
-        auto & inode_cfg = inode_cfg_it->second;
-        std::string full_name = inode_cfg.name;
-        if (name_prefix != "" && full_name.substr(0, name_prefix.size()) != name_prefix)
-        {
-            return;
-        }
-        // Calculate directory modification time and revision (used as "cookie verifier")
-        timespec now;
-        clock_gettime(CLOCK_REALTIME, &now);
-        dir_info[""].mod_rev = dir_info[""].mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dir_info[""].mod_rev;
-        dir_info[""].mtime = now;
-        int pos = full_name.find('/', name_prefix.size());
-        while (pos >= 0)
-        {
-            std::string dir = full_name.substr(0, pos);
-            auto & dinf = dir_info[dir];
-            if (!dinf.id)
-                dinf.id = next_dir_id++;
-            dinf.mod_rev = dinf.mod_rev < inode_cfg.mod_revision ? inode_cfg.mod_revision : dinf.mod_rev;
-            dinf.mtime = now;
-            dir_by_hash["S"+base64_encode(sha256(dir))] = dir;
-            pos = full_name.find('/', pos+1);
-        }
-        // Alter inode_by_hash
-        if (removed)
-        {
-            auto ino_it = hash_by_inode.find(changed_inode);
-            if (ino_it != hash_by_inode.end())
-            {
-                inode_by_hash.erase(ino_it->second);
-                hash_by_inode.erase(ino_it);
-            }
-        }
-        else
-        {
-            std::string hash = "S"+base64_encode(sha256(full_name));
-            auto hbi_it = hash_by_inode.find(changed_inode);
-            if (hbi_it != hash_by_inode.end() && hbi_it->second != hash)
-            {
-                // inode had a different name, remove old hash=>inode pointer
-                inode_by_hash.erase(hbi_it->second);
-            }
-            inode_by_hash[hash] = changed_inode;
-            hash_by_inode[changed_inode] = hash;
-        }
-    };
    // Load image metadata
    while (!cli->is_ready())
    {
@ -202,6 +154,54 @@ void nfs_proxy_t::run(json11::Json cfg)
    }
    // Check default pool
    check_default_pool();
+    // Check if we're using VitastorFS
+    fs_kv_inode = cfg["fs"].uint64_value();
+    if (!fs_kv_inode && cfg["fs"].is_string())
+    {
+        for (auto & ic: cli->st_cli.inode_config)
+        {
+            if (ic.second.name == cfg["fs"].string_value())
+            {
+                fs_kv_inode = ic.first;
+                break;
+            }
+        }
+    }
+    readdir_getattr_parallel = cfg["readdir_getattr_parallel"].uint64_value();
+    if (!readdir_getattr_parallel)
+        readdir_getattr_parallel = 8;
+    id_alloc_batch_size = cfg["id_alloc_batch_size"].uint64_value();
+    if (!id_alloc_batch_size)
+        id_alloc_batch_size = 200;
+    if (fs_kv_inode)
+    {
+        // Open DB and wait
+        int open_res = 0;
+        bool open_done = false;
+        db = new kv_dbw_t(cli);
+        db->open(fs_kv_inode, cfg, [&](int res)
+        {
+            open_done = true;
+            open_res = res;
+        });
+        while (!open_done)
+        {
+            ringloop->loop();
+            if (open_done)
+                break;
+            ringloop->wait();
+        }
+        if (open_res < 0)
+        {
+            fprintf(stderr, "Failed to open key/value filesystem metadata index: %s (code %d)\n",
+                strerror(-open_res), open_res);
+            exit(1);
+        }
+        fs_base_inode = ((uint64_t)default_pool_id << (64-POOL_ID_BITS));
+        fs_inode_count = ((uint64_t)1 << (64-POOL_ID_BITS)) - 1;
+        shared_inode_threshold = pool_block_size;
+        kvfs = new kv_fs_state_t;
+    }
    // Self-register portmap and NFS
    pmap.reg_ports.insert((portmap_id_t){
        .prog = PMAP_PROGRAM,
@ -275,9 +275,13 @@ void nfs_proxy_t::run(json11::Json cfg)
    }
    // Destroy the client
    cli->flush();
+    delete kvfs;
+    delete db;
    delete cli;
    delete epmgr;
    delete ringloop;
+    kvfs = NULL;
+    db = NULL;
    cli = NULL;
    epmgr = NULL;
    ringloop = NULL;
@ -350,7 +354,7 @@ void nfs_proxy_t::parse_stats(etcd_kv_t & kv)
        pool_id_t pool_id = 0;
        inode_t inode_num = 0;
        char null_byte = 0;
-        int scanned = sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+13, "%u/%lu%c", &pool_id, &inode_num, &null_byte);
+        int scanned = sscanf(key.c_str() + cli->st_cli.etcd_prefix.length()+13, "%u/%ju%c", &pool_id, &inode_num, &null_byte);
        if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || !inode_num)
        {
            fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
@ -382,12 +386,15 @@ void nfs_proxy_t::check_default_pool()
    {
        if (cli->st_cli.pool_config.size() == 1)
        {
-            default_pool = cli->st_cli.pool_config.begin()->second.name;
-            default_pool_id = cli->st_cli.pool_config.begin()->first;
+            auto pool_it = cli->st_cli.pool_config.begin();
+            default_pool_id = pool_it->first;
+            default_pool = pool_it->second.name;
+            pool_block_size = pool_it->second.pg_stripe_size;
+            pool_alignment = pool_it->second.bitmap_granularity;
        }
        else
        {
-            fprintf(stderr, "There are %lu pools. Please select default pool with --pool option\n", cli->st_cli.pool_config.size());
+            fprintf(stderr, "There are %zu pools. Please select default pool with --pool option\n", cli->st_cli.pool_config.size());
            exit(1);
        }
    }
@ -398,6 +405,8 @@ void nfs_proxy_t::check_default_pool()
            if (p.second.name == default_pool)
            {
                default_pool_id = p.first;
+                pool_block_size = p.second.pg_stripe_size;
+                pool_alignment = p.second.bitmap_granularity;
                break;
            }
        }
@ -421,6 +430,10 @@ void nfs_proxy_t::do_accept(int listen_fd)
        int one = 1;
        setsockopt(nfs_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
        auto cli = new nfs_client_t();
+        if (fs_kv_inode)
+            nfs_kv_procs(cli);
+        else
+            nfs_block_procs(cli);
        cli->parent = this;
        cli->nfs_fd = nfs_fd;
        for (auto & fn: pmap.proc_table)
--- a/src/nfs_proxy.h
+++ b/src/nfs_proxy.h
@ -4,17 +4,18 @@
 #include "epoll_manager.h"
 #include "nfs_portmap.h"
 #include "nfs/xdr_impl.h"
+#include "kv_db.h"

+#define NFS_ROOT_HANDLE "R"
 #define RPC_INIT_BUF_SIZE 32768
+#define MAX_REQUEST_SIZE 128*1024*1024
+#define TRUE 1
+#define FALSE 0

 class cli_tool_t;

-struct nfs_dir_t
-{
-    uint64_t id;
-    uint64_t mod_rev;
-    timespec mtime;
-};
+struct kv_fs_state_t;
+struct block_fs_state_t;

 class nfs_proxy_t
 {
@ -27,28 +28,28 @@ public:
    std::string export_root;
    bool portmap_enabled;
    unsigned nfs_port;
+    uint64_t fs_kv_inode = 0;
+    uint64_t fs_base_inode = 0;
+    uint64_t fs_inode_count = 0;
+    int readdir_getattr_parallel = 8, id_alloc_batch_size = 200;
+    int trace = 0;

    pool_id_t default_pool_id;
+    uint64_t pool_block_size = 0;
+    uint64_t pool_alignment = 0;
+    uint64_t shared_inode_threshold = 0;

    portmap_service_t pmap;
    ring_loop_t *ringloop = NULL;
    epoll_manager_t *epmgr = NULL;
    cluster_client_t *cli = NULL;
    cli_tool_t *cmd = NULL;
+    kv_dbw_t *db = NULL;
+    kv_fs_state_t *kvfs = NULL;
+    block_fs_state_t *blockfs = NULL;

    std::vector<XDR*> xdr_pool;

-    // filehandle = "S"+base64(sha256(full name with prefix)) or "roothandle" for mount root)
-
-    uint64_t next_dir_id = 2;
-    // filehandle => dir with name_prefix
-    std::map<std::string, std::string> dir_by_hash;
-    // dir with name_prefix => dir info
-    std::map<std::string, nfs_dir_t> dir_info;
-    // filehandle => inode ID
-    std::map<std::string, inode_t> inode_by_hash;
-    // inode ID => filehandle
-    std::map<inode_t, std::string> hash_by_inode;
    // inode ID => statistics
    std::map<inode_t, json11::Json> inode_stats;
    // pool ID => statistics
@ -86,28 +87,6 @@ struct rpc_free_buffer_t
    unsigned size;
 };

-struct extend_size_t
-{
-    inode_t inode;
-    uint64_t new_size;
-};
-
-inline bool operator < (const extend_size_t &a, const extend_size_t &b)
-{
-    return a.inode < b.inode || a.inode == b.inode && a.new_size < b.new_size;
-}
-
-struct extend_write_t
-{
-    rpc_op_t *rop;
-    int resize_res, write_res; // 1 = started, 0 = completed OK, -errno = completed with error
-};
-
-struct extend_inode_t
-{
-    uint64_t cur_extend = 0, next_extend = 0;
-};
-
 class nfs_client_t
 {
 public:
@ -122,8 +101,6 @@ public:
    rpc_cur_buffer_t cur_buffer = { 0 };
    std::map<uint8_t*, rpc_used_buffer_t> used_buffers;
    std::vector<rpc_free_buffer_t> free_buffers;
-    std::map<inode_t, extend_inode_t> extends;
-    std::multimap<extend_size_t, extend_write_t> extend_writes;

    iovec read_iov;
    msghdr read_msg = { 0 };
@ -133,9 +110,6 @@ public:
    std::vector<iovec> send_list, next_send_list;
    std::vector<rpc_op_t*> outbox, next_outbox;

-    nfs_client_t();
-    ~nfs_client_t();
-
    void select_read_buffer(unsigned wanted_size);
    void submit_read(unsigned wanted_size);
    void handle_read(int result);
--- a/src/osd.cpp
+++ b/src/osd.cpp
@ -233,6 +233,8 @@ void osd_t::parse_config(bool init)
        ? 10 : config["recovery_tune_agg_interval"].uint64_value();
    recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null()
        ? 10 : config["recovery_tune_sleep_min_us"].uint64_value();
+    recovery_tune_sleep_cutoff_us = config["recovery_tune_sleep_cutoff_us"].is_null()
+        ? 10000000 : config["recovery_tune_sleep_cutoff_us"].uint64_value();
    recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
    if (recovery_pg_switch < 1)
        recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
@ -473,14 +475,14 @@ void osd_t::print_stats()
            if (msgr.stats.op_stat_bytes[i] != 0)
            {
                printf(
-                    "[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
+                    "[OSD %ju] avg latency for op %d (%s): %ju us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
                    (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
                    (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
                );
            }
            else
            {
-                printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
+                printf("[OSD %ju] avg latency for op %d (%s): %ju us\n", osd_num, i, osd_op_names[i], avg);
            }
            prev_stats.op_stat_count[i] = msgr.stats.op_stat_count[i];
            prev_stats.op_stat_sum[i] = msgr.stats.op_stat_sum[i];
@ -492,7 +494,7 @@ void osd_t::print_stats()
        if (msgr.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
        {
            uint64_t avg = (msgr.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(msgr.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
-            printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
+            printf("[OSD %ju] avg latency for subop %d (%s): %jd us\n", osd_num, i, osd_op_names[i], avg);
            prev_stats.subop_stat_count[i] = msgr.stats.subop_stat_count[i];
            prev_stats.subop_stat_sum[i] = msgr.stats.subop_stat_sum[i];
        }
@ -503,7 +505,7 @@ void osd_t::print_stats()
        {
            uint64_t bw = (recovery_stat[i].bytes - recovery_print_prev[i].bytes) / print_stats_interval;
            printf(
-                "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s, avg latency %ld us, delay %ld us\n", osd_num, recovery_stat_names[i],
+                "[OSD %ju] %s recovery: %.1f op/s, B/W: %.2f %s, avg latency %jd us, delay %jd us\n", osd_num, recovery_stat_names[i],
                (recovery_stat[i].count - recovery_print_prev[i].count) * 1.0 / print_stats_interval,
                (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
                (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s")),
@ -515,19 +517,19 @@ void osd_t::print_stats()
    memcpy(recovery_print_prev, recovery_stat, sizeof(recovery_stat));
    if (corrupted_objects > 0)
    {
-        printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
+        printf("[OSD %ju] %ju object(s) corrupted\n", osd_num, corrupted_objects);
    }
    if (incomplete_objects > 0)
    {
-        printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
+        printf("[OSD %ju] %ju object(s) incomplete\n", osd_num, incomplete_objects);
    }
    if (degraded_objects > 0)
    {
-        printf("[OSD %lu] %lu object(s) degraded\n", osd_num, degraded_objects);
+        printf("[OSD %ju] %ju object(s) degraded\n", osd_num, degraded_objects);
    }
    if (misplaced_objects > 0)
    {
-        printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
+        printf("[OSD %ju] %ju object(s) misplaced\n", osd_num, misplaced_objects);
    }
 }

@ -546,27 +548,27 @@ void osd_t::print_slow()
                int l = sizeof(alloc), n;
                char *buf = alloc;
 #define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
-                bufprintf("[OSD %lu] Slow op %lx", osd_num, (unsigned long)op);
+                bufprintf("[OSD %ju] Slow op %jx", osd_num, (uint64_t)op);
                if (kv.second->osd_num)
                {
-                    bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
+                    bufprintf(" from peer OSD %ju (client %d)", kv.second->osd_num, kv.second->peer_fd);
                }
                else
                {
                    bufprintf(" from client %d", kv.second->peer_fd);
                }
-                bufprintf(": %s id=%lu", osd_op_names[op->req.hdr.opcode], op->req.hdr.id);
+                bufprintf(": %s id=%ju", osd_op_names[op->req.hdr.opcode], op->req.hdr.id);
                if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
                    op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE)
                {
-                    bufprintf(" %lx:%lx v", op->req.sec_rw.oid.inode, op->req.sec_rw.oid.stripe);
+                    bufprintf(" %jx:%jx v", op->req.sec_rw.oid.inode, op->req.sec_rw.oid.stripe);
                    if (op->req.sec_rw.version == UINT64_MAX)
                    {
                        bufprintf("%s", "max");
                    }
                    else
                    {
-                        bufprintf("%lu", op->req.sec_rw.version);
+                        bufprintf("%ju", op->req.sec_rw.version);
                    }
                    if (op->req.hdr.opcode != OSD_OP_SEC_DELETE)
                    {
@ -578,17 +580,17 @@ void osd_t::print_slow()
                    for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
                    {
                        obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
-                        bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
+                        bufprintf(i == 0 ? " %jx:%jx v%ju" : ", %jx:%jx v%ju", ov->oid.inode, ov->oid.stripe, ov->version);
                    }
                    if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
                    {
-                        bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
+                        bufprintf(", ... (%ju items)", op->req.sec_stab.len/sizeof(obj_ver_id));
                    }
                }
                else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
                {
                    bufprintf(
-                        " oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
+                        " oid=%jx/%jx-%jx/%jx pg=%u/%u, stripe=%ju, limit=%u",
                        op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
                        op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
                        op->req.sec_list.list_pg, op->req.sec_list.pg_count,
@ -598,7 +600,7 @@ void osd_t::print_slow()
                else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
                    op->req.hdr.opcode == OSD_OP_DELETE)
                {
-                    bufprintf(" inode=%lx offset=%lx len=%x", op->req.rw.inode, op->req.rw.offset, op->req.rw.len);
+                    bufprintf(" inode=%jx offset=%jx len=%x", op->req.rw.inode, op->req.rw.offset, op->req.rw.len);
                }
                if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
                    op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE ||
@ -610,7 +612,7 @@ void osd_t::print_slow()
                    int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
                    if (wait_for)
                    {
-                        bufprintf(" wait=%d (detail=%lu)", wait_for, PRIV(op->bs_op)->wait_detail);
+                        bufprintf(" wait=%d (detail=%ju)", wait_for, PRIV(op->bs_op)->wait_detail);
                    }
                }
                else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
--- a/Show More
+++ b/Show More