Compare commits


2 Commits

SHA1       Message                                        Date
249a233b37 WIP another experiment - "smart" iothreads     2024-07-03 11:05:45 +03:00
           (Some checks failed: Test / test_heal_csum_32k (push) failing after 10m14s; all other Test checks successful.)
d07e072212 Change bool wr to event mask in epoll_manager  2024-07-01 00:30:59 +03:00
152 changed files with 1418 additions and 7998 deletions

View File

@@ -16,7 +16,6 @@ env:
BUILDENV_IMAGE: git.yourcmc.ru/vitalif/vitastor/buildenv
TEST_IMAGE: git.yourcmc.ru/vitalif/vitastor/test
OSD_ARGS: '--etcd_quick_timeout 2000'
USE_RAMDISK: 1
concurrency:
group: ci-${{ github.ref }}
@@ -198,24 +197,6 @@ jobs:
echo ""
done
test_etcd_fail_antietcd:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: ANTIETCD=1 /root/vitastor/tests/test_etcd_fail.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_interrupted_rebalance:
runs-on: ubuntu-latest
needs: build
@@ -684,24 +665,6 @@ jobs:
echo ""
done
test_heal_antietcd:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: ANTIETCD=1 /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_32k_dmj:
runs-on: ubuntu-latest
needs: build

View File

@@ -34,10 +34,6 @@ for my $line (<>)
{
$test_name .= '_imm';
}
elsif ($1 eq 'ANTIETCD')
{
$test_name .= '_antietcd';
}
else
{
$test_name .= '_'.lc($1).'_'.$2;

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VITASTOR_VERSION "1.7.1")
set(VERSION "1.6.1")
add_subdirectory(src)

View File

@@ -1,9 +1,9 @@
VITASTOR_VERSION ?= v1.7.1
VERSION ?= v1.6.1
all: build push
build:
@docker build --rm -t vitalif/vitastor-csi:$(VITASTOR_VERSION) .
@docker build --rm -t vitalif/vitastor-csi:$(VERSION) .
push:
@docker push vitalif/vitastor-csi:$(VITASTOR_VERSION)
@docker push vitalif/vitastor-csi:$(VERSION)

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.7.1
image: vitalif/vitastor-csi:v1.6.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.7.1
image: vitalif/vitastor-csi:v1.6.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.7.1"
vitastorCSIDriverVersion = "1.6.1"
)
// Config struct fills the parameters of request or user input

debian/changelog
View File

@@ -1,4 +1,4 @@
vitastor (1.7.1-1) unstable; urgency=medium
vitastor (1.6.1-1) unstable; urgency=medium
* Bugfixes

View File

@@ -1,3 +1,3 @@
mon usr/lib/vitastor/
mon usr/lib/vitastor/mon
mon/scripts/make-etcd usr/lib/vitastor/mon
mon/scripts/vitastor-mon.service /lib/systemd/system

View File

@@ -6,6 +6,4 @@ if [ "$1" = "configure" ]; then
addgroup --system --quiet vitastor
adduser --system --quiet --ingroup vitastor --no-create-home --home /nonexistent vitastor
mkdir -p /etc/vitastor
mkdir -p /var/lib/vitastor
chown vitastor:vitastor /var/lib/vitastor
fi

View File

@@ -9,12 +9,12 @@ ARG REL=
WORKDIR /root
RUN set -e -x; \
if [ "$REL" = "buster" ]; then \
apt-get update; \
apt-get -y install wget; \
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
@@ -22,9 +22,10 @@ RUN set -e -x; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep fio
RUN apt-get --download-only source fio
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev
ADD . /root/vitastor
RUN set -e -x; \
@@ -36,10 +37,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
FULLVER=$(head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
VER=${FULLVER%%-*}; \
cp -r /root/vitastor vitastor-$VER; \
cd vitastor-$VER; \
cp -r /root/vitastor vitastor-1.6.1; \
cd vitastor-1.6.1; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -51,14 +50,10 @@ RUN set -e -x; \
echo fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL/vitastor-$VER; \
mkdir mon/node_modules; \
cd mon/node_modules; \
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx; \
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER; \
cd vitastor-$VER; \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.6.1.orig.tar.xz vitastor-1.6.1; \
cd vitastor-1.6.1; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/vitastor-$REL/vitastor-*/

View File

@@ -9,7 +9,6 @@
These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
affect their interaction with the cluster.
- [client_iothread_count](#client_iothread_count)
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
@@ -24,23 +23,6 @@ affect their interaction with the cluster.
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
## client_iothread_count
- Type: integer
- Default: 0
Number of separate threads for handling TCP network I/O at client library
side. Enabling 4 threads usually allows to increase peak performance of each
client from approx. 2-3 to 7-8 GByte/s linear read/write and from approx.
100-150 to 400 thousand iops, but at the same time it increases latency.
Latency increase depends on CPU: with CPU power saving disabled latency
only increases by ~10 us (equivalent to Q=1 iops decrease from 10500 to 9500),
with CPU power saving enabled it may be as high as 500 us (equivalent to Q=1
iops decrease from 2000 to 1000). RDMA isn't affected by this option.
It's recommended to enable client I/O threads if you don't use RDMA and want
to increase peak client performance.
## client_retry_interval
- Type: milliseconds
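
For reference, the option removed above is a client-side setting from `/etc/vitastor/vitastor.conf`; a minimal sketch enabling 4 I/O threads as recommended in the removed text (the etcd address is an assumption):

```
{
  "etcd_address": ["10.0.0.1:2379"],
  "client_iothread_count": 4
}
```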

View File

@@ -9,7 +9,6 @@
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
затрагивают логику их работы с кластером.
- [client_iothread_count](#client_iothread_count)
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
@@ -24,24 +23,6 @@
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
## client_iothread_count
- Тип: целое число
- Значение по умолчанию: 0
Число отдельных потоков для обработки ввода-вывода через TCP сеть на стороне
клиентской библиотеки. Включение 4 потоков обычно позволяет поднять пиковую
производительность каждого клиента примерно с 2-3 до 7-8 Гбайт/с линейного
чтения/записи и примерно с 100-150 до 400 тысяч операций ввода-вывода в
секунду, но ухудшает задержку. Увеличение задержки зависит от процессора:
при отключённом энергосбережении CPU это всего ~10 микросекунд (равносильно
падению iops с Q=1 с 10500 до 9500), а при включённом это может быть
и 500 микросекунд (равносильно падению iops с Q=1 с 2000 до 1000). На работу
RDMA данная опция не влияет.
Рекомендуется включать клиентские потоки ввода-вывода, если вы не используете
RDMA и хотите повысить пиковую производительность клиентов.
## client_retry_interval
- Тип: миллисекунды

View File

@@ -56,24 +56,14 @@ Can't be smaller than the OSD data device sector.
## immediate_commit
- Type: string
- Default: all
- Default: false
One of "none", "all" or "small". Global value, may be overridden [at pool level](pool.en.md#immediate_commit).
This parameter is also really important for performance.
TLDR: default "all" is optimal for server-grade SSDs with supercapacitor-based
power loss protection (nonvolatile write-through cache) and also for most HDDs.
"none" or "small" should be only selected if you use desktop SSDs without
capacitors or drives with slow write-back cache that can't be disabled. Check
immediate_commit of your OSDs in [ls-osd](../usage/cli.en.md#ls-osd).
Detailed explanation:
Another parameter which is really important for performance.
Desktop SSDs are very fast (100000+ iops) for simple random writes
without cache flush. However, they are really slow (only around 1000 iops)
if you try to fsync() each write, that is, if you want to guarantee that
each change gets actually persisted to the physical media.
if you try to fsync() each write, that is, when you want to guarantee that
each change gets immediately persisted to the physical media.
Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
"Supercapacitor-based Power Loss Protection", on the other hand, are equally
@@ -85,8 +75,8 @@ really slow when used with desktop SSDs. Vitastor, however, can also
efficiently utilize desktop SSDs by postponing fsync until the client calls
it explicitly.
This is what this parameter regulates. When it's set to "all" Vitastor
cluster commits each change to disks immediately and clients just
This is what this parameter regulates. When it's set to "all" the whole
Vitastor cluster commits each change to disks immediately and clients just
ignore fsyncs because they know for sure that they're unneeded. This reduces
the amount of network roundtrips performed by clients and improves
performance. So it's always better to use server grade SSDs with
@@ -109,5 +99,9 @@ Setting this parameter to "all" or "small" in OSD parameters requires enabling
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
vitastor-disk tries to do that by default, first checking/disabling drive cache.
If it can't disable drive cache, OSDs get initialized with "none".
TLDR: For optimal performance, set immediate_commit to "all" if you only use
SSDs with supercapacitor-based power loss protection (nonvolatile
write-through cache) for both data and journals in the whole Vitastor
cluster. Set it to "small" if you only use such SSDs for journals. Leave
empty if your drives have write-back cache.
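
A short worked example of the advice above (the etcd endpoint is an assumption; the `put` command mirrors the quickstart later in this changeset):

```
# check what your OSDs actually report
vitastor-cli ls-osd
# then set the global value to match your hardware
etcdctl --endpoints=http://10.0.0.1:2379 put /vitastor/config/global '{"immediate_commit":"all"}'
```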

View File

@@ -57,18 +57,9 @@ amplification) и эффективность распределения нагр
## immediate_commit
- Тип: строка
- Значение по умолчанию: all
- Значение по умолчанию: false
Одно из значений "none", "small" или "all". Глобальное значение, может быть
переопределено [на уровне пула](pool.ru.md#immediate_commit).
Данный параметр тоже важен для производительности.
Вкратце: значение по умолчанию "all" оптимально для всех серверных SSD с
суперконденсаторами и также для большинства HDD. "none" и "small" имеет смысл
устанавливать только при использовании SSD настольного класса без
суперконденсаторов или дисков с медленным неотключаемым кэшем записи.
Проверьте настройку immediate_commit своих OSD в выводе команды [ls-osd](../usage/cli.ru.md#ls-osd).
Ещё один важный для производительности параметр.
Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
секунду) при простой случайной записи без сбросов кэша. Однако они очень
@@ -89,7 +80,7 @@ Power Loss Protection" - одинаково быстрые и со сбросо
эффективно утилизировать настольные SSD.
Данный параметр влияет как раз на это. Когда он установлен в значение "all",
кластер Vitastor мгновенно фиксирует каждое изменение на физические
весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
@@ -115,3 +106,10 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
Итого, вкратце: для оптимальной производительности установите
immediate_commit в значение "all", если вы используете в кластере только SSD
с суперконденсаторами и для данных, и для журналов. Если вы используете
такие SSD для всех журналов, но не для данных - можете установить параметр
в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
оставьте параметр пустым.

View File

@@ -8,14 +8,6 @@
These parameters only apply to Monitors.
- [use_antietcd](#use_antietcd)
- [enable_prometheus](#enable_prometheus)
- [mon_http_port](#mon_http_port)
- [mon_http_ip](#mon_http_ip)
- [mon_https_cert](#mon_https_cert)
- [mon_https_key](#mon_https_key)
- [mon_https_client_auth](#mon_https_client_auth)
- [mon_https_ca](#mon_https_ca)
- [etcd_mon_ttl](#etcd_mon_ttl)
- [etcd_mon_timeout](#etcd_mon_timeout)
- [etcd_mon_retries](#etcd_mon_retries)
@@ -25,87 +17,6 @@ These parameters only apply to Monitors.
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## use_antietcd
- Type: boolean
- Default: false
Enable experimental built-in etcd replacement (clustered key-value database):
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
When set to true, monitor runs internal antietcd automatically if it finds
a network interface with an IP address matching one of addresses in the
`etcd_address` configuration option (in `/etc/vitastor/vitastor.conf` or in
the monitor command line). If there are multiple matching addresses, it also
checks `antietcd_port` and antietcd is started for address with matching port.
By default, antietcd accepts connection on the selected IP address, but it
can also be overridden manually in the `antietcd_ip` option.
When antietcd is started, monitor stores cluster metadata itself and exposes
an etcd-compatible REST API. On disk, these metadata are stored in
`/var/lib/vitastor/mon_2379.json.gz` (can be overridden in antietcd_data_file
or antietcd_data_dir options). All other antietcd parameters
(see [here](https://git.yourcmc.ru/vitalif/antietcd/)) except node_id,
cluster, cluster_key, persist_filter, stale_read can also be set in
Vitastor configuration with `antietcd_` prefix.
You can dump/load data to or from antietcd using Antietcd `anticli` tool:
```
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
```
## enable_prometheus
- Type: boolean
- Default: true
Enable built-in Prometheus metrics exporter at mon_http_port (8060 by default).
Note that only the active (master) monitor exposes metrics, others return
HTTP 503. So you should add all monitor URLs to your Prometheus job configuration.
Grafana dashboard suitable for this exporter is here: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
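
A minimal Prometheus scrape job matching this note, with all monitors listed (hostnames are assumptions):

```
scrape_configs:
  - job_name: 'vitastor'
    static_configs:
      - targets: ['mon1:8060', 'mon2:8060', 'mon3:8060']
```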
## mon_http_port
- Type: integer
- Default: 8060
HTTP port for monitors to listen on (including metrics exporter)
## mon_http_ip
- Type: string
IP address for monitors to listen on (all addresses by default)
## mon_https_cert
- Type: string
Path to PEM SSL certificate file for monitor to listen using HTTPS
## mon_https_key
- Type: string
Path to PEM SSL private key file for monitor to listen using HTTPS
## mon_https_client_auth
- Type: boolean
- Default: false
Enable HTTPS client certificate-based authorization for monitor connections
## mon_https_ca
- Type: string
Path to CA certificate for client HTTPS authorization
## etcd_mon_ttl
- Type: seconds
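
For illustration, a hedged `vitastor.conf` sketch that triggers the antietcd autostart behaviour described above (the address is an assumption and must match a local interface):

```
{
  "etcd_address": ["10.0.0.10:2379"],
  "use_antietcd": true
}
```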

View File

@@ -8,14 +8,6 @@
Данные параметры используются только мониторами Vitastor.
- [use_antietcd](#use_antietcd)
- [enable_prometheus](#enable_prometheus)
- [mon_http_port](#mon_http_port)
- [mon_http_ip](#mon_http_ip)
- [mon_https_cert](#mon_https_cert)
- [mon_https_key](#mon_https_key)
- [mon_https_client_auth](#mon_https_client_auth)
- [mon_https_ca](#mon_https_ca)
- [etcd_mon_ttl](#etcd_mon_ttl)
- [etcd_mon_timeout](#etcd_mon_timeout)
- [etcd_mon_retries](#etcd_mon_retries)
@@ -25,89 +17,6 @@
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## use_antietcd
- Тип: булево (да/нет)
- Значение по умолчанию: false
Включить экспериментальный встроенный заменитель etcd (кластерную БД ключ-значение):
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
Если параметр установлен в true, монитор запускает antietcd автоматически,
если обнаруживает сетевой интерфейс с одним из адресов, указанных в опции
конфигурации `etcd_address` (в `/etc/vitastor/vitastor.conf` или в опциях
командной строки монитора). Если таких адресов несколько, также проверяется
опция `antietcd_port` и antietcd запускается для адреса с соответствующим
портом. По умолчанию antietcd принимает подключения по выбранному совпадающему
IP, но его также можно определить вручную опцией `antietcd_ip`.
При запуске antietcd монитор сам хранит центральные метаданные кластера и
выставляет etcd-совместимое REST API. На диске эти метаданные хранятся в файле
`/var/lib/vitastor/mon_2379.json.gz` (можно переопределить параметрами
antietcd_data_file или antietcd_data_dir). Все остальные параметры antietcd
(смотрите [по ссылке](https://git.yourcmc.ru/vitalif/antietcd/)), за исключением
node_id, cluster, cluster_key, persist_filter, stale_read также можно задавать
в конфигурации Vitastor с префиксом `antietcd_`.
Вы можете выгружать/загружать данные в или из antietcd с помощью его инструмента
`anticli`:
```
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
```
## enable_prometheus
- Тип: булево (да/нет)
- Значение по умолчанию: true
Включить встроенный Prometheus-экспортер метрик на порту mon_http_port (по умолчанию 8060).
Обратите внимание, что метрики выставляет только активный (главный) монитор, остальные
возвращают статус HTTP 503, поэтому вам следует добавлять адреса всех мониторов
в задание по сбору метрик Prometheus.
Дашборд для Grafana, подходящий для этого экспортера: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
## mon_http_port
- Тип: целое число
- Значение по умолчанию: 8060
Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
## mon_http_ip
- Тип: строка
IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
## mon_https_cert
- Тип: строка
Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
## mon_https_key
- Тип: строка
Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
## mon_https_client_auth
- Тип: булево (да/нет)
- Значение по умолчанию: false
Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
## mon_https_ca
- Тип: строка
Путь к удостоверяющему сертификату для авторизации клиентских HTTPS соединений
## etcd_mon_ttl
- Тип: секунды

View File

@@ -10,7 +10,6 @@ These parameters only apply to OSDs, are not fixed at the moment of OSD drive
initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
- [osd_iothread_count](#osd_iothread_count)
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
@@ -62,18 +61,6 @@ them, even without restarting by updating configuration in etcd.
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
## osd_iothread_count
- Type: integer
- Default: 0
TCP network I/O thread count for OSD. When non-zero, a single OSD process
may handle more TCP I/O, but at a cost of increased latency because thread
switching overhead occurs. RDMA isn't affected by this option.
Because of latency, instead of enabling OSD I/O threads it's recommended to
just create multiple OSDs per disk, or use RDMA.
## etcd_report_interval
- Type: seconds
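
A sketch of the two options discussed above (the device path is an assumption; `--osd_per_disk` is taken from the vitastor-disk documentation):

```
# enabling OSD I/O threads in /etc/vitastor/vitastor.conf:
#   {"osd_iothread_count": 4}
# recommended alternative: create several OSDs per drive instead
vitastor-disk prepare --osd_per_disk 2 /dev/nvme0n1
```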

View File

@@ -11,7 +11,6 @@
момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
изменения конфигурации в etcd.
- [osd_iothread_count](#osd_iothread_count)
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
@@ -63,19 +62,6 @@
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
## osd_iothread_count
- Тип: целое число
- Значение по умолчанию: 0
Число отдельных потоков для обработки ввода-вывода через TCP-сеть на
стороне OSD. Включение опции позволяет каждому отдельному OSD передавать
по сети больше данных, но ухудшает задержку из-за накладных расходов
переключения потоков. На работу RDMA опция не влияет.
Из-за задержек вместо включения потоков ввода-вывода OSD рекомендуется
просто создавать по несколько OSD на каждом диске, или использовать RDMA.
## etcd_report_interval
- Тип: секунды

View File

@@ -1,32 +1,3 @@
- name: client_iothread_count
type: int
default: 0
online: false
info: |
Number of separate threads for handling TCP network I/O at client library
side. Enabling 4 threads usually allows to increase peak performance of each
client from approx. 2-3 to 7-8 GByte/s linear read/write and from approx.
100-150 to 400 thousand iops, but at the same time it increases latency.
Latency increase depends on CPU: with CPU power saving disabled latency
only increases by ~10 us (equivalent to Q=1 iops decrease from 10500 to 9500),
with CPU power saving enabled it may be as high as 500 us (equivalent to Q=1
iops decrease from 2000 to 1000). RDMA isn't affected by this option.
It's recommended to enable client I/O threads if you don't use RDMA and want
to increase peak client performance.
info_ru: |
Число отдельных потоков для обработки ввода-вывода через TCP сеть на стороне
клиентской библиотеки. Включение 4 потоков обычно позволяет поднять пиковую
производительность каждого клиента примерно с 2-3 до 7-8 Гбайт/с линейного
чтения/записи и примерно с 100-150 до 400 тысяч операций ввода-вывода в
секунду, но ухудшает задержку. Увеличение задержки зависит от процессора:
при отключённом энергосбережении CPU это всего ~10 микросекунд (равносильно
падению iops с Q=1 с 10500 до 9500), а при включённом это может быть
и 500 микросекунд (равносильно падению iops с Q=1 с 2000 до 1000). На работу
RDMA данная опция не влияет.
Рекомендуется включать клиентские потоки ввода-вывода, если вы не используете
RDMA и хотите повысить пиковую производительность клиентов.
- name: client_retry_interval
type: ms
min: 10

View File

@@ -47,24 +47,14 @@
Не может быть меньше размера сектора дисков данных OSD.
- name: immediate_commit
type: string
default: all
default: false
info: |
One of "none", "all" or "small". Global value, may be overridden [at pool level](pool.en.md#immediate_commit).
This parameter is also really important for performance.
TLDR: default "all" is optimal for server-grade SSDs with supercapacitor-based
power loss protection (nonvolatile write-through cache) and also for most HDDs.
"none" or "small" should be only selected if you use desktop SSDs without
capacitors or drives with slow write-back cache that can't be disabled. Check
immediate_commit of your OSDs in [ls-osd](../usage/cli.en.md#ls-osd).
Detailed explanation:
Another parameter which is really important for performance.
Desktop SSDs are very fast (100000+ iops) for simple random writes
without cache flush. However, they are really slow (only around 1000 iops)
if you try to fsync() each write, that is, if you want to guarantee that
each change gets actually persisted to the physical media.
if you try to fsync() each write, that is, when you want to guarantee that
each change gets immediately persisted to the physical media.
Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
"Supercapacitor-based Power Loss Protection", on the other hand, are equally
@@ -76,8 +66,8 @@
efficiently utilize desktop SSDs by postponing fsync until the client calls
it explicitly.
This is what this parameter regulates. When it's set to "all" Vitastor
cluster commits each change to disks immediately and clients just
This is what this parameter regulates. When it's set to "all" the whole
Vitastor cluster commits each change to disks immediately and clients just
ignore fsyncs because they know for sure that they're unneeded. This reduces
the amount of network roundtrips performed by clients and improves
performance. So it's always better to use server grade SSDs with
@@ -100,19 +90,14 @@
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
vitastor-disk tries to do that by default, first checking/disabling drive cache.
If it can't disable drive cache, OSDs get initialized with "none".
TLDR: For optimal performance, set immediate_commit to "all" if you only use
SSDs with supercapacitor-based power loss protection (nonvolatile
write-through cache) for both data and journals in the whole Vitastor
cluster. Set it to "small" if you only use such SSDs for journals. Leave
empty if your drives have write-back cache.
info_ru: |
Одно из значений "none", "small" или "all". Глобальное значение, может быть
переопределено [на уровне пула](pool.ru.md#immediate_commit).
Данный параметр тоже важен для производительности.
Вкратце: значение по умолчанию "all" оптимально для всех серверных SSD с
суперконденсаторами и также для большинства HDD. "none" и "small" имеет смысл
устанавливать только при использовании SSD настольного класса без
суперконденсаторов или дисков с медленным неотключаемым кэшем записи.
Проверьте настройку immediate_commit своих OSD в выводе команды [ls-osd](../usage/cli.ru.md#ls-osd).
Ещё один важный для производительности параметр.
Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
секунду) при простой случайной записи без сбросов кэша. Однако они очень
@@ -133,7 +118,7 @@
эффективно утилизировать настольные SSD.
Данный параметр влияет как раз на это. Когда он установлен в значение "all",
кластер Vitastor мгновенно фиксирует каждое изменение на физические
весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
@@ -159,3 +144,10 @@
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
Итого, вкратце: для оптимальной производительности установите
immediate_commit в значение "all", если вы используете в кластере только SSD
с суперконденсаторами и для данных, и для журналов. Если вы используете
такие SSD для всех журналов, но не для данных - можете установить параметр
в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
оставьте параметр пустым.

View File

@@ -1,103 +1,3 @@
- name: use_antietcd
type: bool
default: false
info: |
Enable experimental built-in etcd replacement (clustered key-value database):
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
When set to true, monitor runs internal antietcd automatically if it finds
a network interface with an IP address matching one of addresses in the
`etcd_address` configuration option (in `/etc/vitastor/vitastor.conf` or in
the monitor command line). If there are multiple matching addresses, it also
checks `antietcd_port` and antietcd is started for address with matching port.
By default, antietcd accepts connection on the selected IP address, but it
can also be overridden manually in the `antietcd_ip` option.
When antietcd is started, monitor stores cluster metadata itself and exposes
an etcd-compatible REST API. On disk, these metadata are stored in
`/var/lib/vitastor/mon_2379.json.gz` (can be overridden in antietcd_data_file
or antietcd_data_dir options). All other antietcd parameters
(see [here](https://git.yourcmc.ru/vitalif/antietcd/)) except node_id,
cluster, cluster_key, persist_filter, stale_read can also be set in
Vitastor configuration with `antietcd_` prefix.
You can dump/load data to or from antietcd using Antietcd `anticli` tool:
```
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
```
info_ru: |
Включить экспериментальный встроенный заменитель etcd (кластерную БД ключ-значение):
[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
Если параметр установлен в true, монитор запускает antietcd автоматически,
если обнаруживает сетевой интерфейс с одним из адресов, указанных в опции
конфигурации `etcd_address` (в `/etc/vitastor/vitastor.conf` или в опциях
командной строки монитора). Если таких адресов несколько, также проверяется
опция `antietcd_port` и antietcd запускается для адреса с соответствующим
портом. По умолчанию antietcd принимает подключения по выбранному совпадающему
IP, но его также можно определить вручную опцией `antietcd_ip`.
При запуске antietcd монитор сам хранит центральные метаданные кластера и
выставляет etcd-совместимое REST API. На диске эти метаданные хранятся в файле
`/var/lib/vitastor/mon_2379.json.gz` (можно переопределить параметрами
antietcd_data_file или antietcd_data_dir). Все остальные параметры antietcd
(смотрите [по ссылке](https://git.yourcmc.ru/vitalif/antietcd/)), за исключением
node_id, cluster, cluster_key, persist_filter, stale_read также можно задавать
в конфигурации Vitastor с префиксом `antietcd_`.
Вы можете выгружать/загружать данные в или из antietcd с помощью его инструмента
`anticli`:
```
npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
npm exec anticli -e http://antietcd:2379/v3 load < dump.json
```
- name: enable_prometheus
type: bool
default: true
info: |
Enable built-in Prometheus metrics exporter at mon_http_port (8060 by default).
Note that only the active (master) monitor exposes metrics, others return
HTTP 503. So you should add all monitor URLs to your Prometheus job configuration.
Grafana dashboard suitable for this exporter is here: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
info_ru: |
Включить встроенный Prometheus-экспортер метрик на порту mon_http_port (по умолчанию 8060).
Обратите внимание, что метрики выставляет только активный (главный) монитор, остальные
возвращают статус HTTP 503, поэтому вам следует добавлять адреса всех мониторов
в задание по сбору метрик Prometheus.
Дашборд для Grafana, подходящий для этого экспортера: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
- name: mon_http_port
type: int
default: 8060
info: HTTP port for monitors to listen on (including metrics exporter)
info_ru: Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
- name: mon_http_ip
type: string
info: IP address for monitors to listen on (all addresses by default)
info_ru: IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
- name: mon_https_cert
type: string
info: Path to PEM SSL certificate file for monitor to listen using HTTPS
info_ru: Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
- name: mon_https_key
type: string
info: Path to PEM SSL private key file for monitor to listen using HTTPS
info_ru: Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
- name: mon_https_client_auth
type: bool
default: false
info: Enable HTTPS client certificate-based authorization for monitor connections
info_ru: Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
- name: mon_https_ca
type: string
info: Path to CA certificate for client HTTPS authorization
info_ru: Путь к удостоверяющему сертификату для авторизации клиентских HTTPS соединений
- name: etcd_mon_ttl
type: sec
min: 5

View File

@@ -1,21 +1,3 @@
- name: osd_iothread_count
type: int
default: 0
info: |
TCP network I/O thread count for OSD. When non-zero, a single OSD process
may handle more TCP I/O, but at a cost of increased latency because thread
switching overhead occurs. RDMA isn't affected by this option.
Because of latency, instead of enabling OSD I/O threads it's recommended to
just create multiple OSDs per disk, or use RDMA.
info_ru: |
Число отдельных потоков для обработки ввода-вывода через TCP-сеть на
стороне OSD. Включение опции позволяет каждому отдельному OSD передавать
по сети больше данных, но ухудшает задержку из-за накладных расходов
переключения потоков. На работу RDMA опция не влияет.
Из-за задержек вместо включения потоков ввода-вывода OSD рекомендуется
просто создавать по несколько OSD на каждом диске, или использовать RDMA.
- name: etcd_report_interval
type: sec
default: 5

View File

@@ -16,6 +16,8 @@
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
stable version from 0.9.x branch instead of 1.x
- For Debian 10 (Buster) also enable backports repository:
`deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
## CentOS

View File

@@ -16,6 +16,8 @@
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
- Для Debian 10 (Buster) также включите репозиторий backports:
`deb http://deb.debian.org/debian buster-backports main`
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
## CentOS

View File

@@ -17,10 +17,10 @@ To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported
- Restart pvedaemon: `systemctl restart pvedaemon`
`/etc/pve/storage.cfg` example (the only required option is vitastor_pool, all others
are listed below with their default values; `vitastor_ssd` is Proxmox storage pool id):
are listed below with their default values):
```
vitastor: vitastor_ssd
vitastor: vitastor
# pool to put new images into
vitastor_pool testpool
# path to the configuration file

View File

@@ -16,10 +16,10 @@
- Перезапустите демон Proxmox: `systemctl restart pvedaemon`
Пример `/etc/pve/storage.cfg` (единственная обязательная опция - vitastor_pool, все остальные
перечислены внизу для понимания значений по умолчанию; `vitastor_ssd` - имя хранилища в Proxmox):
перечислены внизу для понимания значений по умолчанию):
```
vitastor: vitastor_ssd
vitastor: vitastor
# Пул, в который будут помещаться образы дисков
vitastor_pool testpool
# Путь к файлу конфигурации

View File

@@ -34,8 +34,6 @@
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
- [Intelligent recovery auto-tuning](../config/osd.en.md#recovery_tune_interval)
- [Clustered file system](../usage/nfs.en.md#vitastorfs)
- [Experimental internal etcd replacement - antietcd](../config/monitor.en.md#use_antietcd)
- [Built-in Prometheus metric exporter](../config/monitor.en.md#enable_prometheus)
## Plugins and tools

View File

@@ -36,8 +36,6 @@
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
- [Интеллектуальная автоподстройка скорости восстановления](../config/osd.ru.md#recovery_tune_interval)
- [Кластерная файловая система](../usage/nfs.ru.md#vitastorfs)
- [Экспериментальная встроенная замена etcd - antietcd](../config/monitor.ru.md#use_antietcd)
- [Встроенный Prometheus-экспортер метрик](../config/monitor.ru.md#enable_prometheus)
## Драйверы и инструменты

View File

@@ -68,6 +68,10 @@ On the monitor hosts:
but some free unpartitioned space must be available because the script creates new partitions for journals.
- You can change OSD configuration in units or in `vitastor.conf`.
Check [Configuration Reference](../config.en.md) for parameter descriptions.
- If all your drives have capacitors (and even if they don't, as long as you didn't pass
`--disable_data_fsync off` to `vitastor-disk` at the first step), put the following
setting into etcd: \
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
- Start all OSDs: `systemctl start vitastor.target`
## Create a pool
@@ -84,10 +88,6 @@ For EC pools the configuration should look like the following:
vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
```
Add `--immediate_commit none` if you added `--disable_data_fsync off` at the OSD
initialization step, or if `vitastor-disk` complained about impossibility to
disable drive cache.
After you do this, one of the monitors will configure PGs and OSDs will start them.
If you use HDDs you should also add `"block_size": 1048576` to pool configuration.
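
For example, an HDD pool with the larger block size could be created like this (the pool name is an assumption, and `--block_size` is assumed to be accepted by create-pool as a pool parameter):

```
vitastor-cli create-pool hddpool --pg_size 2 --pg_count 256 --block_size 1048576
```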

View File

@@ -69,6 +69,11 @@
для журналов, на SSD должно быть доступно свободное нераспределённое место.
- Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Описания параметров
смотрите в [справке по конфигурации](../config.ru.md).
- Если все ваши диски - серверные с конденсаторами, и даже если нет, но при этом
вы не добавляли опцию `--disable_data_fsync off` на первом шаге, а `vitastor-disk`
не ругался на невозможность отключения кэша дисков, пропишите следующую настройку
в глобальную конфигурацию в etcd: \
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`.
- Запустите все OSD: `systemctl start vitastor.target`
## Создайте пул
@@ -85,10 +90,6 @@ vitastor-cli create-pool testpool --pg_size 2 --pg_count 256
vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
```
Добавьте также опцию `--immediate_commit none`, если вы добавляли `--disable_data_fsync off`
на этапе инициализации OSD, либо если `vitastor-disk` ругался на невозможность отключения
кэша дисков.
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
Если вы используете HDD-диски, то добавьте в конфигурацию пулов опцию `"block_size": 1048576`.

View File

@@ -42,7 +42,7 @@ PG state always includes exactly 1 of the following base states:
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
or the primary OSD refuses to start this PG (for example, because of wrong block_size),
or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/pg/config` in etcd.
or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/config/pgs` in etcd.
- **starting** — primary OSD has acquired PG lock in etcd, PG is starting.
- **peering** — primary OSD requests PG object listings from secondary OSDs and calculates
the PG state.
@@ -107,17 +107,16 @@ If a PG is active it can also have any number of the following additional states
## Removing a healthy disk
Before removing a healthy disk from the cluster set its OSD weight(s) to 0 to
move data away. To do that, run `vitastor-cli modify-osd --reweight 0 <OSD_NUMBER>`.
Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
Zero weight can also be put manually into etcd key `/vitastor/config/osd/<OSD_NUMBER>`, for example:
Before removing a healthy disk from the cluster set its OSD weight(s) to 0 to
move data away. To do that, add `"reweight":0` to etcd key `/vitastor/config/osd/<OSD_NUMBER>`.
For example:
```
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
```
Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
## Removing a failed disk
If a disk is already dead, its OSD(s) are likely already stopped.
@@ -150,7 +149,7 @@ POOL_ID=1
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
for i in $(seq 1 $PG_COUNT); do
etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
done
```
@@ -169,51 +168,21 @@ Upgrading is performed without stopping clients (VMs/containers), you just need
upgrade and restart servers one by one. However, ideally you should restart VMs too
to make them use the new version of the client library.
### 1.1.x to 1.2.0
Exceptions (specific upgrade instructions):
- Upgrading <= 1.1.x to 1.2.0 or later, if you use EC n+k with k>=2, is recommended
to be performed with full downtime: first you should stop all clients, then all OSDs,
then upgrade and start everything back — because versions before 1.2.0 have several
bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
- Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
without this intermediate step, client I/O will hang until the end of upgrade process.
- Upgrading from <= 0.5.x to >= 0.6.x is not supported.
Upgrading version <= 1.1.x to version >= 1.2.0, if you use EC n+k with k>=2, is recommended
to be performed with full downtime: first you should stop all clients, then all OSDs,
then upgrade and start everything back — because versions before 1.2.0 have several
bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
### 0.8.7 to 0.9.0
Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
without this intermediate step, client I/O will hang until the end of upgrade process.
### 0.5.x to 0.6.x
Upgrading from <= 0.5.x to >= 0.6.x is not supported.
## Downgrade
Downgrades are also allowed freely, except for the following specific instructions:
### 1.8.0 to 1.7.1
Before downgrading from version >= 1.8.0 to version <= 1.7.1
you have to copy /vitastor/pg/config etcd key to /vitastor/config/pgs:
```
etcdctl --endpoints=http://... get --print-value-only /vitastor/pg/config | \
etcdctl --endpoints=http://... put /vitastor/config/pgs
```
Then you can just install older packages and restart all services.
If you performed downgrade without first copying that key, run "add all OSDs into the
history records of all PGs" from [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration).
### 1.0.0 to 0.9.x
Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 or later can't
be rolled back to 0.9.x or previous versions.
### 0.8.0 to 0.7.x
Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
start with older versions (0.4.x - 0.7.x). :-)
Rollback:
- Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 can't be rolled
back to 0.9.x or previous versions.
- Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
start with 0.7.x or 0.6.x. :-)
## OSD memory usage

View File

@@ -42,7 +42,7 @@
- **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
(если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
либо PG остановлена монитором через флаг `pause: true` в `/vitastor/pg/config` в etcd.
либо PG остановлена монитором через флаг `pause: true` в `/vitastor/config/pgs` в etcd.
- **starting** — первичный OSD захватил блокировку PG в etcd, PG запускается.
- **peering** — первичный OSD опрашивает вторичные OSD на предмет списков объектов данной PG и рассчитывает её состояние.
- **repeering** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **peering**.
@@ -105,16 +105,14 @@ PG должны очень быстро переходить из них в др
## Удаление исправного диска
Перед удалением исправного диска из кластера установите его OSD вес в 0, чтобы убрать с него данные.
Для этого выполните команду `vitastor-cli modify-osd --reweight 0 <НОМЕР_OSD>`.
Дождитесь завершения перебалансировки данных, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
Также вес 0 можно прописать вручную прямо в etcd в ключ `/vitastor/config/osd/<НОМЕР_OSD>`, например:
Для этого добавьте в ключ `/vitastor/config/osd/<НОМЕР_OSD>` в etcd значение `"reweight":0`, например:
```
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
```
Дождитесь завершения ребаланса, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
## Удаление неисправного диска
Если диск уже умер, его OSD, скорее всего, уже будет/будут остановлен(ы).
@@ -147,7 +145,7 @@ POOL_ID=1
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
for i in $(seq 1 $PG_COUNT); do
etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
done
```
@@ -166,51 +164,21 @@ done
достаточно обновлять серверы по одному. Однако, конечно, чтобы запущенные виртуальные машины
начали использовать новую версию клиентской библиотеки, их тоже нужно перезапустить.
### 1.1.x -> 1.2.0
Исключения (особые указания при обновлении):
- Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
могли приводить к некорректному чтению данных в деградированных EC-пулах.
- Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
Иначе клиентский ввод-вывод зависнет до завершения обновления.
- Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
могли приводить к некорректному чтению данных в деградированных EC-пулах.
### 0.8.7 -> 0.9.0
Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
Иначе клиентский ввод-вывод зависнет до завершения обновления.
### 0.5.x -> 0.6.x
Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
## Откат версии
Откат (понижение версии) тоже свободно разрешён, кроме указанных ниже случаев:
### 1.8.0 -> 1.7.1
Перед понижением версии с >= 1.8.0 до <= 1.7.1 вы должны скопировать ключ
etcd `/vitastor/pg/config` в `/vitastor/config/pgs`:
```
etcdctl --endpoints=http://... get --print-value-only /vitastor/pg/config | \
etcdctl --endpoints=http://... put /vitastor/config/pgs
```
После этого можно просто установить более старые пакеты и перезапустить все сервисы.
Если вы откатили версию, не скопировав предварительно этот ключ - выполните "добавление всех
OSD в исторические записи всех PG" из раздела [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов).
### 1.0.0 -> 0.9.x
В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
нельзя откатить до версии 0.9.x и более ранних.
### 0.8.0 -> 0.7.x
В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD не запустятся на
более ранних версиях (0.4.x - 0.7.x). :-)
Откат:
- В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
нельзя откатить до версии 0.9.x и более ранних.
- В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD нельзя откатить
до 0.7.x или 0.6.x. :-)
## Потребление памяти OSD

View File

@@ -24,10 +24,6 @@ It supports the following commands:
- [fix](#fix)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
- [osd-tree](#osd-tree)
- [ls-osd](#ls-osd)
- [modify-osd](#modify-osd)
- [pg-list](#pg-list)
- [create-pool](#create-pool)
- [modify-pool](#modify-pool)
- [ls-pools](#ls-pools)
@@ -178,7 +174,6 @@ Remove inode data without changing metadata.
--wait-list Retrieve full objects listings before starting to remove objects.
Requires more memory, but allows to show correct removal progress.
--min-offset Purge only data starting with specified offset.
--max-offset Purge only data before specified offset.
```
## merge-data
@@ -251,82 +246,6 @@ Refuses to remove OSDs with data without `--force` and `--allow-data-loss`.
With `--dry-run` only checks if deletion is possible without data loss and
redundancy degradation.
## osd-tree
`vitastor-cli osd-tree [-l|--long]`
Show current OSD tree, optionally with I/O statistics if -l is specified.
Example output:
```
TYPE NAME UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
host kaveri
disk nvme0n1p1
osd 3 down 100G 0 % abc,kaveri 1 128k 4k none -
osd 4 down 100G 0 % 1 128k 4k none -
disk nvme1n1p1
osd 5 down 100G 0 % abc,kaveri 1 128k 4k none -
osd 6 down 100G 0 % 1 128k 4k none -
host stump
osd 1 up 100G 37.29 % osdone 1 128k 4k all -
osd 2 up 100G 26.8 % abc 1 128k 4k all -
osd 7 up 100G 21.84 % 1 128k 4k all -
osd 8 up 100G 21.63 % 1 128k 4k all -
osd 9 up 100G 20.69 % 1 128k 4k all -
osd 10 up 100G 21.61 % 1 128k 4k all -
osd 11 up 100G 21.53 % 1 128k 4k all -
osd 12 up 100G 22.4 % 1 128k 4k all -
```
## ls-osd
`vitastor-cli osds|ls-osd|osd-ls [-l|--long]`
Show current OSDs as list, optionally with I/O statistics if -l is specified.
Example output:
```
OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
3 kaveri/nvme0n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
4 kaveri/nvme0n1p1 down 100G 0 % 1 128k 4k none -
5 kaveri/nvme1n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
6 kaveri/nvme1n1p1 down 100G 0 % 1 128k 4k none -
1 stump up 100G 37.29 % osdone 1 128k 4k all -
2 stump up 100G 26.8 % globl 1 128k 4k all -
7 stump up 100G 21.84 % 1 128k 4k all -
8 stump up 100G 21.63 % 1 128k 4k all -
9 stump up 100G 20.69 % 1 128k 4k all -
10 stump up 100G 21.61 % 1 128k 4k all -
11 stump up 100G 21.53 % 1 128k 4k all -
12 stump up 100G 22.4 % 1 128k 4k all -
```
## modify-osd
`vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>`
Set OSD reweight, tags or noout flag. See detail description in [OSD config documentation](../config/pool.en.md#osd-settings).
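
Example (the OSD number and tags are illustrative):

```
vitastor-cli modify-osd --noout true --tags ssd,rack1 12
```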
## pg-list
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:
```
--pool <pool name or number> Only list PGs of the given pool.
--min <min pg number> Only list PGs with number >= min.
--max <max pg number> Only list PGs with number <= max.
```
Examples:
`vitastor-cli pg-list active+degraded`
`vitastor-cli pg-list ^active`
## create-pool
`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`

View File

@@ -23,10 +23,6 @@ vitastor-cli - интерфейс командной строки для адм
- [merge-data](#merge-data)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
- [osd-tree](#osd-tree)
- [ls-osd](#ls-osd)
- [modify-osd](#modify-osd)
- [pg-list](#pg-list)
- [create-pool](#create-pool)
- [modify-pool](#modify-pool)
- [ls-pools](#ls-pools)
@@ -186,7 +182,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
--wait-list Сначала запросить полный листинг объектов, а потом начать удалять.
Требует больше памяти, но позволяет правильно печатать прогресс удаления.
--min-offset Удалять только данные, начиная с заданного смещения.
--max-offset Удалять только данные до (исключительно) заданного смещения.
```
## merge-data
@@ -268,83 +263,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
С опцией `--dry-run` только проверяет, возможно ли удаление без потери данных и деградации
избыточности.
## osd-tree
`vitastor-cli osd-tree [-l|--long]`
Показать дерево OSD, со статистикой ввода-вывода, если установлено -l.
Пример вывода:
```
TYPE NAME UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
host kaveri
disk nvme0n1p1
osd 3 down 100G 0 % globl,kaveri 1 128k 4k none -
osd 4 down 100G 0 % 1 128k 4k none -
disk nvme1n1p1
osd 5 down 100G 0 % globl,kaveri 1 128k 4k none -
osd 6 down 100G 0 % 1 128k 4k none -
host stump
osd 1 up 100G 37.29 % osdone 1 128k 4k all -
osd 2 up 100G 26.8 % globl 1 128k 4k all -
osd 7 up 100G 21.84 % 1 128k 4k all -
osd 8 up 100G 21.63 % 1 128k 4k all -
osd 9 up 100G 20.69 % 1 128k 4k all -
osd 10 up 100G 21.61 % 1 128k 4k all -
osd 11 up 100G 21.53 % 1 128k 4k all -
osd 12 up 100G 22.4 % 1 128k 4k all -
```
## ls-osd
`vitastor-cli osds|ls-osd|osd-ls [-l|--long]`
Показать список OSD, со статистикой ввода-вывода, если установлено -l.
Пример вывода:
```
OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP IMM NOOUT
3 kaveri/nvme0n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
4 kaveri/nvme0n1p1 down 100G 0 % 1 128k 4k none -
5 kaveri/nvme1n1p1 down 100G 0 % globl,kaveri 1 128k 4k none -
6 kaveri/nvme1n1p1 down 100G 0 % 1 128k 4k none -
1 stump up 100G 37.29 % osdone 1 128k 4k all -
2 stump up 100G 26.8 % globl 1 128k 4k all -
7 stump up 100G 21.84 % 1 128k 4k all -
8 stump up 100G 21.63 % 1 128k 4k all -
9 stump up 100G 20.69 % 1 128k 4k all -
10 stump up 100G 21.61 % 1 128k 4k all -
11 stump up 100G 21.53 % 1 128k 4k all -
12 stump up 100G 22.4 % 1 128k 4k all -
```
## modify-osd
`vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>`
Set OSD reweight, tags or the noout flag. See the detailed description in the [OSD settings documentation](../config/pool.ru.md#настройки-osd).
## pg-list
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
List PGs with states matching any of the given filters (a leading ^ or !
negates the filter). Options:
```
--pool <pool name or number> Only list PGs of the given pool.
--min <min pg number> Only list PGs with number >= min.
--max <max pg number> Only list PGs with number <= max.
```
Examples:
`vitastor-cli pg-list active+degraded`
`vitastor-cli pg-list ^active`
## create-pool
`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`


@@ -11,8 +11,6 @@ Vitastor has two file system implementations. Both can be used via `vitastor-nfs
Commands:
- [mount](#mount)
- [start](#start)
- [upgrade](#upgrade)
- [defrag](#defrag)
## Pseudo-FS
@@ -88,6 +86,10 @@ POSIX features currently not implemented in VitastorFS:
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
Other notable missing features which should be addressed in the future:
- Defragmentation of "shared" inodes. Files smaller than the pool object size (block_size
multiplied by the data part count if the pool is EC) are internally stored in large block
volumes sequentially, one after another, and leave garbage behind after deletion or resizing.
A defragmenter will be implemented to collect this garbage.
- Inode ID reuse. Inode IDs currently only grow; the limit is 2^48 inodes, so in theory
you may hit it if you create and delete a very large number of files
- Compaction of the key-value B-Tree. Current implementation never merges or deletes
@@ -137,37 +139,6 @@ Start network NFS server. Options:
| `--port <PORT>` | use port \<PORT> for NFS services (default is 2049) |
| `--portmap 0` | do not listen on port 111 (portmap/rpcbind, requires root) |
### upgrade
`vitastor-nfs --fs <NAME> upgrade`
Upgrade FS metadata. Can be run online, but server(s) should be restarted after upgrade.
### defrag
`vitastor-nfs --fs <NAME> defrag [OPTIONS] [--dry-run]`
Defragment volumes used for small-file storage that have more than \<defrag_percent>%
of their data removed. Can be run online.
In VitastorFS, small files are stored in large "volumes" / "shared inodes" one
after another. When you delete or extend such files, they are moved and garbage is left
behind. Defragmentation removes garbage and moves data still in use to new volumes.
Options:
| <!-- --> | <!-- --> |
|--------------------------|------------------------------------------------------------------------ |
| --volume_untouched 86400 | Defragment volumes last appended to at least this number of seconds ago |
| --defrag_percent 50 | Defragment volumes with at least this % of removed data |
| --defrag_block_count 16 | Read this number of pool blocks at once during defrag |
| --defrag_iodepth 16 | Move up to this number of files in parallel during defrag |
| --trace | Print verbose defragmentation status |
| --dry-run | Skip modifications, only print status |
| --recalc-stats | Recalculate all volume statistics |
| --include-empty | Include old and empty volumes; make sure to restart NFS servers before using it |
| --no-rm | Move, but do not delete data |
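For example, a dry run that would only report volumes with at least 30% of data removed might look like this (the FS name is illustrative):
`vitastor-nfs --fs myfs defrag --defrag_percent 30 --dry-run`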
## Common options
| <!-- --> | <!-- --> |


@@ -11,8 +11,6 @@
Commands:
- [mount](#mount)
- [start](#start)
- [upgrade](#upgrade)
- [defrag](#defrag)
## Pseudo-FS
@@ -90,6 +88,11 @@ JSON format :-). To inspect the contents of the DB
- Modification times (`mtime`) are tracked asynchronously (as if the FS were mounted with `-o lazytime`)
Other missing features to be added in the future:
- Defragmentation of "shared inodes". At the FS implementation level, files smaller than the
pool object size (block_size multiplied by the number of data parts if the pool is EC) are
packed one after another into large "shared" inodes/volumes. If such files are deleted or
extended, they are moved and leave "garbage" behind, which is exactly where a defragmenter
is needed.
- Inode number reuse. In the current implementation inode numbers always grow, so in theory
you may hit the limit if you create and delete more than 2^48 files.
@@ -142,40 +145,6 @@ JSON format :-). To inspect the contents of the DB
| `--port <PORT>` | use port \<PORT> for NFS services (2049 by default) |
| `--portmap 0` | disable the portmap/rpcbind service on port 111 (enabled by default; requires root privileges) |
### upgrade
`vitastor-nfs --fs <NAME> upgrade`
Upgrade FS metadata. Can be run online (with NFS servers running), but it is still
advisable to restart them afterwards.
### defrag
`vitastor-nfs --fs <NAME> defrag [OPTIONS] [--dry-run]`
Defragment volumes used for small-file storage in which more than <defrag_percent>
percent of data has been removed. Can be run online.
At the FS implementation level, files smaller than the pool object size (block_size multiplied
by the number of data parts if the pool is EC) are packed one after another into large
"volumes" / "shared inodes". When such files are deleted or extended, they are moved and
leave "garbage" behind. Defragmentation removes the garbage and moves the still-used data
into new volumes.
Options:
| <!-- --> | <!-- --> |
|--------------------------|------------------------------------------------------------------------ |
| --volume_untouched 86400 | Only defragment volumes that have not been written to for this many seconds |
| --defrag_percent 50 | Only defragment volumes with at least this % of data removed |
| --defrag_block_count 16 | Read this number of pool blocks at once |
| --defrag_iodepth 16 | Move up to this number of files in parallel |
| --trace | Print detailed defragmentation statistics |
| --dry-run | Do not make any changes, only describe the actions that would be performed |
| --recalc-stats | Recalculate and save statistics of all volumes |
| --include-empty | Defragment old and empty volumes; make sure to restart NFS servers after using this option |
| --no-rm | Move, but do not delete data |
## Common options
| <!-- --> | <!-- --> |


@@ -1,188 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const AntiEtcd = require('antietcd');
const vitastor_persist_filter = require('./vitastor_persist_filter.js');
const { b64, local_ips } = require('./utils.js');
class AntiEtcdAdapter
{
static async start_antietcd(config)
{
let antietcd;
if (config.use_antietcd)
{
let cluster = config.etcd_address;
if (!(cluster instanceof Array))
cluster = cluster ? (''+(cluster||'')).split(/,+/) : [];
cluster = Object.keys(cluster.reduce((a, url) =>
{
a[url.toLowerCase().replace(/^(https?:\/\/)/, '').replace(/\/.*$/, '')] = true;
return a;
}, {}));
const cfg_port = config.antietcd_port;
const is_local = local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
const selected = cluster.map(s => s.split(':', 2)).filter(ip => is_local[ip[0]] && (!cfg_port || ip[1] == cfg_port));
if (selected.length > 1)
{
console.error('More than 1 etcd_address matches local IPs, please specify port');
process.exit(1);
}
else if (selected.length == 1)
{
const antietcd_config = {
ip: selected[0][0],
port: selected[0][1],
data: config.antietcd_data_file || ((config.antietcd_data_dir || '/var/lib/vitastor') + '/mon_'+selected[0][1]+'.json.gz'),
persist_filter: vitastor_persist_filter({ vitastor_prefix: config.etcd_prefix || '/vitastor' }),
node_id: selected[0][0]+':'+selected[0][1], // node_id = ip:port
cluster: (cluster.length == 1 ? null : cluster.reduce((a, c) => { a[c] = "http://"+c; return a; }, {})),
cluster_key: (config.etcd_prefix || '/vitastor'),
stale_read: 1,
log_level: 1,
};
for (const key in config)
{
if (key.substr(0, 9) === 'antietcd_')
{
const noprefix = key.substr(9);
if (!(noprefix in antietcd_config) || noprefix == 'ip' || noprefix == 'cluster_key')
{
antietcd_config[noprefix] = config[key];
}
}
}
console.log('Starting Antietcd node '+antietcd_config.node_id);
antietcd = new AntiEtcd(antietcd_config);
await antietcd.start();
}
else
{
console.log('Antietcd is enabled, but etcd_address does not contain local IPs, proceeding without it');
}
}
return antietcd;
}
constructor(mon, antietcd)
{
this.mon = mon;
this.antietcd = antietcd;
this.on_leader = [];
this.on_change = (st) =>
{
if (st.state === 'leader')
{
for (const cb of this.on_leader)
{
cb();
}
this.on_leader = [];
}
};
this.antietcd.on('raftchange', this.on_change);
}
parse_config(/*config*/)
{
}
stop_watcher()
{
this.antietcd.off('raftchange', this.on_change);
const watch_id = this.watch_id;
if (watch_id)
{
this.watch_id = null;
this.antietcd.cancel_watch(watch_id).catch(console.error);
}
}
async start_watcher()
{
if (this.watch_id)
{
await this.antietcd.cancel_watch(this.watch_id);
this.watch_id = null;
}
const watch_id = await this.antietcd.create_watch({
key: b64(this.mon.config.etcd_prefix+'/'),
range_end: b64(this.mon.config.etcd_prefix+'0'),
start_revision: ''+this.mon.etcd_watch_revision,
watch_id: 1,
progress_notify: true,
}, (message) =>
{
setImmediate(() => this.mon.on_message(message.result));
});
console.log('Successfully subscribed to antietcd revision '+this.antietcd.etctree.mod_revision);
this.watch_id = watch_id;
}
async become_master()
{
if (!this.antietcd.cluster)
{
console.log('Running in non-clustered mode');
}
else
{
console.log('Waiting to become master');
if (this.antietcd.cluster.raft.state !== 'leader')
{
await new Promise(ok => this.on_leader.push(ok));
}
}
const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id };
await this.etcd_call('/kv/txn', {
success: [ { requestPut: { key: b64(this.mon.config.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.mon.etcd_lease_id } } ],
}, this.mon.config.etcd_start_timeout, 0);
if (this.antietcd.cluster)
{
console.log('Became master');
}
}
async etcd_call(path, body, timeout, retries)
{
let retry = 0;
if (retries >= 0 && retries < 1)
{
retries = 1;
}
let prev = 0;
while (retries < 0 || retry < retries)
{
retry++;
if (this.mon.stopped)
{
throw new Error('Monitor instance is stopped');
}
try
{
if (Date.now()-prev < timeout)
{
await new Promise(ok => setTimeout(ok, timeout-(Date.now()-prev)));
}
prev = Date.now();
const res = await this.antietcd.api(path.replace(/^\/+/, '').replace(/\/+$/, '').replace(/\/+/g, '_'), body);
if (res.error)
{
console.error('Failed to query antietcd '+path+' (retry '+retry+'/'+retries+'): '+res.error);
}
else
{
return res;
}
}
catch (e)
{
console.error('Failed to query antietcd '+path+' (retry '+retry+'/'+retries+'): '+e.stack);
}
}
throw new Error('Failed to query antietcd ('+retries+' retries)');
}
}
module.exports = AntiEtcdAdapter;
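For reference, the monitor wires this adapter in roughly as follows; this is a condensed sketch of the (async) mon.js code shown further below, not an additional API:
```
const AntiEtcdAdapter = require('./antietcd_adapter.js');
// Starts an embedded Antietcd instance if use_antietcd is enabled and
// etcd_address contains a local IP; resolves to undefined otherwise
const antietcd = await AntiEtcdAdapter.start_antietcd(config);
const mon = new Mon(config);
mon.etcd = antietcd ? new AntiEtcdAdapter(mon, antietcd) : new EtcdAdapter(mon);
mon.etcd.parse_config(mon.config);
```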


@@ -3,7 +3,6 @@
const http = require('http');
const WebSocket = require('ws');
const { b64, local_ips } = require('./utils.js');
const MON_STOPPED = 'Monitor instance is stopped';
@@ -24,7 +23,7 @@ class EtcdAdapter
parse_etcd_addresses(addrs)
{
const is_local_ip = local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
const is_local_ip = this.mon.local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
this.etcd_local = [];
this.etcd_urls = [];
this.selected_etcd_url = null;
@@ -349,4 +348,9 @@ function POST(url, body, timeout)
});
}
function b64(str)
{
return Buffer.from(str).toString('base64');
}
module.exports = EtcdAdapter;


@@ -6,7 +6,7 @@ const etcd_nonempty_keys = {
'config/global': 1,
'config/node_placement': 1,
'config/pools': 1,
'pg/config': 1,
'config/pgs': 1,
'history/last_clean_pgs': 1,
'stats': 1,
};
@@ -15,8 +15,7 @@ const etcd_allow = new RegExp('^'+[
'config/node_placement',
'config/pools',
'config/osd/[1-9]\\d*',
'config/pgs', // old name
'pg/config',
'config/pgs',
'config/inode/[1-9]\\d*/[1-9]\\d*',
'osd/state/[1-9]\\d*',
'osd/stats/[1-9]\\d*',
@@ -25,8 +24,7 @@ const etcd_allow = new RegExp('^'+[
'mon/master',
'mon/member/[a-f0-9]+',
'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*', // old name
'pgstats/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*',
'history/last_clean_pgs',
'inode/stats/[1-9]\\d*/\\d+',
@@ -71,7 +69,7 @@ const etcd_tree = {
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: 'all', // 'none', 'all' or 'small'
immediate_commit: false, // 'all' or 'small'
// client - configurable online
client_max_dirty_bytes: 33554432,
client_max_dirty_ops: 1024,
@@ -191,7 +189,7 @@ const etcd_tree = {
block_size: 131072,
bitmap_granularity: 4096,
// 'all'/'small'/'none', same as in OSD options
immediate_commit: 'all',
immediate_commit: 'none',
pg_stripe_size: 0,
root_node?: 'rack1',
// restrict pool to OSDs having all of these tags
@@ -207,6 +205,19 @@ const etcd_tree = {
osd: {
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
},
/* pgs: {
hash: string,
items: {
<pool_id>: {
<pg_id>: {
osd_set: [ 1, 2, 3 ],
primary: 1,
pause: false,
}
}
}
}, */
pgs: {},
/* inode: {
<pool_id>: {
<inode_t>: {
@@ -234,9 +245,6 @@ const etcd_tree = {
stats: {
/* <osd_num_t>: {
time: number, // unix time
data_block_size: uint64_t, // bytes
bitmap_granularity: uint64_t, // bytes
immediate_commit: "all"|"small"|"none",
blockstore_ready: boolean,
size: uint64_t, // bytes
free: uint64_t, // bytes
@@ -274,24 +282,11 @@ const etcd_tree = {
master: {
/* ip: [ string ], id: uint64_t */
},
member: {
standby: {
/* <uint64_t>: { ip: [ string ] }, */
},
},
pg: {
/* config: {
hash: string,
items: {
<pool_id>: {
<pg_id>: {
osd_set: [ 1, 2, 3 ],
primary: 1,
pause: false,
}
}
}
}, */
config: {},
state: {
/* <pool_id>: {
<pg_id>: {
@@ -302,6 +297,18 @@ const etcd_tree = {
}
}, */
},
stats: {
/* <pool_id>: {
<pg_id>: {
object_count: uint64_t,
clean_count: uint64_t,
misplaced_count: uint64_t,
degraded_count: uint64_t,
incomplete_count: uint64_t,
write_osd_set: osd_num_t[],
},
}, */
},
history: {
/* <pool_id>: {
<pg_id>: {
@@ -313,18 +320,6 @@ const etcd_tree = {
}, */
},
},
pgstats: {
/* <pool_id>: {
<pg_id>: {
object_count: uint64_t,
clean_count: uint64_t,
misplaced_count: uint64_t,
degraded_count: uint64_t,
incomplete_count: uint64_t,
write_osd_set: osd_num_t[],
},
}, */
},
inode: {
stats: {
/* <pool_id>: {


@@ -1,50 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const fsp = require('fs').promises;
const http = require('http');
const https = require('https');
async function create_http_server(cfg, handler)
{
let server;
if (cfg.mon_https_cert)
{
const tls = {
key: await fsp.readFile(cfg.mon_https_key),
cert: await fsp.readFile(cfg.mon_https_cert),
};
if (cfg.mon_https_ca)
{
tls.ca = await fsp.readFile(cfg.mon_https_ca); // https.createServer() expects the CA bundle in the 'ca' option
}
if (cfg.mon_https_client_auth)
{
tls.requestCert = true;
}
server = https.createServer(tls, handler);
}
else
{
server = http.createServer(handler);
}
try
{
let err;
server.once('error', e => err = e);
server.listen(cfg.mon_http_port || 8060, cfg.mon_http_ip || undefined);
if (err)
throw err;
}
catch (e)
{
console.error(
'HTTP server disabled because listening at address '+
(cfg.mon_http_ip || '')+':'+(cfg.mon_http_port || 8060)+' failed with error: '+e
);
return null;
}
return server;
}
module.exports = { create_http_server };
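A minimal usage sketch, to be run inside an async context (the handler is an ordinary Node.js request handler; the port value is illustrative):
```
const { create_http_server } = require('./http_server.js');
const server = await create_http_server({ mon_http_port: 8060 }, (req, res) =>
{
    res.writeHead(200);
    res.end('ok\n');
});
// server is null if listening failed (e.g. the address is already in use)
```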

View File

@@ -23,4 +23,4 @@ for (let i = 2; i < process.argv.length; i++)
}
}
Mon.run_forever(options).catch(console.error);
Mon.run_forever(options);


@@ -1,43 +1,27 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { URL } = require('url');
const fs = require('fs');
const crypto = require('crypto');
const os = require('os');
const AntiEtcdAdapter = require('./antietcd_adapter.js');
const EtcdAdapter = require('./etcd_adapter.js');
const { create_http_server } = require('./http_server.js');
const { export_prometheus_metrics } = require('./prometheus.js');
const { etcd_tree, etcd_allow, etcd_nonempty_keys } = require('./etcd_schema.js');
const { validate_pool_cfg } = require('./pool_config.js');
const { sum_op_stats, sum_object_counts, sum_inode_stats, serialize_bigints } = require('./stats.js');
const stableStringify = require('./stable-stringify.js');
const { scale_pg_history } = require('./pg_utils.js');
const { get_osd_tree } = require('./osd_tree.js');
const { b64, de64, local_ips } = require('./utils.js');
const { recheck_primary, save_new_pgs_txn, generate_pool_pgs } = require('./pg_gen.js');
class Mon
{
static async run_forever(config)
static run_forever(config)
{
let mergedConfig = config;
if (fs.existsSync(config.config_path||'/etc/vitastor/vitastor.conf'))
{
const fileConfig = JSON.parse(fs.readFileSync(config.config_path||'/etc/vitastor/vitastor.conf', { encoding: 'utf-8' }));
mergedConfig = { ...fileConfig, ...config };
}
let antietcd = await AntiEtcdAdapter.start_antietcd(mergedConfig);
let mon;
const run = () =>
{
console.log('Starting Monitor');
const my_mon = new Mon(config);
my_mon.etcd = antietcd
? new AntiEtcdAdapter(my_mon, antietcd)
: new EtcdAdapter(my_mon);
my_mon.etcd.parse_config(my_mon.config);
mon = my_mon;
my_mon.on_die = () =>
{
@@ -74,57 +58,24 @@ class Mon
this.state = JSON.parse(JSON.stringify(etcd_tree));
this.prev_stats = { osd_stats: {}, osd_diff: {} };
this.recheck_pgs_active = false;
this.watcher_active = false;
this.old_pg_config = false;
this.old_pg_stats_seen = false;
this.etcd = new EtcdAdapter(this);
this.etcd.parse_config(this.config);
}
async start()
{
if (this.config.enable_prometheus || !('enable_prometheus' in this.config))
{
this.http = await create_http_server(this.config, (req, res) =>
{
const u = new URL(req.url, 'http://'+(req.headers.host || 'localhost'));
if (u.pathname.replace(/\/+$/, '') == (this.config.prometheus_path||'/metrics'))
{
if (!this.watcher_active)
{
res.writeHead(503);
res.write('Monitor is in standby mode. Please retrieve metrics from master monitor instance\n');
}
else
{
res.write(export_prometheus_metrics(this.state));
}
}
else
{
res.writeHead(404);
res.write('Not found. Metrics path: '+(this.config.prometheus_path||'/metrics')+'\n');
}
res.end();
});
this.http_connections = new Set();
this.http.on('connection', conn =>
{
this.http_connections.add(conn);
conn.once('close', () => this.http_connections.delete(conn));
});
}
await this.load_config();
await this.get_lease();
await this.etcd.become_master();
await this.load_cluster_state();
await this.etcd.start_watcher(this.config.etcd_mon_retries);
this.watcher_active = true;
for (const pool_id in this.state.config.pools)
{
if (!this.state.pool.stats[pool_id] ||
!Number(this.state.pool.stats[pool_id].pg_real_size))
{
// Generate missing data in etcd
this.state.pg.config.hash = null;
this.state.config.pgs.hash = null;
break;
}
}
@@ -196,22 +147,17 @@ class Mon
this.etcd_watch_revision = BigInt(msg.header.revision)+BigInt(1);
for (const e of msg.events||[])
{
const kv = this.parse_kv(e.kv);
const key = kv.key.substr(this.config.etcd_prefix.length);
this.parse_kv(e.kv);
const key = e.kv.key.substr(this.config.etcd_prefix.length);
if (key.substr(0, 11) == '/osd/state/')
{
stats_changed = true;
changed = true;
}
else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 9) == '/pgstats/' || key.substr(0, 16) == '/osd/inodestats/')
else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
{
stats_changed = true;
}
else if (key.substr(0, 10) == '/pg/stats/')
{
this.old_pg_stats_seen = true;
stats_changed = true;
}
else if (key.substr(0, 10) == '/pg/state/')
{
pg_states_changed = true;
@@ -222,7 +168,7 @@ class Mon
}
if (this.config.verbose)
{
console.log(JSON.stringify({ ...e, kv: kv || undefined }));
console.log(JSON.stringify(e));
}
}
if (pg_states_changed)
@@ -292,7 +238,7 @@ class Mon
continue next_pool;
}
}
new_clean_pgs.items[pool_id] = this.state.pg.config.items[pool_id];
new_clean_pgs.items[pool_id] = this.state.config.pgs.items[pool_id];
}
this.state.history.last_clean_pgs = new_clean_pgs;
await this.etcd.etcd_call('/kv/txn', {
@@ -306,7 +252,7 @@ class Mon
get_mon_state()
{
return { ip: local_ips(), hostname: os.hostname() };
return { ip: this.local_ips(), hostname: os.hostname() };
}
async get_lease()
@@ -338,16 +284,6 @@ class Mon
async on_stop()
{
console.log('Stopping Monitor');
if (this.http)
{
await new Promise(ok =>
{
this.http.close(ok);
for (const conn of this.http_connections)
conn.destroy();
});
this.http = null;
}
this.etcd.stop_watcher();
if (this.save_last_clean_timer)
{
@@ -403,50 +339,6 @@ class Mon
this.parse_kv(kv);
}
}
if (Object.keys((this.state.config.pgs||{}).items||{}).length)
{
// Support seamless upgrade to new OSDs
if (!Object.keys((this.state.pg.config||{}).items||{}).length)
{
const pgs = JSON.stringify(this.state.config.pgs);
this.state.pg.config = JSON.parse(pgs);
const res = await this.etcd.etcd_call('/kv/txn', {
success: [
{ requestPut: { key: b64(this.config.etcd_prefix+'/pg/config'), value: b64(pgs) } },
],
compare: [
{ key: b64(this.config.etcd_prefix+'/pg/config'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
],
}, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
if (!res.succeeded)
throw new Error('Failed to duplicate old PG config to new PG config');
}
this.old_pg_config = true;
this.old_pg_config_timer = setInterval(() => this.check_clear_old_config().catch(console.error),
this.config.old_pg_config_clear_interval||3600000);
}
}
async check_clear_old_config()
{
if (this.old_pg_config && this.old_pg_stats_seen)
{
this.old_pg_stats_seen = false;
return;
}
if (this.old_pg_config)
{
await this.etcd.etcd_call('/kv/txn', { success: [
{ requestDeleteRange: { key: b64(this.config.etcd_prefix+'/config/pgs') } },
{ requestDeleteRange: { key: b64(this.config.etcd_prefix+'/pg/stats/'), range_end: b64(this.config.etcd_prefix+'/pg/stats0') } },
] }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
this.old_pg_config = false;
}
if (this.old_pg_config_timer)
{
clearInterval(this.old_pg_config_timer);
this.old_pg_config_timer = null;
}
}
all_osds()
@@ -457,7 +349,7 @@ class Mon
async stop_all_pgs(pool_id)
{
let has_online = false, paused = true;
for (const pg in this.state.pg.config.items[pool_id]||{})
for (const pg in this.state.config.pgs.items[pool_id]||{})
{
// FIXME: Change all (||{}) to ?. (optional chaining) at some point
const cur_state = (((this.state.pg.state[pool_id]||{})[pg]||{}).state||[]).join(',');
@@ -465,7 +357,7 @@ class Mon
{
has_online = true;
}
if (!this.state.pg.config.items[pool_id][pg].pause)
if (!this.state.config.pgs.items[pool_id][pg].pause)
{
paused = false;
}
@@ -473,7 +365,7 @@ class Mon
if (!paused)
{
console.log('Stopping all PGs for pool '+pool_id+' before changing PG count');
const new_cfg = JSON.parse(JSON.stringify(this.state.pg.config));
const new_cfg = JSON.parse(JSON.stringify(this.state.config.pgs));
for (const pg in new_cfg.items[pool_id])
{
new_cfg.items[pool_id][pg].pause = true;
@@ -481,26 +373,22 @@ class Mon
// Check that no OSDs change their state before we pause PGs
// Doing this we make sure that OSDs don't wake up in the middle of our "transaction"
// and can't see the old PG configuration
const checks = [
{ key: b64(this.config.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.config.etcd_prefix+'/pg/config'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
];
const checks = [];
for (const osd_num of this.all_osds())
{
const key = b64(this.config.etcd_prefix+'/osd/state/'+osd_num);
checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
}
const txn = {
compare: checks,
success: [
{ requestPut: { key: b64(this.config.etcd_prefix+'/pg/config'), value: b64(JSON.stringify(new_cfg)) } },
await this.etcd.etcd_call('/kv/txn', {
compare: [
{ key: b64(this.config.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.config.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
...checks,
],
};
if (this.old_pg_config)
{
txn.success.push({ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } });
}
await this.etcd.etcd_call('/kv/txn', txn, this.config.etcd_mon_timeout, 0);
success: [
{ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_cfg)) } },
],
}, this.config.etcd_mon_timeout, 0);
return false;
}
return !has_online;
@@ -528,7 +416,7 @@ class Mon
pools: this.state.config.pools,
};
const tree_hash = sha1hex(stableStringify(tree_cfg));
if (this.state.pg.config.hash != tree_hash)
if (this.state.config.pgs.hash != tree_hash)
{
// Something has changed
console.log('Pool configuration or OSD tree changed, re-optimizing');
@@ -569,10 +457,10 @@ class Mon
else
{
// Nothing changed, but we still want to recheck the distribution of primaries
let new_pg_config = recheck_primary(this.state, this.config, up_osds, osd_tree);
if (new_pg_config)
let new_config_pgs = recheck_primary(this.state, this.config, up_osds, osd_tree);
if (new_config_pgs)
{
const ok = await this.save_pg_config(new_pg_config);
const ok = await this.save_pg_config(new_config_pgs);
if (ok)
console.log('PG configuration successfully changed');
else
@@ -587,12 +475,12 @@ class Mon
async apply_pool_pgs(results, up_osds, osd_tree, tree_hash)
{
for (const pool_id in (this.state.pg.config||{}).items||{})
for (const pool_id in (this.state.config.pgs||{}).items||{})
{
// We should stop all PGs when deleting a pool or changing its PG count
if (!this.state.config.pools[pool_id] ||
this.state.pg.config.items[pool_id] && this.state.config.pools[pool_id].pg_count !=
Object.keys(this.state.pg.config.items[pool_id]).reduce((a, c) => (a < (0|c) ? (0|c) : a), 0))
this.state.config.pgs.items[pool_id] && this.state.config.pools[pool_id].pg_count !=
Object.keys(this.state.config.pgs.items[pool_id]).reduce((a, c) => (a < (0|c) ? (0|c) : a), 0))
{
if (!await this.stop_all_pgs(pool_id))
{
@@ -600,22 +488,22 @@ class Mon
}
}
}
const new_pg_config = JSON.parse(JSON.stringify(this.state.pg.config));
const new_config_pgs = JSON.parse(JSON.stringify(this.state.config.pgs));
const etcd_request = { compare: [], success: [] };
for (const pool_id in (new_pg_config||{}).items||{})
for (const pool_id in (new_config_pgs||{}).items||{})
{
if (!this.state.config.pools[pool_id])
{
const prev_pgs = [];
for (const pg in new_pg_config.items[pool_id]||{})
for (const pg in new_config_pgs.items[pool_id]||{})
{
prev_pgs[pg-1] = new_pg_config.items[pool_id][pg].osd_set;
prev_pgs[pg-1] = new_config_pgs.items[pool_id][pg].osd_set;
}
// Also delete pool statistics
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
} });
save_new_pgs_txn(new_pg_config, etcd_request, this.state, this.config.etcd_prefix,
save_new_pgs_txn(new_config_pgs, etcd_request, this.state, this.config.etcd_prefix,
this.etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, [], []);
}
}
@@ -624,7 +512,7 @@ class Mon
const pool_id = pool_res.pool_id;
const pool_cfg = this.state.config.pools[pool_id];
let pg_history = [];
for (const pg in ((this.state.pg.config.items||{})[pool_id]||{}))
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
{
if (this.state.pg.history[pool_id] &&
this.state.pg.history[pool_id][pg])
@@ -633,9 +521,9 @@ class Mon
}
}
const real_prev_pgs = [];
for (const pg in ((this.state.pg.config.items||{})[pool_id]||{}))
for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
{
real_prev_pgs[pg-1] = [ ...this.state.pg.config.items[pool_id][pg].osd_set ];
real_prev_pgs[pg-1] = [ ...this.state.config.pgs.items[pool_id][pg].osd_set ];
}
if (real_prev_pgs.length > 0 && real_prev_pgs.length != pool_res.pgs.length)
{
@@ -646,8 +534,8 @@ class Mon
pg_history = scale_pg_history(pg_history, real_prev_pgs, pool_res.pgs);
// Drop stats
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'0'),
key: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'0'),
} });
}
const stats = {
@@ -658,26 +546,22 @@ class Mon
key: b64(this.config.etcd_prefix+'/pool/stats/'+pool_id),
value: b64(JSON.stringify(stats)),
} });
save_new_pgs_txn(new_pg_config, etcd_request, this.state, this.config.etcd_prefix,
save_new_pgs_txn(new_config_pgs, etcd_request, this.state, this.config.etcd_prefix,
this.etcd_watch_revision, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history);
}
new_pg_config.hash = tree_hash;
return await this.save_pg_config(new_pg_config, etcd_request);
new_config_pgs.hash = tree_hash;
return await this.save_pg_config(new_config_pgs, etcd_request);
}
async save_pg_config(new_pg_config, etcd_request = { compare: [], success: [] })
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
{
etcd_request.compare.push(
{ key: b64(this.config.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.config.etcd_prefix+'/pg/config'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
{ key: b64(this.config.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
);
etcd_request.success.push(
{ requestPut: { key: b64(this.config.etcd_prefix+'/pg/config'), value: b64(JSON.stringify(new_pg_config)) } },
{ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_config_pgs)) } },
);
if (this.old_pg_config)
{
etcd_request.success.push({ requestPut: { key: b64(this.config.etcd_prefix+'/config/pgs'), value: b64(JSON.stringify(new_pg_config)) } });
}
const txn_res = await this.etcd.etcd_call('/kv/txn', etcd_request, this.config.etcd_mon_timeout, 0);
return txn_res.succeeded;
}
@@ -806,16 +690,15 @@ class Mon
{
if (!kv || !kv.key)
{
return kv;
return;
}
kv = { ...kv };
kv.key = de64(kv.key);
kv.value = kv.value ? de64(kv.value) : null;
let key = kv.key.substr(this.config.etcd_prefix.length+1);
if (!etcd_allow.exec(key))
{
console.log('Bad key in etcd: '+kv.key+' = '+kv.value);
return kv;
return;
}
try
{
@@ -824,7 +707,7 @@ class Mon
catch (e)
{
console.log('Bad value in etcd: '+kv.key+' = '+kv.value);
return kv;
return;
}
let key_parts = key.split('/');
let cur = this.state;
@@ -838,14 +721,7 @@ class Mon
kv.value = kv.value || {};
}
const old = cur[key_parts[key_parts.length-1]];
if (kv.value == null)
{
delete cur[key_parts[key_parts.length-1]];
}
else
{
cur[key_parts[key_parts.length-1]] = kv.value;
}
cur[key_parts[key_parts.length-1]] = kv.value;
if (key === 'config/global')
{
this.config = { ...this.fileConfig, ...this.state.config.global, ...this.cliConfig };
@@ -881,7 +757,6 @@ class Mon
!this.state.osd.stats[osd_num] ? 0 : this.state.osd.stats[osd_num].time+this.config.osd_out_time
);
}
return kv;
}
_die(err)
@@ -891,6 +766,33 @@ class Mon
this.on_stop().catch(console.error);
this.on_die();
}
local_ips(all)
{
const ips = [];
const ifaces = os.networkInterfaces();
for (const ifname in ifaces)
{
for (const iface of ifaces[ifname])
{
if (iface.family == 'IPv4' && !iface.internal || all)
{
ips.push(iface.address);
}
}
}
return ips;
}
}
function b64(str)
{
return Buffer.from(str).toString('base64');
}
function de64(str)
{
return Buffer.from(str, 'base64').toString();
}
function sha1hex(str)


@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.7.1",
"version": "1.6.1",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {
@@ -9,7 +9,6 @@
"author": "Vitaliy Filippov",
"license": "UNLICENSED",
"dependencies": {
"antietcd": "^1.1.0",
"sprintf-js": "^1.1.2",
"ws": "^7.2.5"
},


@@ -57,7 +57,7 @@ function pick_primary(pool_config, osd_set, up_osds, aff_osds)
function recheck_primary(state, global_config, up_osds, osd_tree)
{
let new_pg_config;
let new_config_pgs;
for (const pool_id in state.config.pools)
{
const pool_cfg = state.config.pools[pool_id];
@@ -69,30 +69,30 @@ function recheck_primary(state, global_config, up_osds, osd_tree)
reset_rng();
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
{
if (!state.pg.config.items[pool_id])
if (!state.config.pgs.items[pool_id])
{
continue;
}
const pg_cfg = state.pg.config.items[pool_id][pg_num];
const pg_cfg = state.config.pgs.items[pool_id][pg_num];
if (pg_cfg)
{
const new_primary = pick_primary(state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
if (pg_cfg.primary != new_primary)
{
if (!new_pg_config)
if (!new_config_pgs)
{
new_pg_config = JSON.parse(JSON.stringify(state.pg.config));
new_config_pgs = JSON.parse(JSON.stringify(state.config.pgs));
}
console.log(
`Moving pool ${pool_id} (${pool_cfg.name || 'unnamed'}) PG ${pg_num}`+
` primary OSD from ${pg_cfg.primary} to ${new_primary}`
);
new_pg_config.items[pool_id][pg_num].primary = new_primary;
new_config_pgs.items[pool_id][pg_num].primary = new_primary;
}
}
}
}
return new_pg_config;
return new_config_pgs;
}
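For context, a condensed sketch of how the monitor consumes this return value (mirroring the mon.js hunk above):
```
const new_config_pgs = recheck_primary(this.state, this.config, up_osds, osd_tree);
if (new_config_pgs)
{
    const ok = await this.save_pg_config(new_config_pgs);
    console.log(ok ? 'PG configuration successfully changed' : 'PG configuration change failed');
}
```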
function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
@@ -174,7 +174,7 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
state.osd.stats,
pool_cfg.block_size || global_config.block_size || 131072,
pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096,
pool_cfg.immediate_commit || global_config.immediate_commit || 'all'
pool_cfg.immediate_commit || global_config.immediate_commit || 'none'
);
pool_tree = make_hier_tree(global_config, pool_tree);
// First try last_clean_pgs to minimize data movement
@@ -185,10 +185,10 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
}
if (!prev_pgs.length)
{
// Fall back to pg/config if it's empty
for (const pg in ((state.pg.config.items||{})[pool_id]||{}))
// Fall back to config/pgs if it's empty
for (const pg in ((state.config.pgs.items||{})[pool_id]||{}))
{
prev_pgs[pg-1] = [ ...state.pg.config.items[pool_id][pg].osd_set ];
prev_pgs[pg-1] = [ ...state.config.pgs.items[pool_id][pg].osd_set ];
}
}
const old_pg_count = prev_pgs.length;
@@ -205,8 +205,8 @@ async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels
ordered: pool_cfg.scheme != 'replicated',
};
let optimize_result;
// Re-shuffle PGs if pg/config.hash is empty
if (old_pg_count > 0 && state.pg.config.hash)
// Re-shuffle PGs if config/pgs.hash is empty
if (old_pg_count > 0 && state.config.pgs.hash)
{
if (prev_pgs.length != pool_cfg.pg_count)
{


@@ -1,220 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const metric_help =
`# HELP vitastor_object_bytes Total size of objects in cluster in bytes
# TYPE vitastor_object_bytes gauge
# HELP vitastor_object_count Total number of objects in cluster
# TYPE vitastor_object_count gauge
# HELP vitastor_stat_count Total operation count
# TYPE vitastor_stat_count counter
# HELP vitastor_stat_usec Total operation latency in usec
# TYPE vitastor_stat_usec counter
# HELP vitastor_stat_bytes Total operation size in bytes
# TYPE vitastor_stat_bytes counter
# HELP vitastor_image_raw_used Image raw used size in bytes
# TYPE vitastor_image_raw_used counter
# HELP vitastor_image_stat_count Per-image total operation count
# TYPE vitastor_image_stat_count counter
# HELP vitastor_image_stat_usec Per-image total operation latency
# TYPE vitastor_image_stat_usec counter
# HELP vitastor_image_stat_bytes Per-image total operation size in bytes
# TYPE vitastor_image_stat_bytes counter
# HELP vitastor_osd_status OSD up/down status
# TYPE vitastor_osd_status gauge
# HELP vitastor_osd_size_bytes OSD total space in bytes
# TYPE vitastor_osd_size_bytes gauge
# HELP vitastor_osd_free_bytes OSD free space in bytes
# TYPE vitastor_osd_free_bytes gauge
# HELP vitastor_osd_stat_count Per-OSD total operation count
# TYPE vitastor_osd_stat_count counter
# HELP vitastor_osd_stat_usec Per-OSD total operation latency
# TYPE vitastor_osd_stat_usec counter
# HELP vitastor_osd_stat_bytes Per-OSD total operation size in bytes
# TYPE vitastor_osd_stat_bytes counter
# HELP vitastor_monitor_info Monitor info, 1 is master, 0 is standby
# TYPE vitastor_monitor_info gauge
# HELP vitastor_pool_info Pool configuration (in labels)
# TYPE vitastor_pool_info gauge
# HELP vitastor_pool_status Pool up/down status
# TYPE vitastor_pool_status gauge
# HELP vitastor_pool_raw_to_usable Raw to usable space ratio
# TYPE vitastor_pool_raw_to_usable gauge
# HELP vitastor_pool_space_efficiency Pool space usage efficiency
# TYPE vitastor_pool_space_efficiency gauge
# HELP vitastor_pool_total_raw_tb Total raw space in pool in TB
# TYPE vitastor_pool_total_raw_tb gauge
# HELP vitastor_pool_used_raw_tb Used raw space in pool in TB
# TYPE vitastor_pool_used_raw_tb gauge
# HELP vitastor_pg_count PG counts by state
# TYPE vitastor_pg_count gauge
`;
function export_prometheus_metrics(st)
{
let res = metric_help;
// Global statistics
for (const k in st.stats.object_bytes)
{
res += `vitastor_object_bytes{object_type="${k}"} ${st.stats.object_bytes[k]}\n`;
}
for (const k in st.stats.object_counts)
{
res += `vitastor_object_count{object_type="${k}"} ${st.stats.object_counts[k]}\n`;
}
for (const typ of [ 'op', 'subop', 'recovery' ])
{
for (const op in st.stats[typ+"_stats"]||{})
{
const op_stat = st.stats[typ+"_stats"][op];
for (const key of [ 'count', 'usec', 'bytes' ])
{
res += `vitastor_stat_${key}{op="${op}",op_type="${typ}"} ${op_stat[key]||0}\n`;
}
}
}
// Per-image statistics
for (const pool in st.inode.stats)
{
for (const inode in st.inode.stats[pool])
{
const ist = st.inode.stats[pool][inode];
const inode_name = ((st.config.inode[pool]||{})[inode]||{}).name||'';
const inode_label = `image_name="${addslashes(inode_name)}",inode_num="${inode}",pool_id="${pool}"`;
res += `vitastor_image_raw_used{${inode_label}} ${ist.raw_used||0}\n`;
for (const op of [ 'read', 'write', 'delete' ])
{
for (const k of [ 'count', 'usec', 'bytes' ])
{
if (ist[op])
{
res += `vitastor_image_stat_${k}{${inode_label},op="${op}"} ${ist[op][k]||0}\n`;
}
}
}
}
}
// Per-OSD statistics
for (const osd in st.osd.stats)
{
const osd_stat = st.osd.stats[osd];
const up = st.osd.state[osd] && st.osd.state[osd].state == 'up' ? 1 : 0;
res += `vitastor_osd_status{host="${addslashes(osd_stat.host)}",osd_num="${osd}"} ${up}\n`;
res += `vitastor_osd_size_bytes{osd_num="${osd}"} ${osd_stat.size||0}\n`;
res += `vitastor_osd_free_bytes{osd_num="${osd}"} ${osd_stat.free||0}\n`;
for (const op in osd_stat.op_stats)
{
const ist = osd_stat.op_stats[op];
for (const k of [ 'count', 'usec', 'bytes' ])
{
res += `vitastor_osd_stat_${k}{osd_num="${osd}",op="${op}",op_type="op"} ${ist[k]||0}\n`;
}
}
for (const op in osd_stat.subop_stats)
{
const ist = osd_stat.subop_stats[op];
for (const k of [ 'count', 'usec', 'bytes' ])
{
res += `vitastor_osd_stat_${k}{osd_num="${osd}",op="${op}",op_type="subop"} ${ist[k]||0}\n`;
}
}
}
// Monitor statistics
for (const mon_id in st.mon.member)
{
const mon = st.mon.member[mon_id];
const master = st.mon.master && st.mon.master.id == mon_id ? 1 : 0;
const ip = (mon.ip instanceof Array ? mon.ip[0] : mon.ip) || '';
res += `vitastor_monitor_info{monitor_hostname="${addslashes(mon.hostname)}",monitor_id="${mon_id}",monitor_ip="${addslashes(ip)}"} ${master}\n`;
}
// Per-pool statistics
for (const pool_id in st.config.pools)
{
const pool_cfg = st.config.pools[pool_id];
const pool_label = `pool_id="${pool_id}",pool_name="${addslashes(pool_cfg.name)}"`;
const pool_stat = st.pool.stats[pool_id];
res += `vitastor_pool_info{${pool_label}`+
`,pool_scheme="${addslashes(pool_cfg.scheme)}"`+
`,pg_size="${pool_cfg.pg_size||0}",pg_minsize="${pool_cfg.pg_minsize||0}"`+
`,parity_chunks="${pool_cfg.parity_chunks||0}",pg_count="${pool_cfg.pg_count||0}"`+
`,failure_domain="${addslashes(pool_cfg.failure_domain)}"`+
`} 1\n`;
if (!pool_stat)
{
continue;
}
res += `vitastor_pool_raw_to_usable{${pool_label}} ${pool_stat.raw_to_usable||0}\n`;
res += `vitastor_pool_space_efficiency{${pool_label}} ${pool_stat.space_efficiency||0}\n`;
res += `vitastor_pool_total_raw_tb{${pool_label}} ${pool_stat.total_raw_tb||0}\n`;
res += `vitastor_pool_used_raw_tb{${pool_label}} ${pool_stat.used_raw_tb||0}\n`;
// PG states and pool up/down status
const real_pg_count = (Object.keys(((st.pg.config||{}).items||{})[pool_id]||{}).length) || (0|pool_cfg.pg_count);
const per_state = {
active: 0,
starting: 0,
peering: 0,
incomplete: 0,
repeering: 0,
stopping: 0,
offline: 0,
degraded: 0,
has_inconsistent: 0,
has_corrupted: 0,
has_incomplete: 0,
has_degraded: 0,
has_misplaced: 0,
has_unclean: 0,
has_invalid: 0,
left_on_dead: 0,
scrubbing: 0,
};
const pool_pg_states = st.pg.state[pool_id] || {};
for (let i = 1; i <= real_pg_count; i++)
{
if (!pool_pg_states[i])
{
per_state['offline'] = 1 + (per_state['offline']|0);
}
else
{
for (const st_name of pool_pg_states[i].state)
{
per_state[st_name] = 1 + (per_state[st_name]|0);
}
}
}
for (const st_name in per_state)
{
res += `vitastor_pg_count{pg_state="${st_name}",${pool_label}} ${per_state[st_name]}\n`;
}
const pool_active = per_state['active'] >= real_pg_count ? 1 : 0;
res += `vitastor_pool_status{${pool_label}} ${pool_active}\n`;
}
return res;
}
function addslashes(str)
{
return ((str||'')+'').replace(/(["\n\\])/g, "\\$1"); // escape " \n \
}
module.exports = { export_prometheus_metrics };
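A minimal sketch of how this exporter is served over HTTP (here `mon` is assumed to be a running Mon instance whose `state` mirrors the etcd tree, as in mon.js):
```
const http = require('http');
const { export_prometheus_metrics } = require('./prometheus.js');
// render the whole metrics document from the monitor's in-memory state
http.createServer((req, res) => res.end(export_prometheus_metrics(mon.state)))
    .listen(8060);
```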

File diff suppressed because it is too large


@@ -3,10 +3,10 @@
function derive_osd_stats(st, prev, prev_diff)
{
const diff = prev_diff || { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
{
return diff;
return prev_diff || diff;
}
const timediff = BigInt(st.time*1000 - prev.time*1000);
for (const op in st.op_stats||{})
@@ -17,7 +17,8 @@ function derive_osd_stats(st, prev, prev_diff)
const b = c.bytes - BigInt(pr && pr.bytes||0);
const us = c.usec - BigInt(pr && pr.usec||0);
const n = c.count - BigInt(pr && pr.count||0);
diff.op_stats[op] = { ...c, bps: n > 0 ? b*1000n/timediff : 0n, iops: n > 0 ? n*1000n/timediff : 0n, lat: n > 0 ? us/n : 0n };
if (n > 0)
diff.op_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff, lat: us/n };
}
for (const op in st.subop_stats||{})
{
@@ -26,7 +27,8 @@ function derive_osd_stats(st, prev, prev_diff)
c = { usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
const us = c.usec - BigInt(pr && pr.usec||0);
const n = c.count - BigInt(pr && pr.count||0);
diff.subop_stats[op] = { ...c, iops: n > 0 ? n*1000n/timediff : 0n, lat: n > 0 ? us/n : 0n };
if (n > 0)
diff.subop_stats[op] = { ...c, iops: n*1000n/timediff, lat: us/n };
}
for (const op in st.recovery_stats||{})
{
@@ -35,7 +37,8 @@ function derive_osd_stats(st, prev, prev_diff)
c = { bytes: BigInt(c.bytes||0), count: BigInt(c.count||0) };
const b = c.bytes - BigInt(pr && pr.bytes||0);
const n = c.count - BigInt(pr && pr.count||0);
diff.recovery_stats[op] = { ...c, bps: n > 0 ? b*1000n/timediff : 0n, iops: n > 0 ? n*1000n/timediff : 0n };
if (n > 0)
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
}
for (const pool_id in st.inode_stats||{})
{
@@ -50,9 +53,9 @@ function derive_osd_stats(st, prev, prev_diff)
prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
inode_diff[op] = {
bps: n > 0 ? (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff : 0n,
iops: n > 0 ? n*1000n/timediff : 0n,
lat: n > 0 ? (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/n : 0n,
bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
iops: n*1000n/timediff,
lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
};
}
}
@@ -71,7 +74,7 @@ function sum_op_stats(all_osd, prev_stats)
);
prev_stats.osd_stats[osd] = cur;
}
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: { degraded: {}, misplaced: {} } };
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
// Sum derived values instead of deriving summed
for (const osd in all_osd.state)
{
@@ -100,19 +103,10 @@ function sum_object_counts(state, global_config)
{
const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
const object_bytes = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
let pgstats = state.pgstats;
if (state.pg.stats)
{
// Merge with old stats for seamless transition to new stats
for (const pool_id in state.pg.stats)
{
pgstats[pool_id] = { ...(state.pg.stats[pool_id] || {}), ...(pgstats[pool_id] || {}) };
}
}
for (const pool_id in pgstats)
for (const pool_id in state.pg.stats)
{
let object_size = 0;
for (const osd_num of pgstats[pool_id].write_osd_set||[])
for (const osd_num of state.pg.stats[pool_id].write_osd_set||[])
{
if (osd_num && state.osd.stats[osd_num] && state.osd.stats[osd_num].block_size)
{
@@ -130,9 +124,9 @@ function sum_object_counts(state, global_config)
object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
}
object_size = BigInt(object_size);
for (const pg_num in pgstats[pool_id])
for (const pg_num in state.pg.stats[pool_id])
{
const st = pgstats[pool_id][pg_num];
const st = state.pg.stats[pool_id][pg_num];
if (st)
{
for (const k in object_counts)


@@ -1,37 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const os = require('os');
function local_ips(all)
{
const ips = [];
const ifaces = os.networkInterfaces();
for (const ifname in ifaces)
{
for (const iface of ifaces[ifname])
{
if (iface.family == 'IPv4' && !iface.internal || all)
{
ips.push(iface.address);
}
}
}
return ips;
}
function b64(str)
{
return Buffer.from(str).toString('base64');
}
function de64(str)
{
return Buffer.from(str, 'base64').toString();
}
module.exports = {
b64,
de64,
local_ips,
};


@@ -1,49 +0,0 @@
// AntiEtcd persistence filter for Vitastor
// (c) Vitaliy Filippov, 2024
// License: Mozilla Public License 2.0 or Vitastor Network Public License 1.1
function vitastor_persist_filter(cfg)
{
const prefix = cfg.vitastor_prefix || '/vitastor';
return (key, value) =>
{
if (key.substr(0, prefix.length+'/osd/stats/'.length) == prefix+'/osd/stats/')
{
if (value)
{
try
{
value = JSON.parse(value);
value = JSON.stringify({
bitmap_granularity: value.bitmap_granularity || undefined,
data_block_size: value.data_block_size || undefined,
host: value.host || undefined,
immediate_commit: value.immediate_commit || undefined,
});
}
catch (e)
{
console.error('invalid JSON in '+key+' = '+value+': '+e);
value = '{}';
}
}
else
{
value = undefined;
}
return value;
}
else if (key.substr(0, prefix.length+'/osd/'.length) == prefix+'/osd/' ||
key.substr(0, prefix.length+'/inode/stats/'.length) == prefix+'/inode/stats/' ||
key.substr(0, prefix.length+'/pg/stats/'.length) == prefix+'/pg/stats/' || // old name
key.substr(0, prefix.length+'/pgstats/'.length) == prefix+'/pgstats/' ||
key.substr(0, prefix.length+'/pool/stats/'.length) == prefix+'/pool/stats/' ||
key == prefix+'/stats')
{
return undefined;
}
return value;
};
}
module.exports = vitastor_persist_filter;
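To illustrate the filter contract (the key and value below are made up): the function returns the value to persist, possibly rewritten, or undefined to skip persisting the key entirely:
```
const vitastor_persist_filter = require('./vitastor_persist_filter.js');
const filter = vitastor_persist_filter({ vitastor_prefix: '/vitastor' });
// OSD stats are stripped down to a few long-lived fields
filter('/vitastor/osd/stats/1', '{"host":"a","free":123}'); // -> '{"host":"a"}'
// purely runtime keys are not persisted at all
filter('/vitastor/stats', '{}'); // -> undefined
```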


@@ -1,80 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "addon.h"
// Initialize the node addon
NAN_MODULE_INIT(InitAddon)
{
// vitastor.Client
v8::Local<v8::FunctionTemplate> tpl = Nan::New<v8::FunctionTemplate>(NodeVitastor::Create);
tpl->SetClassName(Nan::New("Client").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
Nan::SetPrototypeMethod(tpl, "read", NodeVitastor::Read);
Nan::SetPrototypeMethod(tpl, "write", NodeVitastor::Write);
Nan::SetPrototypeMethod(tpl, "sync", NodeVitastor::Sync);
Nan::SetPrototypeMethod(tpl, "read_bitmap", NodeVitastor::ReadBitmap);
//Nan::SetPrototypeMethod(tpl, "destroy", NodeVitastor::Destroy);
Nan::Set(target, Nan::New("Client").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
// vitastor.Image (opened image)
tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorImage::Create);
tpl->SetClassName(Nan::New("Image").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
Nan::SetPrototypeMethod(tpl, "read", NodeVitastorImage::Read);
Nan::SetPrototypeMethod(tpl, "write", NodeVitastorImage::Write);
Nan::SetPrototypeMethod(tpl, "sync", NodeVitastorImage::Sync);
Nan::SetPrototypeMethod(tpl, "get_info", NodeVitastorImage::GetInfo);
Nan::SetPrototypeMethod(tpl, "read_bitmap", NodeVitastorImage::ReadBitmap);
Nan::Set(target, Nan::New("Image").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
// vitastor.KV
tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorKV::Create);
tpl->SetClassName(Nan::New("KV").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
Nan::SetPrototypeMethod(tpl, "open", NodeVitastorKV::Open);
Nan::SetPrototypeMethod(tpl, "set_config", NodeVitastorKV::SetConfig);
Nan::SetPrototypeMethod(tpl, "close", NodeVitastorKV::Close);
Nan::SetPrototypeMethod(tpl, "get_size", NodeVitastorKV::GetSize);
Nan::SetPrototypeMethod(tpl, "get", NodeVitastorKV::Get);
Nan::SetPrototypeMethod(tpl, "get_cached", NodeVitastorKV::GetCached);
Nan::SetPrototypeMethod(tpl, "set", NodeVitastorKV::Set);
Nan::SetPrototypeMethod(tpl, "del", NodeVitastorKV::Del);
Nan::SetPrototypeMethod(tpl, "list", NodeVitastorKV::List);
Nan::Set(target, Nan::New("KV").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
Nan::Set(target, Nan::New("ENOENT").ToLocalChecked(), Nan::New<v8::Int32>(-ENOENT));
Nan::Set(target, Nan::New("EIO").ToLocalChecked(), Nan::New<v8::Int32>(-EIO));
Nan::Set(target, Nan::New("EINVAL").ToLocalChecked(), Nan::New<v8::Int32>(-EINVAL));
Nan::Set(target, Nan::New("EROFS").ToLocalChecked(), Nan::New<v8::Int32>(-EROFS));
Nan::Set(target, Nan::New("ENOSPC").ToLocalChecked(), Nan::New<v8::Int32>(-ENOSPC));
Nan::Set(target, Nan::New("EINTR").ToLocalChecked(), Nan::New<v8::Int32>(-EINTR));
Nan::Set(target, Nan::New("EILSEQ").ToLocalChecked(), Nan::New<v8::Int32>(-EILSEQ));
Nan::Set(target, Nan::New("ENOTBLK").ToLocalChecked(), Nan::New<v8::Int32>(-ENOTBLK));
Nan::Set(target, Nan::New("ENOSYS").ToLocalChecked(), Nan::New<v8::Int32>(-ENOSYS));
Nan::Set(target, Nan::New("EAGAIN").ToLocalChecked(), Nan::New<v8::Int32>(-EAGAIN));
// Listing handle
tpl = Nan::New<v8::FunctionTemplate>(NodeVitastorKVListing::Create);
tpl->SetClassName(Nan::New("KVListing").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
Nan::SetPrototypeMethod(tpl, "next", NodeVitastorKVListing::Next);
Nan::SetPrototypeMethod(tpl, "close", NodeVitastorKVListing::Close);
Nan::Set(target, Nan::New("KVListing").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked());
NodeVitastorKV::listing_class.Reset(Nan::GetFunction(tpl).ToLocalChecked());
}
NODE_MODULE(addon, (void*)InitAddon)
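A hypothetical JS-side view of what this registration exposes; the require path and config keys are assumptions, and the read() signature follows the comment in client.cc below:
```
const vitastor = require('./build/Release/addon.node');
const cli = new vitastor.Client({ etcd_address: '127.0.0.1:2379', etcd_prefix: '/vitastor' });
// read(pool, inode, offset, len, callback(err, buffer, version))
cli.read(1, 1, 0, 4096, (err, buf, version) =>
{
    if (err)
        console.error('read failed: '+err);
});
```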


@@ -1,20 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#ifndef NODE_VITASTOR_ADDON_H
#define NODE_VITASTOR_ADDON_H
#include <nan.h>
#include <vitastor_c.h>
#include "client.h"
#define ERRORF(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
#define TRACEF(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
#define TRACE(msg) fprintf(stderr, "%s\n", msg);
//#define TRACEF(format, arg) ;
//#define TRACE(msg) ;
#endif


@@ -1,21 +0,0 @@
{
'targets': [
{
'target_name': 'addon',
'sources': [
'client.cc',
'addon.cc'
],
'include_dirs': [
'<!(node -e "require(\'nan\')")'
],
'cflags': [
'<!(pkg-config --cflags vitastor) -g'
],
'libraries': [
'<!(pkg-config --libs vitastor)',
'-lvitastor_kv'
]
}
]
}
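Assuming this layout, the addon is built the usual node-gyp way (`node-gyp rebuild`), with pkg-config supplying the compile and link flags of the installed vitastor client library.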


@@ -1,968 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "addon.h"
#define NODE_VITASTOR_READ 1
#define NODE_VITASTOR_WRITE 2
#define NODE_VITASTOR_SYNC 3
#define NODE_VITASTOR_READ_BITMAP 4
#define NODE_VITASTOR_GET_INFO 5
#ifndef INODE_POOL
#define INODE_POOL(inode) (uint32_t)((inode) >> (64 - POOL_ID_BITS))
#define INODE_NO_POOL(inode) (uint64_t)((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))
#define INODE_WITH_POOL(pool_id, inode) (((uint64_t)(pool_id) << (64-POOL_ID_BITS)) | INODE_NO_POOL(inode))
#endif
class NodeVitastorRequest: public Nan::AsyncResource
{
public:
NodeVitastorRequest(NodeVitastor *cli, v8::Local<v8::Function> cb): Nan::AsyncResource("NodeVitastorRequest")
{
this->cli = cli;
callback.Reset(cb);
}
iovec iov;
std::vector<iovec> iov_list;
NodeVitastor *cli = NULL;
NodeVitastorImage *img = NULL;
int op = 0;
uint64_t offset = 0, len = 0, version = 0;
bool with_parents = false;
Nan::Persistent<v8::Function> callback;
};
//////////////////////////////////////////////////
// NodeVitastor
//////////////////////////////////////////////////
NodeVitastor::NodeVitastor(): Nan::ObjectWrap()
{
TRACE("NodeVitastor: constructor");
poll_watcher.data = this;
}
NodeVitastor::~NodeVitastor()
{
TRACE("NodeVitastor: destructor");
uv_poll_stop(&poll_watcher);
vitastor_c_destroy(c);
c = NULL;
}
NAN_METHOD(NodeVitastor::Create)
{
TRACE("NodeVitastor::Create");
v8::Local<v8::Object> jsParams = info[0].As<v8::Object>();
v8::Local<v8::Array> keys = Nan::GetOwnPropertyNames(jsParams).ToLocalChecked();
std::vector<std::string> cfg;
for (uint32_t i = 0; i < keys->Length(); i++)
{
auto key = Nan::Get(keys, i).ToLocalChecked();
cfg.push_back(std::string(*Nan::Utf8String(key)));
cfg.push_back(std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked())));
}
const char **c_cfg = new const char*[cfg.size()];
for (size_t i = 0; i < cfg.size(); i++)
{
c_cfg[i] = cfg[i].c_str();
}
NodeVitastor* cli = new NodeVitastor();
cli->c = vitastor_c_create_uring_json(c_cfg, cfg.size());
delete[] c_cfg;
int res = vitastor_c_uring_register_eventfd(cli->c);
if (res >= 0)
{
cli->eventfd = res;
res = uv_poll_init(uv_default_loop(), &cli->poll_watcher, cli->eventfd);
if (res >= 0)
res = uv_poll_start(&cli->poll_watcher, UV_READABLE, on_io_readable);
on_io_readable(&cli->poll_watcher, 0, UV_READABLE);
}
if (res < 0)
{
ERRORF("NodeVitastor: failed to create and register io_uring eventfd in libuv: %s", strerror(-res));
vitastor_c_destroy(cli->c);
cli->c = NULL;
Nan::ThrowError("failed to create and register io_uring eventfd");
return;
}
cli->Wrap(info.This());
info.GetReturnValue().Set(info.This());
}
void NodeVitastor::on_io_readable(uv_poll_t* handle, int status, int revents)
{
TRACEF("NodeVitastor::on_io_readable status/revents %d %d", status, revents);
if (revents & UV_READABLE)
{
NodeVitastor* self = (NodeVitastor*)handle->data;
{
std::unique_lock<std::mutex> lock(self->mu);
vitastor_c_uring_handle_events(self->c);
}
self->run_postponed();
}
}
void NodeVitastor::run_postponed()
{
std::vector<std::function<void()>> callbacks;
{
std::unique_lock<std::mutex> lock(mu);
callbacks.swap(postponed);
}
for (auto & cb: callbacks)
{
cb();
}
}
NodeVitastorRequest* NodeVitastor::get_read_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
{
uint64_t offset = Nan::To<int64_t>(info[argpos+0]).FromJust();
uint64_t len = Nan::To<int64_t>(info[argpos+1]).FromJust();
uint8_t *buf = (uint8_t*)malloc(len);
if (!buf)
{
Nan::ThrowError("failed to allocate memory");
return NULL;
}
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
auto req = new NodeVitastorRequest(this, callback);
req->offset = offset;
req->len = len;
req->iov = { .iov_base = buf, .iov_len = len };
return req;
}
// read(pool, inode, offset, len, callback(err, buffer, version))
NAN_METHOD(NodeVitastor::Read)
{
TRACE("NodeVitastor::Read");
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
uint64_t pool = Nan::To<int64_t>(info[0]).FromJust();
uint64_t inode = Nan::To<int64_t>(info[1]).FromJust();
auto req = self->get_read_request(info, 2);
self->Ref();
{
std::unique_lock<std::mutex> lock(self->mu);
vitastor_c_read(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, &req->iov, 1, postpone_read_finish, req);
}
self->run_postponed();
}
NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
{
uint64_t offset = Nan::To<int64_t>(info[argpos+0]).FromJust();
const auto & bufarg = info[argpos+1];
uint64_t version = 0;
if (!info[argpos+2].IsEmpty() && info[argpos+2]->IsObject())
{
auto key = Nan::New<v8::String>("version").ToLocalChecked();
auto params = info[argpos+2].As<v8::Object>();
auto versionObj = Nan::Get(params, key).ToLocalChecked();
if (!versionObj.IsEmpty())
version = Nan::To<int64_t>(versionObj).FromJust();
argpos++;
}
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
auto req = new NodeVitastorRequest(this, callback);
req->offset = offset;
req->version = version;
if (bufarg->IsArray())
{
auto buffers = bufarg.As<v8::Array>();
req->len = 0;
for (uint32_t i = 0; i < buffers->Length(); i++)
{
auto buffer_obj = Nan::Get(buffers, i).ToLocalChecked();
char *buf = node::Buffer::Data(buffer_obj);
uint64_t len = node::Buffer::Length(buffer_obj);
req->iov_list.push_back({ .iov_base = buf, .iov_len = len });
req->len += len;
}
}
else
{
char *buf = node::Buffer::Data(bufarg);
uint64_t len = node::Buffer::Length(bufarg);
req->iov = { .iov_base = buf, .iov_len = len };
req->len = len;
}
return req;
}
// write(pool, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
NAN_METHOD(NodeVitastor::Write)
{
TRACE("NodeVitastor::Write");
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
uint64_t pool = Nan::To<int64_t>(info[0]).FromJust();
uint64_t inode = Nan::To<int64_t>(info[1]).FromJust();
auto req = self->get_write_request(info, 2);
self->Ref();
{
std::unique_lock<std::mutex> lock(self->mu);
vitastor_c_write(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, req->version,
req->iov_list.size() ? req->iov_list.data() : &req->iov,
req->iov_list.size() ? req->iov_list.size() : 1,
postpone_write_finish, req);
}
self->run_postponed();
}
// sync(callback(err))
NAN_METHOD(NodeVitastor::Sync)
{
TRACE("NodeVitastor::Sync");
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(self, callback);
self->Ref();
{
std::unique_lock<std::mutex> lock(self->mu);
vitastor_c_sync(self->c, postpone_write_finish, req);
}
self->run_postponed();
}
// read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))
NAN_METHOD(NodeVitastor::ReadBitmap)
{
TRACE("NodeVitastor::ReadBitmap");
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
uint64_t pool = Nan::To<int64_t>(info[0]).FromJust();
uint64_t inode = Nan::To<int64_t>(info[1]).FromJust();
uint64_t offset = Nan::To<int64_t>(info[2]).FromJust();
uint64_t len = Nan::To<int64_t>(info[3]).FromJust();
bool with_parents = Nan::To<bool>(info[4]).FromJust();
v8::Local<v8::Function> callback = info[5].As<v8::Function>();
auto req = new NodeVitastorRequest(self, callback);
self->Ref();
{
std::unique_lock<std::mutex> lock(self->mu);
vitastor_c_read_bitmap(self->c, ((pool << (64-POOL_ID_BITS)) | inode), offset, len, with_parents, postpone_read_bitmap_finish, req);
}
self->run_postponed();
}
static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long retval)
{
// Legal errors: EINVAL, EIO, EROFS, ENOSPC, EINTR, ENOENT
v8::Local<v8::Value> args[1];
if (!retval)
args[0] = Nan::Null();
else
args[0] = Nan::New<v8::Int32>((int32_t)retval);
nanCallback.Call(1, args, req);
}
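// Note: on success the malloc()ed read buffer is handed to Nan::NewBuffer(),
// which takes ownership and frees it later; on error paths it has to be freed here.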
void NodeVitastor::on_read_finish(void *opaque, long retval, uint64_t version)
{
TRACE("NodeVitastor::on_read_finish");
Nan::HandleScope scope;
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
Nan::Callback nanCallback(Nan::New(req->callback));
if (retval == -ENOENT)
{
free(req->iov.iov_base);
nanCallback.Call(0, NULL, req);
}
else if (retval < 0)
{
free(req->iov.iov_base);
on_error(req, nanCallback, retval);
}
else
{
v8::Local<v8::Value> args[3];
args[0] = Nan::Null();
args[1] = Nan::NewBuffer((char*)req->iov.iov_base, req->iov.iov_len).ToLocalChecked();
args[2] = v8::BigInt::NewFromUnsigned(v8::Isolate::GetCurrent(), version);
nanCallback.Call(3, args, req);
}
req->cli->Unref();
delete req;
}
void NodeVitastor::on_write_finish(void *opaque, long retval)
{
TRACE("NodeVitastor::on_write_finish");
Nan::HandleScope scope;
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
Nan::Callback nanCallback(Nan::New(req->callback));
on_error(req, nanCallback, retval);
req->cli->Unref();
delete req;
}
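// On success retval is the number of bitmap bits, so (retval+7)/8 below
// rounds it up to the number of bytes in the returned Buffer.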
void NodeVitastor::on_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap)
{
TRACE("NodeVitastor::on_read_bitmap_finish");
Nan::HandleScope scope;
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
Nan::Callback nanCallback(Nan::New(req->callback));
if (retval == -ENOENT)
nanCallback.Call(0, NULL, req);
else if (retval < 0)
on_error(req, nanCallback, retval);
else
{
v8::Local<v8::Value> args[2];
args[0] = Nan::Null();
args[1] = Nan::NewBuffer((char*)bitmap, (retval+7)/8).ToLocalChecked();
nanCallback.Call(2, args, req);
}
req->cli->Unref();
delete req;
}
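// The postpone_* trampolines below only queue the real handlers into
// cli->postponed, so they are safe to call with cli->mu held, unlike the
// on_*_finish handlers above, which call back into JS.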
void NodeVitastor::postpone_read_finish(void *opaque, long retval, uint64_t version)
{
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
req->cli->postponed.push_back([=]() { on_read_finish(opaque, retval, version); });
}
void NodeVitastor::postpone_write_finish(void *opaque, long retval)
{
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
req->cli->postponed.push_back([=]() { on_write_finish(opaque, retval); });
}
void NodeVitastor::postpone_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap)
{
NodeVitastorRequest *req = (NodeVitastorRequest *)opaque;
req->cli->postponed.push_back([=]() { on_read_bitmap_finish(opaque, retval, bitmap); });
}
//NAN_METHOD(NodeVitastor::Destroy)
//{
// TRACE("NodeVitastor::Destroy");
//}
//////////////////////////////////////////////////
// NodeVitastorImage
//////////////////////////////////////////////////
NAN_METHOD(NodeVitastorImage::Create)
{
TRACE("NodeVitastorImage::Create");
v8::Local<v8::Object> parent = info[0].As<v8::Object>();
std::string name = std::string(*Nan::Utf8String(info[1].As<v8::String>()));
NodeVitastor *cli = Nan::ObjectWrap::Unwrap<NodeVitastor>(parent);
NodeVitastorImage *img = new NodeVitastorImage();
img->Wrap(info.This());
img->cli = cli;
img->name = name;
img->Ref();
cli->Ref();
std::unique_lock<std::mutex> lock(cli->mu);
vitastor_c_watch_inode(cli->c, (char*)img->name.c_str(), on_watch_start, img);
info.GetReturnValue().Set(info.This());
}
NodeVitastorImage::~NodeVitastorImage()
{
if (watch)
{
std::unique_lock<std::mutex> lock(cli->mu);
vitastor_c_close_watch(cli->c, watch);
watch = NULL;
}
cli->Unref();
}
// read(offset, len, callback(err, buffer, version))
NAN_METHOD(NodeVitastorImage::Read)
{
TRACE("NodeVitastorImage::Read");
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
auto req = img->cli->get_read_request(info, 0);
if (!req)
return;
req->img = img;
req->op = NODE_VITASTOR_READ;
img->exec_or_wait(req);
}
// write(offset, buffer, { version }?, callback(err))
NAN_METHOD(NodeVitastorImage::Write)
{
TRACE("NodeVitastorImage::Write");
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
auto req = img->cli->get_write_request(info, 0);
req->img = img;
req->op = NODE_VITASTOR_WRITE;
img->exec_or_wait(req);
}
NAN_METHOD(NodeVitastorImage::Sync)
{
TRACE("NodeVitastorImage::Sync");
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(img->cli, callback);
req->img = img;
req->op = NODE_VITASTOR_SYNC;
img->exec_or_wait(req);
}
// read_bitmap(offset, len, with_parents, callback(err, bitmap_buffer))
NAN_METHOD(NodeVitastorImage::ReadBitmap)
{
TRACE("NodeVitastorImage::ReadBitmap");
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
uint64_t offset = Nan::To<int64_t>(info[0]).FromJust();
uint64_t len = Nan::To<int64_t>(info[1]).FromJust();
bool with_parents = Nan::To<bool>(info[2]).FromJust();
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
auto req = new NodeVitastorRequest(img->cli, callback);
req->img = img;
req->op = NODE_VITASTOR_READ_BITMAP;
req->offset = offset;
req->len = len;
req->with_parents = with_parents;
img->exec_or_wait(req);
}
NAN_METHOD(NodeVitastorImage::GetInfo)
{
TRACE("NodeVitastorImage::GetInfo");
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(img->cli, callback);
req->img = img;
req->op = NODE_VITASTOR_GET_INFO;
img->exec_or_wait(req);
}
void NodeVitastorImage::exec_or_wait(NodeVitastorRequest *req)
{
if (!watch)
{
// Need to wait for initialisation
on_init.push_back(req);
}
else
{
exec_request(req);
}
}
void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
{
if (req->op == NODE_VITASTOR_READ)
{
cli->Ref();
std::unique_lock<std::mutex> lock(cli->mu);
uint64_t ino = vitastor_c_inode_get_num(watch);
vitastor_c_read(cli->c, ino, req->offset, req->len, &req->iov, 1, NodeVitastor::postpone_read_finish, req);
}
else if (req->op == NODE_VITASTOR_WRITE)
{
cli->Ref();
std::unique_lock<std::mutex> lock(cli->mu);
uint64_t ino = vitastor_c_inode_get_num(watch);
vitastor_c_write(cli->c, ino, req->offset, req->len, req->version,
req->iov_list.size() ? req->iov_list.data() : &req->iov,
req->iov_list.size() ? req->iov_list.size() : 1,
NodeVitastor::postpone_write_finish, req);
}
else if (req->op == NODE_VITASTOR_SYNC)
{
cli->Ref();
std::unique_lock<std::mutex> lock(cli->mu);
uint64_t ino = vitastor_c_inode_get_num(watch);
uint32_t imm = vitastor_c_inode_get_immediate_commit(cli->c, ino);
if (imm != IMMEDIATE_ALL)
{
vitastor_c_sync(cli->c, NodeVitastor::postpone_write_finish, req);
}
else
{
NodeVitastor::postpone_write_finish(req, 0);
}
}
else if (req->op == NODE_VITASTOR_READ_BITMAP)
{
cli->Ref();
std::unique_lock<std::mutex> lock(cli->mu);
uint64_t ino = vitastor_c_inode_get_num(watch);
vitastor_c_read_bitmap(cli->c, ino, req->offset, req->len, req->with_parents, NodeVitastor::postpone_read_bitmap_finish, req);
}
else if (req->op == NODE_VITASTOR_GET_INFO)
{
Nan::HandleScope scope;
v8::Local<v8::Object> res = Nan::New<v8::Object>();
fill_info(res);
Nan::Callback nanCallback(Nan::New(req->callback));
v8::Local<v8::Value> args[1];
args[0] = res;
nanCallback.Call(1, args, req);
delete req;
}
cli->run_postponed();
}
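// JS Numbers can only represent integers exactly up to 2^53-1, so fields
// that may exceed that (size, mod_revision) are returned as BigInt below.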
void NodeVitastorImage::fill_info(v8::Local<v8::Object> & res)
{
std::unique_lock<std::mutex> lock(cli->mu);
uint64_t size = vitastor_c_inode_get_size(watch);
uint64_t num = vitastor_c_inode_get_num(watch);
uint32_t block_size = vitastor_c_inode_get_block_size(cli->c, num);
uint32_t bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(cli->c, num);
int readonly = vitastor_c_inode_get_readonly(watch);
uint32_t immediate_commit = vitastor_c_inode_get_immediate_commit(cli->c, num);
uint64_t parent_id = vitastor_c_inode_get_parent_id(watch);
char *meta = vitastor_c_inode_get_meta(watch);
uint64_t mod_revision = vitastor_c_inode_get_mod_revision(watch);
Nan::Set(res, Nan::New<v8::String>("pool_id").ToLocalChecked(), Nan::New<v8::Number>(INODE_POOL(num)));
Nan::Set(res, Nan::New<v8::String>("inode_num").ToLocalChecked(), Nan::New<v8::Number>(INODE_NO_POOL(num)));
if (size < ((uint64_t)1<<53))
Nan::Set(res, Nan::New<v8::String>("size").ToLocalChecked(), Nan::New<v8::Number>(size));
else
Nan::Set(res, Nan::New<v8::String>("size").ToLocalChecked(), v8::BigInt::NewFromUnsigned(v8::Isolate::GetCurrent(), size));
if (parent_id)
{
Nan::Set(res, Nan::New<v8::String>("parent_pool_id").ToLocalChecked(), Nan::New<v8::Number>(INODE_POOL(parent_id)));
Nan::Set(res, Nan::New<v8::String>("parent_inode_num").ToLocalChecked(), Nan::New<v8::Number>(INODE_NO_POOL(parent_id)));
}
Nan::Set(res, Nan::New<v8::String>("readonly").ToLocalChecked(), Nan::New((bool)readonly));
if (meta)
{
Nan::JSON nanJSON;
Nan::Set(res, Nan::New<v8::String>("meta").ToLocalChecked(), nanJSON.Parse(Nan::New<v8::String>(meta).ToLocalChecked()).ToLocalChecked());
}
if (mod_revision < ((uint64_t)1<<53))
Nan::Set(res, Nan::New<v8::String>("mod_revision").ToLocalChecked(), Nan::New<v8::Number>(mod_revision));
else
Nan::Set(res, Nan::New<v8::String>("mod_revision").ToLocalChecked(), v8::BigInt::NewFromUnsigned(v8::Isolate::GetCurrent(), mod_revision));
Nan::Set(res, Nan::New<v8::String>("block_size").ToLocalChecked(), Nan::New(block_size));
Nan::Set(res, Nan::New<v8::String>("bitmap_granularity").ToLocalChecked(), Nan::New(bitmap_granularity));
Nan::Set(res, Nan::New<v8::String>("immediate_commit").ToLocalChecked(), Nan::New(immediate_commit));
}
void NodeVitastorImage::on_watch_start(void *opaque, long retval)
{
NodeVitastorImage *img = (NodeVitastorImage *)opaque;
{
img->watch = (void*)retval;
auto on_init = std::move(img->on_init);
for (auto req: on_init)
{
img->exec_request(req);
}
}
img->Unref();
}
//////////////////////////////////////////////////
// NodeVitastorKV
//////////////////////////////////////////////////
Nan::Persistent<v8::Function> NodeVitastorKV::listing_class;
// constructor(node_vitastor)
NAN_METHOD(NodeVitastorKV::Create)
{
TRACE("NodeVitastorKV::Create");
v8::Local<v8::Object> parent = info[0].As<v8::Object>();
NodeVitastor *cli = Nan::ObjectWrap::Unwrap<NodeVitastor>(parent);
NodeVitastorKV *kv = new NodeVitastorKV();
kv->cli = cli;
{
std::unique_lock<std::mutex> lock(cli->mu);
kv->dbw = new vitastorkv_dbw_t((cluster_client_t*)vitastor_c_get_internal_client(cli->c));
}
kv->Wrap(info.This());
cli->Ref();
info.GetReturnValue().Set(info.This());
}
NodeVitastorKV::~NodeVitastorKV()
{
delete dbw;
cli->Unref();
}
// open(pool_id, inode_num, { ...config }, callback(err))
NAN_METHOD(NodeVitastorKV::Open)
{
TRACE("NodeVitastorKV::Open");
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
uint64_t inode_id = INODE_WITH_POOL(Nan::To<int64_t>(info[0]).FromJust(), Nan::To<int64_t>(info[1]).FromJust());
v8::Local<v8::Object> jsParams = info[2].As<v8::Object>();
v8::Local<v8::Array> keys = Nan::GetOwnPropertyNames(jsParams).ToLocalChecked();
std::map<std::string, std::string> cfg;
for (uint32_t i = 0; i < keys->Length(); i++)
{
auto key = Nan::Get(keys, i).ToLocalChecked();
cfg[std::string(*Nan::Utf8String(key))] = std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked()));
}
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
auto req = new NodeVitastorRequest(kv->cli, callback);
kv->Ref();
kv->dbw->open(inode_id, cfg, [kv, req](int res)
{
Nan::HandleScope scope;
Nan::Callback nanCallback(Nan::New(req->callback));
v8::Local<v8::Value> args[1];
args[0] = !res ? v8::Local<v8::Value>(Nan::Null()) : v8::Local<v8::Value>(Nan::New<v8::Int32>(res));
nanCallback.Call(1, args, req);
delete req;
kv->Unref();
});
}
// close(callback(err))
NAN_METHOD(NodeVitastorKV::Close)
{
TRACE("NodeVitastorKV::Close");
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(kv->cli, callback);
kv->Ref();
kv->dbw->close([kv, req]()
{
Nan::HandleScope scope;
Nan::Callback nanCallback(Nan::New(req->callback));
nanCallback.Call(0, NULL, req);
delete req;
kv->Unref();
});
}
// set_config({ ...config })
NAN_METHOD(NodeVitastorKV::SetConfig)
{
TRACE("NodeVitastorKV::SetConfig");
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
v8::Local<v8::Object> jsParams = info[0].As<v8::Object>();
v8::Local<v8::Array> keys = Nan::GetOwnPropertyNames(jsParams).ToLocalChecked();
std::map<std::string, std::string> cfg;
for (uint32_t i = 0; i < keys->Length(); i++)
{
auto key = Nan::Get(keys, i).ToLocalChecked();
cfg[std::string(*Nan::Utf8String(key))] = std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked()));
}
kv->dbw->set_config(cfg);
}
// get_size()
NAN_METHOD(NodeVitastorKV::GetSize)
{
TRACE("NodeVitastorKV::GetSize");
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
auto size = kv->dbw->get_size();
info.GetReturnValue().Set((size < ((uint64_t)1<<53))
? v8::Local<v8::Value>(Nan::New<v8::Number>(size))
: v8::Local<v8::Value>(v8::BigInt::NewFromUnsigned(info.GetIsolate(), size)));
}
void NodeVitastorKV::get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info, bool allow_cache)
{
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
// FIXME: Handle Buffer too
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
v8::Local<v8::Function> callback = info[1].As<v8::Function>();
auto req = new NodeVitastorRequest(kv->cli, callback);
kv->Ref();
kv->dbw->get(key, [kv, req](int res, const std::string & value)
{
Nan::HandleScope scope;
Nan::Callback nanCallback(Nan::New(req->callback));
v8::Local<v8::Value> args[2];
args[0] = !res ? v8::Local<v8::Value>(Nan::Null()) : v8::Local<v8::Value>(Nan::New<v8::Int32>(res));
args[1] = !res ? v8::Local<v8::Value>(Nan::New<v8::String>(value).ToLocalChecked()) : v8::Local<v8::Value>(Nan::Null());
nanCallback.Call(2, args, req);
delete req;
kv->Unref();
}, allow_cache);
}
// get(key, callback(err, value))
NAN_METHOD(NodeVitastorKV::Get)
{
TRACE("NodeVitastorKV::Get");
get_impl(info, false);
}
// get_cached(key, callback(err, value))
NAN_METHOD(NodeVitastorKV::GetCached)
{
TRACE("NodeVitastorKV::GetCached");
get_impl(info, true);
}
static std::function<bool(int, const std::string &)> make_cas_callback(NodeVitastorRequest *cas_req)
{
return [cas_req](int res, const std::string & value)
{
Nan::HandleScope scope;
Nan::Callback nanCallback(Nan::New(cas_req->callback));
v8::Local<v8::Value> args[1];
args[0] = !res ? v8::Local<v8::Value>(Nan::New<v8::String>(value).ToLocalChecked()) : v8::Local<v8::Value>(Nan::Null());
Nan::MaybeLocal<v8::Value> ret = nanCallback.Call(1, args, cas_req);
if (ret.IsEmpty())
return false;
return Nan::To<bool>(ret.ToLocalChecked()).FromJust();
};
}
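// The cas_compare callback receives the current value and must return a
// boolean: true lets the set/del proceed, false aborts it. An empty result
// (for example, a thrown exception) is treated as false above.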
// set(key, value, callback(err), cas_compare(old_value))
NAN_METHOD(NodeVitastorKV::Set)
{
TRACE("NodeVitastorKV::Set");
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
// FIXME: Handle Buffer too
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
std::string value(*Nan::Utf8String(info[1].As<v8::String>()));
v8::Local<v8::Function> callback = info[2].As<v8::Function>();
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
std::function<bool(int, const std::string &)> cas_cb;
if (info.Length() > 3 && info[3]->IsObject())
{
v8::Local<v8::Function> cas_callback = info[3].As<v8::Function>();
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
cas_cb = make_cas_callback(cas_req);
}
kv->Ref();
kv->dbw->set(key, value, [kv, req, cas_req](int res)
{
Nan::HandleScope scope;
Nan::Callback nanCallback(Nan::New(req->callback));
v8::Local<v8::Value> args[1];
args[0] = !res ? v8::Local<v8::Value>(Nan::Null()) : v8::Local<v8::Value>(Nan::New<v8::Int32>(res));
nanCallback.Call(1, args, req);
delete req;
if (cas_req)
delete cas_req;
kv->Unref();
}, cas_cb);
}
// del(key, callback(err), cas_compare(old_value))
NAN_METHOD(NodeVitastorKV::Del)
{
TRACE("NodeVitastorKV::Del");
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
// FIXME: Handle Buffer too
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
v8::Local<v8::Function> callback = info[1].As<v8::Function>();
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
std::function<bool(int, const std::string &)> cas_cb;
if (info.Length() > 2 && info[2]->IsObject())
{
v8::Local<v8::Function> cas_callback = info[2].As<v8::Function>();
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
cas_cb = make_cas_callback(cas_req);
}
kv->Ref();
kv->dbw->del(key, [kv, req, cas_req](int res)
{
Nan::HandleScope scope;
Nan::Callback nanCallback(Nan::New(req->callback));
v8::Local<v8::Value> args[1];
args[0] = !res ? v8::Local<v8::Value>(Nan::Null()) : v8::Local<v8::Value>(Nan::New<v8::Int32>(res));
nanCallback.Call(1, args, req);
delete req;
if (cas_req)
delete cas_req;
kv->Unref();
}, cas_cb);
}
// list(start_key?)
NAN_METHOD(NodeVitastorKV::List)
{
TRACE("NodeVitastorKV::List");
v8::Local<v8::Function> cons = Nan::New(listing_class);
v8::Local<v8::Value> args[2];
args[0] = info.This();
int narg = 1;
if (info.Length() > 1 && info[1]->IsString())
{
args[1] = info[1];
narg = 2;
}
info.GetReturnValue().Set(Nan::NewInstance(cons, narg, args).ToLocalChecked());
}
/*NAN_METHOD(NodeVitastorKV::Destroy)
{
TRACE("NodeVitastorKV::Destroy");
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
std::unique_lock<std::mutex> lock(kv->cli->mu);
if (!kv->dead)
kv->Unref();
}*/
//////////////////////////////////////////////////
// NodeVitastorKVListing
//////////////////////////////////////////////////
// constructor(node_vitastor_kv, start_key?)
NAN_METHOD(NodeVitastorKVListing::Create)
{
TRACE("NodeVitastorKVListing::Create");
v8::Local<v8::Object> parent = info[0].As<v8::Object>();
NodeVitastorKV *kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(parent);
std::string start_key;
// FIXME: Handle Buffer too
if (info.Length() > 1 && info[1]->IsString())
{
start_key = std::string(*Nan::Utf8String(info[1].As<v8::String>()));
}
NodeVitastorKVListing *list = new NodeVitastorKVListing();
list->kv = kv;
{
std::unique_lock<std::mutex> lock(kv->cli->mu);
list->handle = list->kv->dbw->list_start(start_key);
}
list->Wrap(info.This());
kv->Ref();
info.GetReturnValue().Set(info.This());
}
NodeVitastorKVListing::~NodeVitastorKVListing()
{
if (handle)
{
std::unique_lock<std::mutex> lock(kv->cli->mu);
kv->dbw->list_close(handle);
handle = NULL;
}
kv->Unref();
}
// next(callback(err, value))
NAN_METHOD(NodeVitastorKVListing::Next)
{
TRACE("NodeVitastorKVListing::Next");
NodeVitastorKVListing* list = Nan::ObjectWrap::Unwrap<NodeVitastorKVListing>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(list->kv->cli, callback);
if (!list->handle)
{
// Already closed
Nan::Callback nanCallback(Nan::New(req->callback));
v8::Local<v8::Value> args[1];
args[0] = Nan::New<v8::Int32>(-EINVAL);
nanCallback.Call(1, args, req);
delete req;
return;
}
list->kv->Ref();
list->kv->dbw->list_next(list->handle, [list, req](int res, const std::string & key, const std::string & value)
{
Nan::HandleScope scope;
Nan::Callback nanCallback(Nan::New(req->callback));
v8::Local<v8::Value> args[3];
args[0] = Nan::New<v8::Int32>(res);
args[1] = !res ? v8::Local<v8::Value>(Nan::New<v8::String>(key).ToLocalChecked()) : v8::Local<v8::Value>(Nan::Null());
args[2] = !res ? v8::Local<v8::Value>(Nan::New<v8::String>(value).ToLocalChecked()) : v8::Local<v8::Value>(Nan::Null());
nanCallback.Call(3, args, req);
delete req;
list->kv->Unref();
});
}
// close()
NAN_METHOD(NodeVitastorKVListing::Close)
{
TRACE("NodeVitastorKVListing::Close");
NodeVitastorKVListing* list = Nan::ObjectWrap::Unwrap<NodeVitastorKVListing>(info.This());
if (list->handle)
{
std::unique_lock<std::mutex> lock(list->kv->cli->mu);
list->kv->dbw->list_close(list->handle);
list->handle = NULL;
}
}
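// Thread-safety summary (inferred from the code above): every call into
// vitastor_c_* is made under cli->mu, and every libvitastor completion is
// postponed through cli->postponed, so V8 is only ever entered from the
// libuv loop thread.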

View File

@@ -1,146 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#ifndef NODE_VITASTOR_CLIENT_H
#define NODE_VITASTOR_CLIENT_H
#include <mutex>
#include <nan.h>
#include <vitastor_c.h>
#include <vitastor_kv.h>
class NodeVitastorRequest;
class NodeVitastor: public Nan::ObjectWrap
{
public:
// constructor({ ...config })
static NAN_METHOD(Create);
// read(pool, inode, offset, len, callback(err, buffer, version))
static NAN_METHOD(Read);
// write(pool, inode, offset, buf: Buffer | Buffer[], { version }?, callback(err))
static NAN_METHOD(Write);
// sync(callback(err))
static NAN_METHOD(Sync);
// read_bitmap(pool, inode, offset, len, with_parents, callback(err, bitmap_buffer))
static NAN_METHOD(ReadBitmap);
// // destroy()
// static NAN_METHOD(Destroy);
~NodeVitastor();
private:
vitastor_c *c = NULL;
int eventfd = -1;
uv_poll_t poll_watcher;
// FIXME: Is it really needed?
std::mutex mu;
std::vector<std::function<void()>> postponed;
NodeVitastor();
static void on_io_readable(uv_poll_t* handle, int status, int revents);
static void on_read_finish(void *opaque, long retval, uint64_t version);
static void on_write_finish(void *opaque, long retval);
static void on_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap);
void run_postponed();
static void postpone_read_finish(void *opaque, long retval, uint64_t version);
static void postpone_write_finish(void *opaque, long retval);
static void postpone_read_bitmap_finish(void *opaque, long retval, uint8_t *bitmap);
NodeVitastorRequest* get_read_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos);
NodeVitastorRequest* get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos);
friend class NodeVitastorImage;
friend class NodeVitastorKV;
friend class NodeVitastorKVListing;
};
class NodeVitastorImage: public Nan::ObjectWrap
{
public:
// constructor(node_vitastor, name)
static NAN_METHOD(Create);
// read(offset, len, callback(err, buffer, version))
static NAN_METHOD(Read);
// write(offset, buf: Buffer | Buffer[], { version }?, callback(err))
static NAN_METHOD(Write);
// sync(callback(err))
static NAN_METHOD(Sync);
// read_bitmap(offset, len, with_parents, callback(err, bitmap_buffer))
static NAN_METHOD(ReadBitmap);
// get_info(callback({ num, name, size, parent_id?, readonly?, meta?, mod_revision, block_size, bitmap_granularity, immediate_commit }))
static NAN_METHOD(GetInfo);
~NodeVitastorImage();
private:
NodeVitastor *cli = NULL;
std::string name;
void *watch = NULL;
std::vector<NodeVitastorRequest*> on_init;
Nan::Persistent<v8::Object> cliObj;
static void on_watch_start(void *opaque, long retval);
void exec_request(NodeVitastorRequest *req);
void exec_or_wait(NodeVitastorRequest *req);
void fill_info(v8::Local<v8::Object> & res);
};
class NodeVitastorKV: public Nan::ObjectWrap
{
public:
// constructor(node_vitastor)
static NAN_METHOD(Create);
// open(pool_id, inode_num, { ...config }, callback(err))
static NAN_METHOD(Open);
// set_config({ ...config })
static NAN_METHOD(SetConfig);
// close(callback())
static NAN_METHOD(Close);
// get_size()
static NAN_METHOD(GetSize);
// get(key, callback(err, value))
static NAN_METHOD(Get);
// get_cached(key, callback(err, value))
static NAN_METHOD(GetCached);
// set(key, value, callback(err), cas_compare(old_value))
static NAN_METHOD(Set);
// del(key, callback(err), cas_compare(old_value))
static NAN_METHOD(Del);
// list(start_key?)
static NAN_METHOD(List);
~NodeVitastorKV();
static Nan::Persistent<v8::Function> listing_class;
private:
NodeVitastor *cli = NULL;
vitastorkv_dbw_t *dbw = NULL;
static void get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info, bool allow_cache);
friend class NodeVitastorKVListing;
};
class NodeVitastorKVListing: public Nan::ObjectWrap
{
public:
// constructor(node_vitastor_kv, start_key?)
static NAN_METHOD(Create);
// next(callback(err, value))
static NAN_METHOD(Next);
// close()
static NAN_METHOD(Close);
~NodeVitastorKVListing();
private:
NodeVitastorKV *kv = NULL;
void *handle = NULL;
};
#endif

View File

@@ -1 +0,0 @@
module.exports = require('bindings')('addon.node');

View File

@@ -1,24 +0,0 @@
{
"name": "vitastor",
"version": "1.7.0",
"description": "Low-level native bindings to Vitastor client library",
"main": "index.js",
"keywords": [
"storage",
"sds",
"vitastor"
],
"repository": {
"type": "git",
"url": "git://git.yourcmc.ru/vitalif/vitastor.git"
},
"scripts": {
"build": "node-gyp rebuild"
},
"author": "Vitaliy Filippov",
"license": "VNPL-2.0",
"dependencies": {
"bindings": "1.5.0",
"nan": "^2.19.0"
}
}

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VITASTOR_VERSION = '1.7.1'
VERSION = '1.6.1'
LOG = logging.getLogger(__name__)
@@ -238,7 +238,7 @@ class VitastorDriver(driver.CloneableImageVD,
stats = {
'vendor_name': 'Vitastor',
'driver_version': VITASTOR_VERSION,
'driver_version': self.VERSION,
'storage_protocol': 'vitastor',
'total_capacity_gb': 'unknown',
'free_capacity_gb': 'unknown',

View File

@@ -71,7 +71,7 @@ index c9baeda639..85e1df5a56 100644
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index ca390c5700..d2dbaeb279 100644
index ca390c5700..8f11ae9fa5 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3201,7 +3201,7 @@
@@ -120,7 +120,7 @@ index ca390c5700..d2dbaeb279 100644
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5148,6 +5171,20 @@
@@ -5148,6 +5171,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
@@ -129,9 +129,6 @@ index ca390c5700..d2dbaeb279 100644
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
@@ -141,7 +138,7 @@ index ca390c5700..d2dbaeb279 100644
##
# @BlockdevVmdkSubformat:
#
@@ -5370,6 +5407,7 @@
@@ -5370,6 +5404,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',

View File

@@ -71,7 +71,7 @@ index 0a99a059ec..16dc440118 100644
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 746d1694c2..199a146a0b 100644
index 746d1694c2..fb7aa4423b 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3203,7 +3203,7 @@
@@ -120,7 +120,7 @@ index 746d1694c2..199a146a0b 100644
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5180,6 +5203,20 @@
@@ -5180,6 +5203,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
@@ -129,9 +129,6 @@ index 746d1694c2..199a146a0b 100644
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
@@ -141,7 +138,7 @@ index 746d1694c2..199a146a0b 100644
##
# @BlockdevVmdkSubformat:
#
@@ -5402,6 +5439,7 @@
@@ -5402,6 +5436,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',

View File

@@ -18,11 +18,10 @@ fi
cd ~/rpmbuild/SPECS
rpmbuild -bp fio.spec
cd $VITASTOR
VER=$(grep ^Version: rpm/vitastor-el7.spec | awk '{print $2}')
ln -s ~/rpmbuild/BUILD/fio*/ fio
sh copy-fio-includes.sh
rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.6.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.6.1$(rpm --eval '%dist').tar.gz *

View File

@@ -36,8 +36,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el7.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.6.1.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.7.1
Version: 1.6.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.7.1.el7.tar.gz
Source0: vitastor-1.6.1.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -144,8 +144,6 @@ mkdir -p /etc/vitastor
groupadd -r -f vitastor 2>/dev/null ||:
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
mkdir -p /etc/vitastor
mkdir -p /var/lib/vitastor
chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
@@ -163,7 +161,6 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client-devel
%_includedir/vitastor_c.h
%_includedir/vitastor_kv.h
%_libdir/pkgconfig

View File

@@ -35,8 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el8.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.6.1.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.7.1
Version: 1.6.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.7.1.el8.tar.gz
Source0: vitastor-1.6.1.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -141,8 +141,6 @@ mkdir -p /etc/vitastor
groupadd -r -f vitastor 2>/dev/null ||:
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
mkdir -p /etc/vitastor
mkdir -p /var/lib/vitastor
chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
@@ -160,7 +158,6 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client-devel
%_includedir/vitastor_c.h
%_includedir/vitastor_kv.h
%_libdir/pkgconfig

View File

@@ -18,8 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el9.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.6.1.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.7.1
Version: 1.6.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.7.1.el9.tar.gz
Source0: vitastor-1.6.1.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -134,8 +134,6 @@ mkdir -p /etc/vitastor
groupadd -r -f vitastor 2>/dev/null ||:
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
mkdir -p /etc/vitastor
mkdir -p /var/lib/vitastor
chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
@@ -153,7 +151,6 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client-devel
%_includedir/vitastor_c.h
%_includedir/vitastor_kv.h
%_libdir/pkgconfig

View File

@@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVITASTOR_VERSION="1.7.1")
add_definitions(-DVERSION="1.6.1")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

View File

@@ -13,7 +13,7 @@ target_link_libraries(vitastor_blk
# for timerfd_manager
vitastor_common
)
set_target_properties(vitastor_blk PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
set_target_properties(vitastor_blk PROPERTIES VERSION ${VERSION} SOVERSION 0)
if (${WITH_FIO})
# libfio_vitastor_blk.so

View File

@@ -12,7 +12,6 @@ add_library(vitastor_common STATIC
msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ../util/ringloop.cpp ../../json11/json11.cpp
http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ${MSGR_RDMA}
)
target_link_libraries(vitastor_common pthread)
target_compile_options(vitastor_common PUBLIC -fPIC)
# libvitastor_client.so
@@ -29,7 +28,7 @@ target_link_libraries(vitastor_client
${LIBURING_LIBRARIES}
${IBVERBS_LIBRARIES}
)
set_target_properties(vitastor_client PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
set_target_properties(vitastor_client PROPERTIES VERSION ${VERSION} SOVERSION 0)
configure_file(vitastor.pc.in vitastor.pc @ONLY)
if (${WITH_FIO})

View File

@@ -452,10 +452,11 @@ void cluster_client_t::on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_nu
if (pg_cfg.cur_primary != prev_primary)
{
// Repeat this PG's operations, because an OSD which stopped being primary may not have fsynced them
wb->repeat_ops_for(this, 0, pool_id, pg_num);
if (wb->repeat_ops_for(this, 0, pool_id, pg_num) > 0)
{
continue_ops();
}
}
// Always continue, to resume operations that hung due to the lack of a primary OSD
continue_ops();
}
bool cluster_client_t::get_immediate_commit(uint64_t inode)
@@ -1065,11 +1066,11 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
!pg_it->second.pause && pg_it->second.cur_primary)
{
osd_num_t primary_osd = pg_it->second.cur_primary;
part->osd_num = primary_osd;
auto peer_it = msgr.osd_peer_fds.find(primary_osd);
if (peer_it != msgr.osd_peer_fds.end())
{
int peer_fd = peer_it->second;
part->osd_num = primary_osd;
part->flags |= PART_SENT;
op->inflight_count++;
uint64_t pg_bitmap_size = (pool_cfg.data_block_size / pool_cfg.bitmap_granularity / 8) * (

View File

@@ -333,10 +333,7 @@ void etcd_state_client_t::start_etcd_watcher()
etcd_watch_ws = NULL;
}
if (this->log_level > 1)
{
fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %ju/%ju/%ju\n", etcd_address.c_str(),
etcd_watch_revision_config, etcd_watch_revision_osd, etcd_watch_revision_pg);
}
fprintf(stderr, "Trying to connect to etcd websocket at %s, watch from revision %ju\n", etcd_address.c_str(), etcd_watch_revision);
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
[this, cur_addr = selected_etcd_address](const http_response_t *msg)
{
@@ -351,20 +348,16 @@ void etcd_state_client_t::start_etcd_watcher()
}
else
{
uint64_t watch_id = data["result"]["watch_id"].uint64_value();
if (data["result"]["created"].bool_value())
{
uint64_t watch_id = data["result"]["watch_id"].uint64_value();
if (watch_id == ETCD_CONFIG_WATCH_ID ||
watch_id == ETCD_PG_STATE_WATCH_ID ||
watch_id == ETCD_PG_HISTORY_WATCH_ID ||
watch_id == ETCD_OSD_STATE_WATCH_ID)
{
etcd_watches_initialised++;
}
if (etcd_watches_initialised == ETCD_TOTAL_WATCHES && this->log_level > 0)
{
fprintf(stderr, "Successfully subscribed to etcd at %s, revision %ju/%ju/%ju\n", cur_addr.c_str(),
etcd_watch_revision_config, etcd_watch_revision_osd, etcd_watch_revision_pg);
}
fprintf(stderr, "Successfully subscribed to etcd at %s, revision %ju\n", cur_addr.c_str(), etcd_watch_revision);
}
if (data["result"]["canceled"].bool_value())
{
@@ -382,7 +375,7 @@ void etcd_state_client_t::start_etcd_watcher()
data["result"]["compact_revision"].uint64_value());
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
etcd_watch_revision_config = etcd_watch_revision_osd = etcd_watch_revision_pg = 0;
etcd_watch_revision = 0;
on_reload_hook();
}
return;
@@ -400,29 +393,13 @@ void etcd_state_client_t::start_etcd_watcher()
exit(1);
}
}
// Save revision only if it's present in the message - because sometimes etcd sends something without a header, like:
// {"error": {"grpc_code": 14, "http_code": 503, "http_status": "Service Unavailable", "message": "error reading from server: EOF"}}
if (etcd_watches_initialised == ETCD_TOTAL_WATCHES && !data["result"]["header"]["revision"].is_null())
{
// Restart watchers from the same revision number as in the last received message,
// not from the next one to protect against revision being split into multiple messages,
// even though etcd guarantees not to do that **within a single watcher** without fragment=true:
// https://etcd.io/docs/v3.5/learning/api_guarantees/#watch-apis
// Revision contents are ALWAYS split into separate messages for different watchers though!
// So generally we have to resume each watcher from its own revision...
// Progress messages may have watch_id=-1 if sent on behalf of multiple watchers though.
// And antietcd has extended semantics: it merges the same revision for all watchers
// into one message and just omits watch_id.
// So we also have to handle the case where watch_id is -1 or not present (0).
auto watch_rev = data["result"]["header"]["revision"].uint64_value();
if (!watch_id || watch_id == UINT64_MAX)
etcd_watch_revision_config = etcd_watch_revision_osd = etcd_watch_revision_pg = watch_rev;
else if (watch_id == ETCD_CONFIG_WATCH_ID)
etcd_watch_revision_config = watch_rev;
else if (watch_id == ETCD_PG_STATE_WATCH_ID)
etcd_watch_revision_pg = watch_rev;
else if (watch_id == ETCD_OSD_STATE_WATCH_ID)
etcd_watch_revision_osd = watch_rev;
// Protect against a revision being split into multiple messages and some
// of them being lost. Even though I'm not sure if etcd actually splits them.
// Also sometimes etcd sends something without a header, like:
// {"error": {"grpc_code": 14, "http_code": 503, "http_status": "Service Unavailable", "message": "error reading from server: EOF"}}
etcd_watch_revision = data["result"]["header"]["revision"].uint64_value();
addresses_to_try.clear();
}
// First gather all changes into a hash to remove multiple overwrites
@@ -480,7 +457,7 @@ void etcd_state_client_t::start_etcd_watcher()
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/") },
{ "range_end", base64_encode(etcd_prefix+"/config0") },
{ "start_revision", etcd_watch_revision_config },
{ "start_revision", etcd_watch_revision },
{ "watch_id", ETCD_CONFIG_WATCH_ID },
{ "progress_notify", true },
} }
@@ -489,21 +466,29 @@ void etcd_state_client_t::start_etcd_watcher()
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/osd/state/") },
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") },
{ "start_revision", etcd_watch_revision_osd },
{ "start_revision", etcd_watch_revision },
{ "watch_id", ETCD_OSD_STATE_WATCH_ID },
{ "progress_notify", true },
} }
}).dump());
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/") },
{ "range_end", base64_encode(etcd_prefix+"/pg0") },
{ "start_revision", etcd_watch_revision_pg },
{ "key", base64_encode(etcd_prefix+"/pg/state/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") },
{ "start_revision", etcd_watch_revision },
{ "watch_id", ETCD_PG_STATE_WATCH_ID },
{ "progress_notify", true },
} }
}).dump());
// FIXME: Do not watch /pg/history/ at all in client code (not in OSD)
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") },
{ "start_revision", etcd_watch_revision },
{ "watch_id", ETCD_PG_HISTORY_WATCH_ID },
{ "progress_notify", true },
} }
}).dump());
if (on_start_watcher_hook)
{
on_start_watcher_hook(etcd_watch_ws);
@@ -588,7 +573,7 @@ void etcd_state_client_t::load_global_config()
{
global_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
global_immediate_commit = parse_immediate_commit(global_config["immediate_commit"].string_value(), IMMEDIATE_ALL);
global_immediate_commit = parse_immediate_commit(global_config["immediate_commit"].string_value());
on_load_config_hook(global_config);
});
}
@@ -606,11 +591,6 @@ void etcd_state_client_t::load_pgs()
{ "key", base64_encode(etcd_prefix+"/config/pgs") },
} }
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/config") },
} }
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/inode/") },
@@ -660,10 +640,13 @@ void etcd_state_client_t::load_pgs()
return;
}
reset_pg_exists();
etcd_watch_revision_config = etcd_watch_revision_osd = etcd_watch_revision_pg = data["header"]["revision"].uint64_value()+1;
if (this->log_level > 3)
if (!etcd_watch_revision)
{
fprintf(stderr, "Loaded revision %ju of PG configuration\n", etcd_watch_revision_pg-1);
etcd_watch_revision = data["header"]["revision"].uint64_value()+1;
if (this->log_level > 3)
{
fprintf(stderr, "Loaded revision %ju of PG configuration\n", etcd_watch_revision-1);
}
}
for (auto & res: data["responses"].array_items())
{
@@ -730,7 +713,7 @@ void etcd_state_client_t::clean_nonexistent_pgs()
{
if (!pg_cfg.state_exists)
{
if (this->log_level > 3 && (pg_cfg.cur_primary || pg_cfg.cur_state))
if (this->log_level > 3)
{
fprintf(stderr, "PG %u/%u primary OSD disappeared after reload, forgetting it\n", pool_item.first, pg_it->first);
}
@@ -740,7 +723,7 @@ void etcd_state_client_t::clean_nonexistent_pgs()
}
if (!pg_cfg.history_exists)
{
if (this->log_level > 3 && (pg_cfg.target_history.size() || pg_cfg.all_peers.size() || pg_cfg.epoch || pg_cfg.next_scrub))
if (this->log_level > 3)
{
fprintf(stderr, "PG %u/%u history disappeared after reload, forgetting it\n", pool_item.first, pg_it->first);
}
@@ -884,7 +867,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
pc.used_for_fs = pool_item.second["used_for_fs"].as_string();
// Immediate Commit Mode
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value())
: global_immediate_commit;
// PG Stripe Size
pc.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
@@ -912,17 +895,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
on_change_pool_config_hook();
}
}
else if (key == etcd_prefix+"/pg/config" || key == etcd_prefix+"/config/pgs")
else if (key == etcd_prefix+"/config/pgs")
{
if (key == etcd_prefix+"/pg/config")
{
new_pg_config = !value.is_null();
}
else if (new_pg_config)
{
// Ignore old key if the new one is present
return;
}
for (auto & pool_item: this->pool_config)
{
for (auto & pg_item: pool_item.second.pg_config)
@@ -1201,11 +1175,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
}
uint32_t etcd_state_client_t::parse_immediate_commit(const std::string & immediate_commit_str, uint32_t default_value)
uint32_t etcd_state_client_t::parse_immediate_commit(const std::string & immediate_commit_str)
{
return (immediate_commit_str == "all" ? IMMEDIATE_ALL :
(immediate_commit_str == "small" ? IMMEDIATE_SMALL :
(immediate_commit_str == "none" ? IMMEDIATE_NONE : default_value)));
return immediate_commit_str == "all" ? IMMEDIATE_ALL :
(immediate_commit_str == "small" ? IMMEDIATE_SMALL : IMMEDIATE_NONE);
}
uint32_t etcd_state_client_t::parse_scheme(const std::string & scheme)

View File

@@ -10,9 +10,10 @@
#include "timerfd_manager.h"
#define ETCD_CONFIG_WATCH_ID 1
#define ETCD_OSD_STATE_WATCH_ID 2
#define ETCD_PG_STATE_WATCH_ID 3
#define ETCD_TOTAL_WATCHES 3
#define ETCD_PG_STATE_WATCH_ID 2
#define ETCD_PG_HISTORY_WATCH_ID 3
#define ETCD_OSD_STATE_WATCH_ID 4
#define ETCD_TOTAL_WATCHES 4
#define DEFAULT_BLOCK_SIZE 128*1024
#define MIN_DATA_BLOCK_SIZE 4*1024
@@ -94,7 +95,7 @@ protected:
std::string selected_etcd_address;
std::vector<std::string> addresses_to_try;
std::vector<inode_watch_t*> watches;
bool new_pg_config = false;
http_co_t *etcd_watch_ws = NULL, *keepalive_client = NULL;
int ws_keepalive_timer = -1;
int ws_alive = 0;
bool rand_initialized = false;
@@ -114,11 +115,8 @@ public:
int log_level = 0;
timerfd_manager_t *tfd = NULL;
http_co_t *etcd_watch_ws = NULL, *keepalive_client = NULL;
int etcd_watches_initialised = 0;
uint64_t etcd_watch_revision_config = 0;
uint64_t etcd_watch_revision_osd = 0;
uint64_t etcd_watch_revision_pg = 0;
uint64_t etcd_watch_revision = 0;
std::map<pool_id_t, pool_config_t> pool_config;
std::map<osd_num_t, json11::Json> peer_states;
std::set<osd_num_t> seen_peers;
@@ -159,6 +157,6 @@ public:
int address_count();
~etcd_state_client_t();
static uint32_t parse_immediate_commit(const std::string & immediate_commit_str, uint32_t default_value);
static uint32_t parse_immediate_commit(const std::string & immediate_commit_str);
static uint32_t parse_scheme(const std::string & scheme_str);
};

View File

@@ -271,7 +271,7 @@ void http_co_t::close_connection()
}
if (peer_fd >= 0)
{
tfd->set_fd_handler(peer_fd, false, NULL);
tfd->set_fd_handler(peer_fd, 0, NULL);
close(peer_fd);
peer_fd = -1;
}
@@ -314,7 +314,7 @@ void http_co_t::start_connection()
stackout();
return;
}
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_events();
@@ -372,7 +372,7 @@ void http_co_t::handle_connect_result()
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
tfd->set_fd_handler(peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_events();
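The hunks above apply the "Change bool wr to event mask in epoll_manager" commit to the HTTP client: set_fd_handler() now takes an epoll event mask instead of a boolean "write" flag - EPOLLIN|EPOLLOUT while connect() is in flight, EPOLLIN once connected, and 0 with a NULL callback to deregister the fd. A minimal sketch of the new calling convention (the function names here are illustrative; only the set_fd_handler() signature is taken from the hunks above):

#include <sys/epoll.h>
#include "timerfd_manager.h"

// Sketch only: demonstrates the mask-based set_fd_handler() contract.
static void watch_connecting_socket(timerfd_manager_t *tfd, int fd)
{
    // Wait for both readability and writability while connect() is in flight
    tfd->set_fd_handler(fd, EPOLLIN|EPOLLOUT, [tfd](int fd, int epoll_events)
    {
        if (epoll_events & EPOLLOUT)
        {
            // connect() finished - narrow the mask to reads only
            tfd->set_fd_handler(fd, EPOLLIN, [](int fd, int epoll_events)
            {
                // data (or EPOLLRDHUP) is ready to be handled
            });
        }
    });
}

static void unwatch_socket(timerfd_manager_t *tfd, int fd)
{
    // Mask 0 with a NULL callback removes the fd from the epoll set
    tfd->set_fd_handler(fd, 0, NULL);
}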

View File

@@ -16,32 +16,46 @@
#endif
#include <sys/poll.h>
#include <sys/eventfd.h>
msgr_iothread_t::msgr_iothread_t():
ring(RINGLOOP_DEFAULT_SIZE, true),
thread(&msgr_iothread_t::run, this)
static uint64_t one = 1;
msgr_iothread_t::msgr_iothread_t()
{
eventfd = ring.register_eventfd();
if (eventfd < 0)
ring = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epmgr = new epoll_manager_t(ring);
submit_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
if (submit_eventfd < 0)
{
throw std::runtime_error(std::string("failed to register eventfd: ") + strerror(-eventfd));
throw std::runtime_error(std::string("failed to create eventfd: ")+strerror(errno));
}
epmgr->tfd->set_fd_handler(submit_eventfd, EPOLLIN, [this](int fd, int epoll_events)
{
// Reset eventfd counter
uint64_t ctr = 0;
int r = read(submit_eventfd, &ctr, 8);
if (r < 0 && errno != EAGAIN && errno != EINTR)
{
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
}
ring->wakeup();
});
consumer.loop = [this]()
{
read_requests();
send_replies();
ring->submit();
};
ring->register_consumer(&consumer);
thread = new std::thread(&msgr_iothread_t::run, this);
}
msgr_iothread_t::~msgr_iothread_t()
{
stop();
}
void msgr_iothread_t::add_sqe(io_uring_sqe & sqe)
{
mu.lock();
queue.push_back((iothread_sqe_t){ .sqe = sqe, .data = std::move(*(ring_data_t*)sqe.user_data) });
if (queue.size() == 1)
{
cond.notify_all();
}
mu.unlock();
delete thread;
delete epmgr;
delete ring;
}
void msgr_iothread_t::stop()
@@ -53,65 +67,152 @@ void msgr_iothread_t::stop()
return;
}
stopped = true;
if (outer_loop_data)
{
outer_loop_data->callback = [](ring_data_t*){};
}
cond.notify_all();
close(eventfd);
write(submit_eventfd, &one, sizeof(one));
mu.unlock();
thread.join();
thread->join();
ring->unregister_consumer(&consumer);
close(submit_eventfd);
}
void msgr_iothread_t::add_to_ringloop(ring_loop_t *outer_loop)
void msgr_iothread_t::add_client(osd_client_t *cl)
{
assert(!this->outer_loop || this->outer_loop == outer_loop);
io_uring_sqe *sqe = outer_loop->get_sqe();
assert(sqe != NULL);
this->outer_loop = outer_loop;
this->outer_loop_data = ((ring_data_t*)sqe->user_data);
my_uring_prep_poll_add(sqe, eventfd, POLLIN);
outer_loop_data->callback = [this](ring_data_t *data)
mu.lock();
if (stopped)
{
if (data->res < 0)
mu.unlock();
return;
}
assert(!clients[cl->peer_fd]);
clients[cl->peer_fd] = cl;
epmgr->tfd->set_fd_handler(cl->peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
{
// FIXME: Slight copypaste (see handle_peer_epoll)
if (epoll_events & EPOLLIN)
{
throw std::runtime_error(std::string("eventfd poll failed: ") + strerror(-data->res));
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto cl = cl_it->second;
cl->mu.lock();
cl->read_ready++;
if (cl->read_ready == 1)
{
read_ready_clients.push_back(peer_fd);
ring->wakeup();
}
cl->mu.unlock();
}
}
outer_loop_data = NULL;
if (stopped)
});
mu.unlock();
}
void msgr_iothread_t::remove_client(osd_client_t *cl)
{
mu.lock();
if (stopped)
{
mu.unlock();
return;
}
auto cl_it = clients.find(cl->peer_fd);
if (cl_it != clients.end() && cl_it->second == cl)
{
clients.erase(cl->peer_fd);
epmgr->tfd->set_fd_handler(cl->peer_fd, 0, NULL);
}
mu.unlock();
}
void msgr_iothread_t::wakeup_out(int peer_fd, ring_loop_t *outer_ring)
{
write_ready_mu.lock();
if (!write_ready_clients.size())
{
io_uring_sqe* sqe = outer_ring->get_sqe();
if (!sqe)
{
return;
write(submit_eventfd, &one, sizeof(one));
}
add_to_ringloop(this->outer_loop);
ring.loop();
};
else
{
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [](ring_data_t*){};
my_uring_prep_write(sqe, submit_eventfd, &one, sizeof(one), 0);
}
}
write_ready_clients.push_back(peer_fd);
write_ready_mu.unlock();
}
void msgr_iothread_t::read_requests()
{
// FIXME: Slight copypaste (see messenger_t::read_requests)
auto to_recv = std::move(read_ready_clients);
for (int i = 0; i < to_recv.size(); i++)
{
int peer_fd = to_recv[i];
auto cl_it = clients.find(peer_fd);
if (cl_it == clients.end())
{
continue;
}
osd_client_t *cl = cl_it->second;
cl->mu.lock();
auto ok = cl->try_recv(ring, false);
cl->mu.unlock();
if (!ok)
{
read_ready_clients.insert(read_ready_clients.end(), to_recv.begin()+i, to_recv.end());
break;
}
}
}
void msgr_iothread_t::send_replies()
{
if (stopped)
{
return;
}
write_ready_mu.lock();
auto to_send = std::move(write_ready_clients);
write_ready_mu.unlock();
for (int i = 0; i < to_send.size(); i++)
{
auto cl_it = clients.find(to_send[i]);
if (cl_it == clients.end())
{
continue;
}
auto cl = cl_it->second;
cl->mu.lock();
auto ok = cl->try_send(ring, false/*, lock*/);
cl->mu.unlock();
if (!ok)
{
// ring is full (rare but what if...)
write_ready_mu.lock();
write_ready_clients.insert(write_ready_clients.end(), to_send.begin()+i, to_send.end());
write_ready_mu.unlock();
break;
}
}
}
void msgr_iothread_t::run()
{
while (true)
{
mu.lock();
if (stopped)
{
std::unique_lock<std::mutex> lk(mu);
while (!stopped && !queue.size())
cond.wait(lk);
if (stopped)
return;
int i = 0;
for (; i < queue.size(); i++)
{
io_uring_sqe *sqe = ring.get_sqe();
if (!sqe)
break;
ring_data_t *data = ((ring_data_t*)sqe->user_data);
*data = std::move(queue[i].data);
*sqe = queue[i].sqe;
sqe->user_data = (uint64_t)data;
}
queue.erase(queue.begin(), queue.begin()+i);
mu.unlock();
return;
}
// We only want to offload sendmsg/recvmsg. Callbacks will be called in main thread
ring.submit();
ring->loop();
mu.unlock();
ring->wait();
}
}
@@ -135,7 +236,7 @@ void osd_messenger_t::init()
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
tfd->set_fd_handler(rdma_context->channel->fd, EPOLLIN, [this](int notify_fd, int epoll_events)
{
handle_rdma_events();
});
@@ -149,8 +250,37 @@ void osd_messenger_t::init()
{
auto iot = new msgr_iothread_t();
iothreads.push_back(iot);
iot->add_to_ringloop(ringloop);
}
immediates_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
if (immediates_eventfd < 0)
{
throw std::runtime_error(std::string("failed to create set_immediate eventfd: ")+strerror(errno));
}
tfd->set_fd_handler(immediates_eventfd, EPOLLIN, [this](int peer_fd, int epoll_events)
{
// Reset eventfd counter
uint64_t ctr = 0;
int r = read(immediates_eventfd, &ctr, 8);
if (r < 0 && errno != EAGAIN && errno != EINTR)
{
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
}
while (true)
{
immediates_mu.lock();
auto to_run = std::move(immediates);
immediates_mu.unlock();
if (!to_run.size())
{
break;
}
for (auto & cb: to_run)
{
cb();
}
}
ringloop->wakeup();
});
}
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
{
@@ -229,6 +359,12 @@ void osd_messenger_t::init()
osd_messenger_t::~osd_messenger_t()
{
if (immediates_eventfd >= 0)
{
tfd->set_fd_handler(immediates_eventfd, 0, NULL);
close(immediates_eventfd);
immediates_eventfd = -1;
}
if (keepalive_timer_id >= 0)
{
tfd->clear_timer(keepalive_timer_id);
@@ -283,7 +419,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_odp = config["rdma_odp"].bool_value();
#endif
if (!osd_num)
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();
this->iothread_count = config["client_iothread_count"].is_null() ? 4 : (uint32_t)config["client_iothread_count"].uint64_value();
else
this->iothread_count = (uint32_t)config["osd_iothread_count"].uint64_value();
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
@@ -376,6 +512,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
{
fprintf(stderr, "Connecting to OSD %ju at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
}
clients[peer_fd]->msgr = this;
clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = peer_port;
clients[peer_fd]->peer_fd = peer_fd;
@@ -383,7 +520,8 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
clients[peer_fd]->connect_timeout_id = -1;
clients[peer_fd]->osd_num = peer_osd;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
{
// Either OUT (connected) or HUP
handle_connect_epoll(peer_fd);
@@ -424,7 +562,11 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
cl->peer_state = PEER_CONNECTED;
if (iothreads.size())
{
iothreads[peer_fd % iothreads.size()]->add_client(cl);
}
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
{
handle_peer_epoll(peer_fd, epoll_events);
});
@@ -608,7 +750,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
fprintf(stderr, "Connected to OSD %ju using RDMA\n", cl->osd_num);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, 0, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
@@ -643,13 +785,19 @@ void osd_messenger_t::accept_connections(int listen_fd)
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
clients[peer_fd] = new osd_client_t();
clients[peer_fd]->msgr = this;
clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
clients[peer_fd]->peer_fd = peer_fd;
clients[peer_fd]->peer_state = PEER_CONNECTED;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
// Add FD to epoll
if (iothreads.size())
{
iothreads[peer_fd % iothreads.size()]->add_client(clients[peer_fd]);
}
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
{
handle_peer_epoll(peer_fd, epoll_events);
});
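// The peer_fd % iothreads.size() mapping above recurs at every place a client is
// attached, so each connection is owned by exactly one iothread and its socket SQEs
// never race between rings. A hypothetical helper expressing the rule (not present
// in the patch):
msgr_iothread_t *osd_messenger_t::client_iothread(int peer_fd)
{
    return iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
}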

View File

@@ -11,6 +11,7 @@
#include <map>
#include <deque>
#include <vector>
#include <mutex>
#include "malloc_or_die.h"
#include "json11/json11.hpp"
@@ -45,8 +46,13 @@ struct msgr_rdma_connection_t;
struct msgr_rdma_context_t;
#endif
struct osd_messenger_t;
struct osd_client_t
{
std::mutex mu;
osd_messenger_t *msgr = NULL;
int refs = 0;
sockaddr_storage peer_addr;
@@ -59,6 +65,7 @@ struct osd_client_t
osd_num_t osd_num = 0;
void *in_buf = NULL;
uint32_t receive_buffer_size = 0;
#ifdef WITH_RDMA
msgr_rdma_connection_t *rdma_conn = NULL;
@@ -89,6 +96,17 @@ struct osd_client_t
std::vector<msgr_sendp_t> outbox, next_outbox;
~osd_client_t();
bool try_send(ring_loop_t *ringloop, bool use_sync_send_recv);
int handle_send(int result);
bool try_recv(ring_loop_t *ringloop, bool use_sync_send_recv);
int handle_read(int result);
bool handle_read_buffer(void *curbuf, int remain);
bool handle_finished_read();
void handle_op_hdr();
bool handle_reply_hdr();
void handle_reply_ready(osd_op_t *op);
};
struct osd_wanted_peer_t
@@ -111,41 +129,50 @@ struct osd_op_stats_t
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
};
#include <mutex>
#include <thread>
#ifdef __MOCK__
class msgr_iothread_t;
#else
#include "epoll_manager.h"
class msgr_iothread_t
{
protected:
ring_loop_t *ring = NULL;
epoll_manager_t *epmgr = NULL;
ring_consumer_t consumer;
int submit_eventfd = -1;
bool stopped = false;
std::mutex mu;
std::map<int, osd_client_t*> clients;
std::vector<int> read_ready_clients;
std::mutex write_ready_mu;
std::vector<int> write_ready_clients;
std::thread *thread = NULL;
void run();
void read_requests();
void send_replies();
public:
void handle_client_read(osd_client_t *cl, int res);
void handle_client_send(osd_client_t *cl, int res);
msgr_iothread_t();
~msgr_iothread_t();
void add_client(osd_client_t *cl);
void remove_client(osd_client_t *cl);
void wakeup_out(int peer_fd, ring_loop_t *outer_ring);
void stop();
void add_to_ringloop(ring_loop_t *outer_loop);
};
#endif
@@ -176,8 +203,10 @@ protected:
std::vector<msgr_iothread_t*> iothreads;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
int immediates_eventfd = -1;
std::mutex immediates_mu;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<std::function<void()>> immediates;
public:
timerfd_manager_t *tfd;
@@ -195,10 +224,13 @@ public:
void parse_config(const json11::Json & config);
void connect_peer(uint64_t osd_num, json11::Json peer_state);
void stop_client(int peer_fd, bool force = false, bool force_delete = false);
void stop_client_from_iothread(osd_client_t *cl);
void outbox_push(osd_op_t *cur_op);
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
void handle_client_read(osd_client_t *cl, int res);
void handle_client_send(osd_client_t *cl, int res);
void read_requests();
void send_replies();
void accept_connections(int listen_fd);
@@ -218,6 +250,9 @@ public:
void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len);
void measure_exec(osd_op_t *cur_op);
void set_immediate(std::function<void()> cb);
void set_immediate_or_run(std::function<void()> cb);
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
@@ -228,15 +263,7 @@ protected:
void cancel_osd_ops(osd_client_t *cl);
void cancel_op(osd_op_t *op);
bool try_send(osd_client_t *cl);
void handle_send(int result, osd_client_t *cl);
bool handle_read(int result, osd_client_t *cl);
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
bool handle_finished_read(osd_client_t *cl);
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
void handle_immediates();
#ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl);
@@ -245,4 +272,6 @@ protected:
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
#endif
friend struct osd_client_t;
};
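// The immediates_eventfd/immediates_mu pair declared above is a standard eventfd
// wakeup channel. A self-contained sketch of the same handshake (assuming the
// default non-semaphore eventfd semantics and a single reader):
#include <sys/eventfd.h>
#include <unistd.h>
#include <stdint.h>

int make_wakeup_fd()
{
    // counter starts at 0; non-blocking so the draining read never stalls the loop
    return eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
}

void wake(int efd)
{
    uint64_t one = 1;
    (void)write(efd, &one, sizeof(one)); // adds 1 to the counter and raises EPOLLIN
}

void drain(int efd)
{
    uint64_t ctr = 0;
    (void)read(efd, &ctr, sizeof(ctr)); // returns the accumulated count and resets it to 0
}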

View File

@@ -603,7 +603,7 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
if (!cl->handle_read_buffer(rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{
// handle_read_buffer may stop the client
continue;
@@ -666,9 +666,5 @@ void osd_messenger_t::handle_rdma_events()
}
}
} while (event_count > 0);
handle_immediates();
}

View File

@@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <unistd.h>
#include "messenger.h"
void osd_messenger_t::read_requests()
@@ -9,71 +10,119 @@ void osd_messenger_t::read_requests()
{
int peer_fd = read_ready_clients[i];
osd_client_t *cl = clients[peer_fd];
if (!cl->try_recv(ringloop, use_sync_send_recv))
{
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
return;
}
}
read_ready_clients.clear();
if (!iothreads.size())
{
handle_immediates();
}
}
bool osd_client_t::try_recv(ring_loop_t *ringloop, bool use_sync_send_recv)
{
auto cl = this;
if (cl->read_msg.msg_iovlen)
{
return true;
}
if (cl->read_remaining < cl->receive_buffer_size)
{
cl->read_iov.iov_base = cl->in_buf;
cl->read_iov.iov_len = cl->receive_buffer_size;
cl->read_msg.msg_iov = &cl->read_iov;
cl->read_msg.msg_iovlen = 1;
}
else
{
cl->read_iov.iov_base = 0;
cl->read_iov.iov_len = cl->read_remaining;
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
}
cl->refs++;
if (ringloop && !use_sync_send_recv)
{
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
cl->read_msg.msg_iovlen = 0;
return false;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (msgr->iothreads.size())
{
data->callback = [this](ring_data_t *data) { msgr->iothreads[peer_fd % msgr->iothreads.size()]->handle_client_read(this, data->res); };
}
else
{
data->callback = [this](ring_data_t *data) { msgr->handle_client_read(this, data->res); };
}
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
}
else
{
int result = recvmsg(peer_fd, &cl->read_msg, 0);
if (result < 0)
{
result = -errno;
}
msgr->handle_client_read(this, result);
}
return true;
}
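// Note the cl->refs++ above: the client object stays pinned while a recvmsg SQE is
// in flight, and handle_read() drops the reference on completion. This is what lets
// stop_client() mark a client PEER_STOPPED without freeing it under an outstanding
// io_uring operation.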
void osd_messenger_t::handle_client_read(osd_client_t *cl, int res)
{
res = cl->handle_read(res);
if (res == -ENOENT)
{
if (!cl->refs)
delete cl;
}
else if (res == -EIO)
{
stop_client(cl->peer_fd);
}
else if (res == -EAGAIN)
{
read_ready_clients.push_back(cl->peer_fd);
}
}
void msgr_iothread_t::handle_client_read(osd_client_t *cl, int res)
{
cl->mu.lock();
res = cl->handle_read(res);
if (res == -ENOENT)
{
if (!cl->refs)
cl->msgr->set_immediate([cl]() { delete cl; });
}
cl->mu.unlock();
if (res == -EIO)
{
cl->msgr->stop_client_from_iothread(cl);
}
else if (res == -EAGAIN)
{
read_ready_clients.push_back(cl->peer_fd);
ring->wakeup();
}
}
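// Return-code protocol shared by osd_client_t::handle_read() and handle_send(), as
// dispatched by the handle_client_read()/handle_client_send() variants above:
//   0       - progress made, nothing more to do for this client
//   -EAGAIN - readiness is still pending, requeue the client for another pass
//   -EIO    - fatal socket error, the caller must stop the client
//   -ENOENT - the client was already stopped, free it once refs drops to zero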
int osd_client_t::handle_read(int result)
{
auto cl = this;
cl->read_msg.msg_iovlen = 0;
cl->refs--;
if (cl->peer_state == PEER_STOPPED)
{
return -ENOENT;
}
if (result <= 0 && result != -EAGAIN && result != -EINTR)
{
@@ -82,27 +131,14 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
fprintf(stderr, "Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
}
return -EIO;
}
int expected = cl->read_iov.iov_len;
if (result > 0)
{
if (cl->read_iov.iov_base == cl->in_buf)
{
handle_read_buffer(cl->in_buf, result);
}
else
{
@@ -111,28 +147,25 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
cl->recv_list.eat(result);
if (cl->recv_list.done >= cl->recv_list.count)
{
handle_finished_read();
}
}
}
if (result == -EAGAIN || result == -EINTR || result < expected)
{
cl->read_ready--;
assert(cl->read_ready >= 0);
}
if (cl->read_ready > 0)
{
return -EAGAIN;
}
return 0;
}
bool osd_client_t::handle_read_buffer(void *curbuf, int remain)
{
auto cl = this;
// Compose operation(s) from the buffer
while (remain > 0)
{
@@ -168,7 +201,7 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
}
if (cl->recv_list.done >= cl->recv_list.count)
{
if (!handle_finished_read())
{
return false;
}
@@ -177,21 +210,20 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
return true;
}
bool osd_client_t::handle_finished_read()
{
auto cl = this;
cl->recv_list.reset();
if (cl->read_state == CL_READ_HDR)
{
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
return handle_reply_hdr();
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
handle_op_hdr();
else
{
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
msgr->stop_client_from_iothread(cl);
return false;
}
}
@@ -199,7 +231,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cl->read_op);
msgr->set_immediate([msgr = this->msgr, op = cl->read_op, cl]() { msgr->exec_op(op); });
cl->read_op = NULL;
cl->read_state = 0;
}
@@ -217,8 +249,9 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
return true;
}
void osd_client_t::handle_op_hdr()
{
auto cl = this;
osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
{
@@ -295,20 +328,21 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cur_op);
msgr->set_immediate([msgr = this->msgr, cur_op, cl]() { msgr->exec_op(cur_op); });
cl->read_op = NULL;
cl->read_state = 0;
}
}
bool osd_client_t::handle_reply_hdr()
{
auto cl = this;
auto req_it = cl->sent_ops.find(cl->read_op->req.hdr.id);
if (req_it == cl->sent_ops.end())
{
// Command out of sync. Drop connection
fprintf(stderr, "Client %d command out of sync: id %ju\n", cl->peer_fd, cl->read_op->req.hdr.id);
msgr->stop_client_from_iothread(cl);
return false;
}
osd_op_t *op = req_it->second;
@@ -325,7 +359,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %jd+%u\n",
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
cl->sent_ops[op->req.hdr.id] = op;
msgr->stop_client_from_iothread(cl);
return false;
}
if (bmp_len > 0)
@@ -401,24 +435,92 @@ reuse:
return true;
}
void osd_client_t::handle_reply_ready(osd_op_t *op)
{
msgr->set_immediate([msgr = this->msgr, op, cl = this]()
{
// Measure subop latency
auto & stats = msgr->stats;
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
stats.subop_stat_count[op->req.hdr.opcode]++;
if (!stats.subop_stat_count[op->req.hdr.opcode])
{
stats.subop_stat_count[op->req.hdr.opcode]++;
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
}
stats.subop_stat_sum[op->req.hdr.opcode] += (
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
});
}
static uint64_t one = 1;
void osd_messenger_t::set_immediate(std::function<void()> cb/*, ring_loop_t *ringloop*/)
{
if (!iothreads.size())
{
immediates.push_back(cb);
return;
}
immediates_mu.lock();
bool wakeup_main_thread = !immediates.size();
immediates.push_back(cb);
immediates_mu.unlock();
if (wakeup_main_thread)
{
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
// if (!sqe)
// {
write(immediates_eventfd, &one, sizeof(one));
// FIXME: Can't use ringloop here, oops
// }
// else
// {
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
// data->callback = [](ring_data_t*){};
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
// }
}
}
void osd_messenger_t::set_immediate_or_run(std::function<void()> cb/*, ring_loop_t *ringloop*/)
{
if (!iothreads.size())
{
cb();
return;
}
immediates_mu.lock();
bool wakeup_main_thread = !immediates.size();
immediates.push_back(cb);
immediates_mu.unlock();
if (wakeup_main_thread)
{
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
// if (!sqe)
// {
write(immediates_eventfd, &one, sizeof(one));
// FIXME: Can't use ringloop here, oops
// }
// else
// {
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
// data->callback = [](ring_data_t*){};
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
// }
}
}
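// The only behavioural difference from set_immediate() above is the no-iothread
// path: without iothreads the callback runs inline, so single-threaded setups keep
// the old synchronous behaviour and pay no eventfd round-trip.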
void osd_messenger_t::handle_immediates()
{
auto to_run = std::move(immediates);
for (auto & cb: to_run)
{
cb();
}
}
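// handle_immediates() moves the vector into a local before iterating because a
// callback may call set_immediate() again; iterating the member directly while it
// grows would invalidate the loop. The same re-entrancy-safe drain pattern in
// isolation (illustrative sketch, not part of the patch):
void run_pending(std::vector<std::function<void()>> & queue)
{
    auto to_run = std::move(queue); // queue is left empty, so re-entrant pushes are safe
    for (auto & cb: to_run)
    {
        cb();
    }
}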

View File

@@ -15,10 +15,17 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
}
else if (cur_op->op_type == OSD_OP_IN)
{
measure_exec(cur_op);
}
if (iothreads.size())
{
cl->mu.lock();
}
if (cur_op->op_type == OSD_OP_IN)
{
// Check that operation actually belongs to this client
// FIXME: Review if this is still needed
bool found = false;
for (auto it = cl->received_ops.begin(); it != cl->received_ops.end(); it++)
{
@@ -32,6 +39,10 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
if (!found)
{
delete cur_op;
if (iothreads.size())
{
cl->mu.unlock();
}
return;
}
}
@@ -39,7 +50,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
if (cur_op->op_type == OSD_OP_IN)
{
to_send_list.push_back((iovec){ .iov_base = cur_op->reply.buf, .iov_len = OSD_PACKET_SIZE });
}
else
@@ -108,21 +118,36 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
#ifdef WITH_RDMA
if (cl->peer_state == PEER_RDMA)
{
if (iothreads.size())
{
cl->mu.unlock();
}
try_send_rdma(cl);
return;
}
#endif
if (iothreads.size())
{
int should_wakeup = !cl->write_msg.msg_iovlen && !cl->write_state;
cl->write_state = CL_WRITE_READY;
cl->mu.unlock();
if (should_wakeup)
{
auto iot = iothreads[cl->peer_fd % iothreads.size()];
iot->wakeup_out(cl->peer_fd, ringloop);
}
}
else if (!ringloop)
{
// FIXME: It's worse because it doesn't allow batching
while (cl->outbox.size())
{
cl->try_send(NULL, true);
}
}
else
{
if ((cl->write_msg.msg_iovlen > 0 || !cl->try_send(ringloop, use_sync_send_recv)) && (cl->write_state == 0))
{
cl->write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);
@@ -180,8 +205,9 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
}
}
bool osd_client_t::try_send(ring_loop_t *ringloop, bool use_sync_send_recv)
{
auto cl = this;
int peer_fd = cl->peer_fd;
if (!cl->send_list.size() || cl->write_msg.msg_iovlen > 0)
{
@@ -189,11 +215,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
}
if (ringloop && !use_sync_send_recv)
{
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
return false;
@@ -202,12 +224,15 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
cl->refs++;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (msgr->iothreads.size())
{
data->callback = [this](ring_data_t *data) { msgr->iothreads[this->peer_fd % msgr->iothreads.size()]->handle_client_send(this, data->res); };
}
else
{
data->callback = [this](ring_data_t *data) { msgr->handle_client_send(this, data->res); };
}
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
}
else
{
@@ -219,18 +244,68 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
result = -errno;
}
msgr->handle_client_send(this, result);
}
return true;
}
void osd_messenger_t::handle_client_send(osd_client_t *cl, int res)
{
res = cl->handle_send(res);
if (res == -ENOENT)
{
if (!cl->refs)
delete cl;
}
else if (res == -EIO)
{
stop_client(cl->peer_fd);
}
else if (res == -EAGAIN)
{
write_ready_clients.push_back(cl->peer_fd);
}
}
void msgr_iothread_t::handle_client_send(osd_client_t *cl, int res)
{
cl->mu.lock();
res = cl->handle_send(res);
if (res == -ENOENT)
{
if (!cl->refs)
cl->msgr->set_immediate([cl]() { delete cl; });
}
cl->mu.unlock();
if (res == -EIO)
{
cl->msgr->stop_client_from_iothread(cl);
}
else if (res == -EAGAIN)
{
write_ready_mu.lock();
write_ready_clients.push_back(cl->peer_fd);
write_ready_mu.unlock();
ring->wakeup();
}
}
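// wakeup_out() itself is absent from this diff; given the fields it must touch
// (write_ready_mu, write_ready_clients, submit_eventfd), a plausible shape (an
// assumption, not the author's code) is to queue the fd under the lock and poke the
// iothread only on the empty-to-non-empty transition. The outer_ring argument could
// submit the eventfd write as an SQE; this sketch uses a plain write():
void msgr_iothread_t::wakeup_out(int peer_fd, ring_loop_t *outer_ring)
{
    write_ready_mu.lock();
    bool wakeup_needed = !write_ready_clients.size();
    write_ready_clients.push_back(peer_fd);
    write_ready_mu.unlock();
    if (wakeup_needed)
    {
        uint64_t one = 1;
        (void)write(submit_eventfd, &one, sizeof(one)); // assumed wakeup channel into the iothread
    }
}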
void osd_messenger_t::send_replies()
{
if (iothreads.size())
{
return;
}
for (int i = 0; i < write_ready_clients.size(); i++)
{
int peer_fd = write_ready_clients[i];
auto cl_it = clients.find(peer_fd);
if (cl_it == clients.end())
{
continue;
}
auto cl = cl_it->second;
if (!cl->try_send(ringloop, use_sync_send_recv))
{
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
return;
@@ -239,24 +314,20 @@ void osd_messenger_t::send_replies()
write_ready_clients.clear();
}
int osd_client_t::handle_send(int result)
{
auto cl = this;
cl->write_msg.msg_iovlen = 0;
cl->refs--;
if (cl->peer_state == PEER_STOPPED)
{
return -ENOENT;
}
if (result < 0 && result != -EAGAIN && result != -EINTR)
{
// this is a client socket, so don't panic. just disconnect it
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
return -EIO;
}
if (result >= 0)
{
@@ -269,7 +340,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
msgr->set_immediate_or_run([op = cl->outbox[done].op] { delete op; });
}
result -= iov.iov_len;
done++;
@@ -299,26 +370,35 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
{
// FIXME: Do something better than just forgetting the FD
// FIXME: Ignore pings during RDMA state transition
cl->peer_state = PEER_RDMA;
msgr->set_immediate_or_run([cl = this, msgr = this->msgr, peer_fd = this->peer_fd]()
{
auto cl_it = msgr->clients.find(peer_fd);
if (cl_it == msgr->clients.end() || cl_it->second != cl)
{
return;
}
if (msgr->log_level > 0)
{
fprintf(stderr, "Successfully connected with client %d using RDMA\n", peer_fd);
}
msgr->tfd->set_fd_handler(peer_fd, 0, [msgr](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
msgr->handle_peer_epoll(peer_fd, epoll_events);
}
});
// Add the initial receive request
msgr->try_recv_rdma(cl);
});
}
#endif
}
if (cl->write_state != 0)
{
return -EAGAIN;
}
return 0;
}

View File

@@ -11,6 +11,7 @@
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
cl->mu.lock();
std::vector<osd_op_t*> cancel_ops;
cancel_ops.resize(cl->sent_ops.size());
int i = 0;
@@ -20,6 +21,7 @@ void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
}
cl->sent_ops.clear();
cl->outbox.clear();
cl->mu.unlock();
for (auto op: cancel_ops)
{
cancel_op(op);
@@ -53,8 +55,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
return;
}
osd_client_t *cl = it->second;
cl->mu.lock();
if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED)
{
cl->mu.unlock();
return;
}
if (log_level > 0)
@@ -71,6 +75,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
// First set state to STOPPED so another stop_client() call doesn't try to free it again
cl->refs++;
cl->peer_state = PEER_STOPPED;
cl->mu.unlock();
if (cl->osd_num)
{
// ...and forget OSD peer
@@ -78,7 +83,11 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
}
#ifndef __MOCK__
// Then remove FD from the eventloop so we don't accidentally read something
tfd->set_fd_handler(peer_fd, 0, NULL);
if (iothreads.size())
{
iothreads[peer_fd % iothreads.size()]->remove_client(cl);
}
if (cl->connect_timeout_id >= 0)
{
tfd->clear_timer(cl->connect_timeout_id);
@@ -108,17 +117,24 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
repeer_pgs(cl->osd_num);
}
// Then cancel all operations
cl->mu.lock();
if (cl->read_op)
{
auto op = cl->read_op;
cl->read_op = NULL;
cl->mu.unlock();
if (!op->callback)
{
delete op;
}
else
{
cancel_op(op);
}
}
else
{
cl->mu.unlock();
}
if (cl->osd_num)
{
@@ -131,11 +147,32 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
{
clients.erase(it);
}
cl->mu.lock();
cl->refs--;
if (cl->refs <= 0 || force_delete)
{
cl->mu.unlock();
delete cl;
}
else
cl->mu.unlock();
}
void osd_messenger_t::stop_client_from_iothread(osd_client_t *cl)
{
if (!iothreads.size())
{
stop_client(cl->peer_fd);
return;
}
set_immediate([this, cl, peer_fd = cl->peer_fd]()
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end() && cl_it->second == cl)
{
stop_client(peer_fd);
}
});
}
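// Note the find()+pointer comparison in the deferred lambda above: by the time it
// runs on the main thread, peer_fd may already have been reused by a brand-new
// connection, so matching the fd alone could stop the wrong client. Comparing the
// stored osd_client_t pointer as well guards against that reuse race.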
osd_client_t::~osd_client_t()

View File

@@ -253,7 +253,7 @@ nla_put_failure:
const char *exe_name = NULL;
const char *help_text =
"Vitastor NBD proxy " VITASTOR_VERSION "\n"
"Vitastor NBD proxy " VERSION "\n"
"(c) Vitaliy Filippov, 2020+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -655,7 +655,7 @@ help:
ringloop->register_consumer(&consumer);
// Add FD to epoll
bool stop = false;
epmgr->tfd->set_fd_handler(sockfd[0], EPOLLIN, [this, &stop](int peer_fd, int epoll_events)
{
if (epoll_events & EPOLLRDHUP)
{

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 1.6.1
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -384,28 +384,6 @@ int vitastor_c_inode_get_readonly(void *handle)
return watch->cfg.readonly;
}
uint64_t vitastor_c_inode_get_parent_id(void *handle)
{
inode_watch_t *watch = (inode_watch_t*)handle;
return watch->cfg.parent_id;
}
char* vitastor_c_inode_get_meta(void *handle)
{
inode_watch_t *watch = (inode_watch_t*)handle;
if (watch->cfg.meta.is_null())
{
return NULL;
}
return strdup(watch->cfg.meta.dump().c_str());
}
uint64_t vitastor_c_inode_get_mod_revision(void *handle)
{
inode_watch_t *watch = (inode_watch_t*)handle;
return watch->cfg.mod_revision;
}
uint32_t vitastor_c_inode_get_immediate_commit(vitastor_c *client, uint64_t inode_num)
{
auto pool_it = client->cli->st_cli.pool_config.find(INODE_POOL(inode_num));

View File

@@ -69,9 +69,6 @@ void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler c
void vitastor_c_close_watch(vitastor_c *client, void *handle);
uint64_t vitastor_c_inode_get_size(void *handle);
uint64_t vitastor_c_inode_get_num(void *handle);
uint64_t vitastor_c_inode_get_parent_id(void *handle);
char* vitastor_c_inode_get_meta(void *handle);
uint64_t vitastor_c_inode_get_mod_revision(void *handle);
uint32_t vitastor_c_inode_get_block_size(vitastor_c *client, uint64_t inode_num);
uint32_t vitastor_c_inode_get_bitmap_granularity(vitastor_c *client, uint64_t inode_num);
int vitastor_c_inode_get_readonly(void *handle);

View File

@@ -12,9 +12,7 @@ add_library(vitastor_cli STATIC
cli_ls.cpp
cli_create.cpp
cli_modify.cpp
cli_modify_osd.cpp
cli_osd_tree.cpp
cli_pg_ls.cpp
cli_flatten.cpp
cli_merge.cpp
cli_rm_data.cpp

View File

@@ -17,7 +17,7 @@
static const char *exe_name = NULL;
static const char* help_text =
"Vitastor command-line tool " VITASTOR_VERSION "\n"
"Vitastor command-line tool " VERSION "\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -70,7 +70,6 @@ static const char* help_text =
" --wait-list Retrieve full objects listings before starting to remove objects.\n"
" Requires more memory, but allows to show correct removal progress.\n"
" --min-offset Purge only data starting with specified offset.\n"
" --max-offset Purge only data before specified offset.\n"
"\n"
"vitastor-cli merge-data <from> <to> [--target <target>]\n"
" Merge layer data without changing metadata. Merge <from>..<to> to <target>.\n"
@@ -119,23 +118,11 @@ static const char* help_text =
" With --dry-run only checks if deletion is possible without data loss and\n"
" redundancy degradation.\n"
"\n"
"vitastor-cli osd-tree [-l|--long]\n"
" Show current OSD tree, optionally with I/O statistics if -l is specified.\n"
"vitastor-cli osd-tree\n"
" Show current OSD tree.\n"
"\n"
"vitastor-cli osds|ls-osd|osd-ls [-l|--long]\n"
" Show current OSDs as list, optionally with I/O statistics if -l is specified.\n"
"\n"
"vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>\n"
" Set OSD reweight, tags or noout flag.\n"
"\n"
"vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]\n"
" List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:\n"
" --pool <pool name or number> Only list PGs of the given pool.\n"
" --min <min pg number> Only list PGs with number >= min.\n"
" --max <max pg number> Only list PGs with number <= max.\n"
" Examples:\n"
" vitastor-cli pg-list active+degraded\n"
" vitastor-cli pg-list ^active\n"
"vitastor-cli osds|ls-osd|osd-ls\n"
" Show current OSDs as list.\n"
"\n"
"vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]\n"
" Create a pool. Required parameters:\n"
@@ -149,7 +136,7 @@ static const char* help_text =
" --osd_tags <tag>[,<tag>]... Put pool only on OSDs tagged with all specified tags\n"
" --block_size 128k Put pool only on OSDs with this data block size\n"
" --bitmap_granularity 4k Put pool only on OSDs with this logical sector size\n"
" --immediate_commit all Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
@@ -419,23 +406,6 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
cfg["flat"] = true;
action_cb = p->start_osd_tree(cfg);
}
else if (cmd[0] == "modify-osd")
{
// Modify OSD configuration
if (cmd.size() > 1)
cfg["osd_num"] = cmd[1];
action_cb = p->start_modify_osd(cfg);
}
else if (cmd[0] == "pg-list" || cmd[0] == "pg-ls" || cmd[0] == "list-pg" || cmd[0] == "ls-pg" || cmd[0] == "ls-pgs")
{
// List PGs with the given state filters
if (cmd.size() > 1)
{
cmd.erase(cmd.begin(), cmd.begin()+1);
cfg["pg_state"] = cmd;
}
action_cb = p->start_pg_list(cfg);
}
else if (cmd[0] == "create-pool" || cmd[0] == "pool-create")
{
// Create a new pool

View File

@@ -65,9 +65,7 @@ public:
std::function<bool(cli_result_t &)> start_ls(json11::Json);
std::function<bool(cli_result_t &)> start_merge(json11::Json);
std::function<bool(cli_result_t &)> start_modify(json11::Json);
std::function<bool(cli_result_t &)> start_modify_osd(json11::Json);
std::function<bool(cli_result_t &)> start_osd_tree(json11::Json);
std::function<bool(cli_result_t &)> start_pg_list(json11::Json);
std::function<bool(cli_result_t &)> start_pool_create(json11::Json);
std::function<bool(cli_result_t &)> start_pool_modify(json11::Json);
std::function<bool(cli_result_t &)> start_pool_rm(json11::Json);

View File

@@ -1,210 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "cli.h"
#include "cluster_client.h"
#include "str_util.h"
#include "http_client.h"
// Reweight OSD, change tags or set noout flag
struct osd_changer_t
{
cli_tool_t *parent;
uint64_t osd_num = 0;
bool set_tags = false;
std::vector<std::string> new_tags;
bool set_reweight = false;
double new_reweight = 1;
bool set_noout = false;
bool new_noout = false;
bool force = false;
json11::Json::object osd_cfg;
uint64_t osd_cfg_mod_rev = 0;
json11::Json::array compare, success;
int state = 0;
std::function<bool(cli_result_t &)> cb;
cli_result_t result;
bool is_done()
{
return state == 100;
}
void loop()
{
if (state == 1)
goto resume_1;
else if (state == 2)
goto resume_2;
if (!osd_num)
{
result = (cli_result_t){ .err = EINVAL, .text = "OSD number is missing" };
state = 100;
return;
}
if (!set_tags && !set_reweight && !set_noout)
{
result = (cli_result_t){ .err = EINVAL, .text = "Nothing to update" };
state = 100;
return;
}
if (set_reweight && new_reweight < 0)
{
result = (cli_result_t){ .err = EINVAL, .text = "Reweight can't be negative" };
state = 100;
return;
}
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
} },
},
} },
});
state = 1;
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
{
auto osd_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]).value;
if (!osd_stats.is_object() && !force)
{
result = (cli_result_t){ .err = ENOENT, .text = "OSD "+std::to_string(osd_num)+" does not exist. Use --force to set configuration anyway" };
state = 100;
return;
}
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]);
osd_cfg_mod_rev = kv.mod_revision;
osd_cfg = kv.value.object_items();
if (set_reweight)
{
if (new_reweight != 1)
osd_cfg["reweight"] = new_reweight;
else
osd_cfg.erase("reweight");
}
if (set_tags)
{
if (new_tags.size())
osd_cfg["tags"] = new_tags;
else
osd_cfg.erase("tags");
}
if (set_noout)
{
if (new_noout)
osd_cfg["noout"] = true;
else
osd_cfg.erase("noout");
}
compare.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
{ "result", "LESS" },
{ "mod_revision", osd_cfg_mod_rev+1 },
});
if (!osd_cfg.size())
{
success.push_back(json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
} },
});
}
else
{
success.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+std::to_string(osd_num)) },
{ "value", base64_encode(json11::Json(osd_cfg).dump()) },
} },
});
}
}
parent->etcd_txn(json11::Json::object {
{ "compare", compare },
{ "success", success },
});
state = 2;
resume_2:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
if (!parent->etcd_result["succeeded"].bool_value())
{
result = (cli_result_t){ .err = EAGAIN, .text = "OSD "+std::to_string(osd_num)+" configuration was modified by someone else, please repeat your request" };
state = 100;
return;
}
result = (cli_result_t){
.err = 0,
.text = "OSD "+std::to_string(osd_num)+" configuration modified",
.data = osd_cfg,
};
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_modify_osd(json11::Json cfg)
{
auto changer = new osd_changer_t();
changer->parent = this;
changer->osd_num = cfg["osd_num"].uint64_value();
if (!cfg["tags"].is_null())
{
changer->set_tags = true;
if (cfg["tags"].is_string())
{
if (cfg["tags"].string_value() != "")
changer->new_tags = explode(",", cfg["tags"].string_value(), true);
}
else if (cfg["tags"].is_array())
{
for (auto item: cfg["tags"].array_items())
changer->new_tags.push_back(item.as_string());
}
}
if (!cfg["reweight"].is_null())
{
changer->set_reweight = true;
changer->new_reweight = cfg["reweight"].number_value();
}
if (!cfg["noout"].is_null())
{
changer->set_noout = true;
changer->new_noout = json_is_true(cfg["noout"]);
}
changer->force = cfg["force"].bool_value();
return [changer](cli_result_t & result)
{
changer->loop();
if (changer->is_done())
{
result = changer->result;
delete changer;
return true;
}
return false;
};
}
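// The two-transaction sequence above is a classic etcd compare-and-swap: read the
// key together with its mod_revision, edit the value locally, then commit guarded by
// a "mod_revision < rev+1" compare, so a concurrent writer fails the transaction
// (surfaced as EAGAIN) instead of silently losing updates. A hedged sketch of the
// core loop, with hypothetical etcd_get()/etcd_cas() helpers:
bool modify_with_cas(const std::string & key, std::function<std::string(const std::string &)> edit)
{
    for (int attempt = 0; attempt < 10; attempt++)
    {
        auto kv = etcd_get(key); // hypothetical: returns { value, mod_revision }
        if (etcd_cas(key, edit(kv.value), kv.mod_revision)) // hypothetical guarded txn
        {
            return true; // nobody modified the key in between
        }
    }
    return false; // persistent contention, report EAGAIN to the caller
}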

View File

@@ -17,7 +17,6 @@ struct placement_osd_t
uint64_t free;
bool up;
double reweight;
bool noout;
uint32_t block_size, bitmap_granularity, immediate_commit;
};
@@ -133,10 +132,9 @@ resume_1:
.free = kv.second["free"].uint64_value(),
.up = parent->cli->st_cli.peer_states.find(kv.first) != parent->cli->st_cli.peer_states.end(),
.reweight = 1,
.noout = false,
.block_size = (uint32_t)kv.second["data_block_size"].uint64_value(),
.bitmap_granularity = (uint32_t)kv.second["bitmap_granularity"].uint64_value(),
.immediate_commit = etcd_state_client_t::parse_immediate_commit(kv.second["immediate_commit"].string_value()),
};
if (tree->nodes.find(osd.parent) == tree->nodes.end())
{
@@ -156,7 +154,6 @@ resume_1:
for (auto & jtag: osd_cfg["tags"].array_items())
osd.tags.push_back(jtag.string_value());
}
osd.noout = osd_cfg["noout"].bool_value();
}
auto np_it = node_placement.find(std::to_string(osd.num));
if (np_it != node_placement.end())
@@ -181,7 +178,7 @@ resume_1:
return tree;
}
std::string format_tree()
{
std::vector<std::string> node_seq = { "" };
std::vector<int> indents = { -1 };
@@ -201,39 +198,6 @@ resume_1:
}
}
json11::Json::array fmt_items;
if (parent->json_output)
{
for (int i = 1; i < node_seq.size(); i++)
{
auto & node = placement_tree->nodes.at(node_seq[i]);
fmt_items.push_back(json11::Json::object{
{ "type", node.level },
{ "name", node.name },
{ "parent", node.parent },
});
for (uint64_t osd_num: node.child_osds)
{
auto & osd = placement_tree->osds.at(osd_num);
fmt_items.push_back(json11::Json::object{
{ "type", "osd" },
{ "name", osd.num },
{ "parent", node.name },
{ "up", osd.up ? "up" : "down" },
{ "size", osd.size },
{ "free", osd.free },
{ "reweight", osd.reweight },
{ "noout", osd.noout },
{ "tags", osd.tags },
{ "block", (uint64_t)osd.block_size },
{ "bitmap", (uint64_t)osd.bitmap_granularity },
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
{ "op_stats", osd_stats[osd_num]["op_stats"] },
});
}
}
result.data = fmt_items;
return;
}
for (int i = 1; i < node_seq.size(); i++)
{
auto & node = placement_tree->nodes.at(node_seq[i]);
@@ -265,7 +229,6 @@ resume_1:
{ "size", format_size(osd.size, false, true) },
{ "used", format_q(100.0*(osd.size - osd.free)/osd.size)+" %" },
{ "reweight", format_q(osd.reweight) },
{ "noout", osd.noout ? "noout" : "-" },
{ "tags", implode(",", osd.tags) },
{ "block", format_size(osd.block_size, false, true) },
{ "bitmap", format_size(osd.bitmap_granularity, false, true) },
@@ -338,10 +301,6 @@ resume_1:
{ "key", "commit" },
{ "title", "IMM" },
});
cols.push_back(json11::Json::object{
{ "key", "noout" },
{ "title", "NOOUT" },
});
if (show_stats)
{
cols.push_back(json11::Json::object{
@@ -381,7 +340,7 @@ resume_1:
{ "title", "LAT" },
});
}
return print_table(fmt_items, cols, parent->color);
}
void loop()
@@ -392,7 +351,7 @@ resume_1:
load_osd_tree();
if (parent->waiting > 0)
return;
result.text = format_tree();
state = 100;
}
};

View File

@@ -1,288 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2024
// License: VNPL-1.1 (see README.md for details)
#include "cli.h"
#include "cluster_client.h"
#include "pg_states.h"
#include "str_util.h"
struct pg_lister_t
{
cli_tool_t *parent;
uint64_t pool_id = 0;
std::string pool_name;
std::vector<std::string> pg_state;
uint64_t min_pg_num = 0;
uint64_t max_pg_num = 0;
std::map<pool_pg_num_t, json11::Json> pg_stats;
int state = 0;
cli_result_t result;
bool is_done() { return state == 100; }
void load_pg_stats()
{
if (state == 1)
goto resume_1;
if (pool_name != "")
{
pool_id = 0;
for (auto & pp: parent->cli->st_cli.pool_config)
{
if (pp.second.name == pool_name)
{
pool_id = pp.first;
break;
}
}
if (!pool_id)
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+pool_name+" not found" };
state = 100;
return;
}
}
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/pgstats"+(pool_id ? "/"+std::to_string(pool_id)+"/" : "/")) },
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/pgstats"+(pool_id ? "/"+std::to_string(pool_id)+"0" : "0")) },
} },
},
} },
});
state = 1;
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pgstats/", [&](pool_id_t pool_id, uint64_t pg_num, json11::Json value)
{
pg_stats[(pool_pg_num_t){ .pool_id = pool_id, .pg_num = (pg_num_t)pg_num }] = value;
});
}
void format_pgs()
{
uint64_t is_not = ((uint64_t)1 << 63);
std::vector<uint64_t> masks;
if (pg_state.size())
{
for (auto & st: pg_state)
{
if (st.size())
{
uint64_t mask = 0;
size_t pos = 0;
if (st[0] == '!' || st[0] == '^')
{
mask |= is_not;
pos++;
}
size_t prev = pos;
while (true)
{
if (pos < st.size() && (st[pos] >= 'a' && st[pos] <= 'z' || st[pos] == '_'))
pos++;
else
{
if (pos > prev)
{
std::string bit = st.substr(prev, pos-prev);
bool found = false;
for (int i = 0; i < pg_state_bit_count; i++)
{
if (pg_state_names[i] == bit)
{
mask |= (uint64_t)1 << i;
found = true;
break;
}
}
if (!found)
{
result = (cli_result_t){ .err = EINVAL, .text = "Unknown PG state "+bit };
state = 100;
return;
}
}
while (pos < st.size() && !(st[pos] >= 'a' && st[pos] <= 'z' || st[pos] == '_'))
pos++;
prev = pos;
if (pos >= st.size())
break;
}
}
masks.push_back(mask);
}
}
}
json11::Json::array pgs;
for (auto & pp: parent->cli->st_cli.pool_config)
{
if ((!pool_id || pp.first == pool_id) && (pool_name == "" || pp.second.name == pool_name))
{
for (auto & pgp: pp.second.pg_config)
{
if (min_pg_num && pgp.first < min_pg_num || max_pg_num && pgp.first > max_pg_num)
{
continue;
}
if (masks.size())
{
bool found = false;
for (auto mask: masks)
{
if ((mask & is_not)
? (pgp.second.cur_state & (mask & ~is_not)) != (mask & ~is_not)
: ((pgp.second.cur_state & mask) == mask))
{
found = true;
break;
}
}
if (!found)
continue;
}
json11::Json::array state_names;
for (int i = 0; i < pg_state_bit_count; i++)
{
if (pgp.second.cur_state & (1 << i))
{
state_names.push_back(std::string(pg_state_names[i]));
}
}
if (!pgp.second.cur_state)
{
state_names.push_back("offline");
}
auto stat = pg_stats[(pool_pg_num_t){ .pool_id = pp.first, .pg_num = pgp.first }].object_items();
stat.erase("write_osd_set");
stat["pool_id"] = (uint64_t)pp.first;
stat["pool_name"] = pp.second.name;
stat["pg_num"] = (uint64_t)pgp.first;
stat["pause"] = pgp.second.pause;
stat["state"] = state_names;
stat["cur_primary"] = pgp.second.cur_primary;
stat["target_primary"] = pgp.second.primary;
stat["target_set"] = pgp.second.target_set;
stat["target_history"] = pgp.second.target_history;
stat["all_peers"] = pgp.second.all_peers;
stat["epoch"] = pgp.second.epoch;
stat["next_scrub"] = pgp.second.next_scrub;
if (!parent->json_output)
{
stat["fmt_state"] = implode("+", state_names);
stat["fmt_primary"] = (!pgp.second.primary && !pgp.second.cur_primary
? "-"
: (std::to_string(pgp.second.cur_primary) + (pgp.second.primary == pgp.second.cur_primary
? ""
: "->"+std::to_string(pgp.second.primary))));
stat["fmt_target_set"] = implode(",", stat["target_set"]);
uint64_t pg_block = pp.second.data_block_size * (pp.second.scheme == POOL_SCHEME_REPLICATED
? 1 : (pp.second.pg_size-pp.second.parity_chunks));
stat["fmt_clean"] = format_size(stat["clean_count"].uint64_value() * pg_block);
stat["fmt_misplaced"] = format_size(stat["misplaced_count"].uint64_value() * pg_block);
stat["fmt_degraded"] = format_size(stat["degraded_count"].uint64_value() * pg_block);
stat["fmt_incomplete"] = format_size(stat["incomplete_count"].uint64_value() * pg_block);
}
pgs.push_back(stat);
}
}
}
if (parent->json_output)
{
result.data = pgs;
return;
}
json11::Json::array cols;
if (!pool_id)
{
cols.push_back(json11::Json::object{
{ "key", "pool_name" },
{ "title", "POOL" },
});
}
cols.push_back(json11::Json::object{
{ "key", "pg_num" },
{ "title", "NUM" },
});
cols.push_back(json11::Json::object{
{ "key", "fmt_target_set" },
{ "title", "OSD SET" },
});
cols.push_back(json11::Json::object{
{ "key", "fmt_primary" },
{ "title", "PRIMARY" },
});
cols.push_back(json11::Json::object{
{ "key", "fmt_clean" },
{ "title", "DATA CLEAN" },
});
cols.push_back(json11::Json::object{
{ "key", "fmt_misplaced" },
{ "title", "MISPLACED" },
});
cols.push_back(json11::Json::object{
{ "key", "fmt_misplaced" },
{ "title", "DEGRADED" },
});
cols.push_back(json11::Json::object{
{ "key", "fmt_incomplete" },
{ "title", "INCOMPLETE" },
});
cols.push_back(json11::Json::object{
{ "key", "fmt_state" },
{ "title", "STATE" },
});
result.text = print_table(pgs, cols, parent->color);
}
void loop()
{
if (state == 1)
goto resume_1;
resume_1:
load_pg_stats();
if (parent->waiting > 0)
return;
format_pgs();
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_pg_list(json11::Json cfg)
{
auto pg_lister = new pg_lister_t();
pg_lister->parent = this;
if (cfg["pool"].uint64_value())
pg_lister->pool_id = cfg["pool"].uint64_value();
else
pg_lister->pool_name = cfg["pool"].string_value();
for (auto & st: cfg["pg_state"].array_items())
pg_lister->pg_state.push_back(st.string_value());
if (cfg["pg_state"].is_string())
pg_lister->pg_state.push_back(cfg["pg_state"].string_value());
pg_lister->min_pg_num = cfg["min"].uint64_value();
pg_lister->max_pg_num = cfg["max"].uint64_value();
return [pg_lister](cli_result_t & result)
{
pg_lister->loop();
if (pg_lister->is_done())
{
result = pg_lister->result;
delete pg_lister;
return true;
}
return false;
};
}
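// Worked example of the mask semantics implemented above, with assumed bit values
// ACTIVE = 1<<0 and DEGRADED = 1<<2:
//   "active+degraded" -> mask = 0b101, matches when (cur_state & mask) == mask,
//     i.e. both bits are present;
//   "^active" -> mask = is_not | 0b001, matches when (cur_state & 0b001) != 0b001,
//     i.e. at least one of the listed bits is missing.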

View File

@@ -71,7 +71,8 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
auto & key = kv_it->first;
auto & value = kv_it->second;
if (key == "pg_size" || key == "parity_chunks" || key == "pg_minsize" ||
key == "pg_count" || key == "max_osd_combinations")
key == "pg_count" || key == "max_osd_combinations" ||
key == "bitmap_granularity" || key == "pg_stripe_size")
{
if (value.is_number() && value.uint64_value() != value.number_value() ||
value.is_string() && !value.uint64_value() && value.string_value() != "0")
@@ -80,14 +81,13 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
}
value = value.uint64_value();
}
else if (key == "block_size" || key == "bitmap_granularity" || key == "pg_stripe_size")
else if (key == "block_size")
{
uint64_t block_size = value.is_string() ? parse_size(value.string_value()) : value.uint64_value();
if (!block_size)
{
return key+" must be an integer with or without size suffix (K/M/G/T)";
}
}
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
@@ -319,7 +319,7 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
}
// immediate_commit
if (!cfg["immediate_commit"].is_null() && etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value(), UINT32_MAX) == UINT32_MAX)
if (!cfg["immediate_commit"].is_null() && !etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value()))
{
return "immediate_commit must be one of \"all\", \"small\", or \"none\", but it is "+cfg["immediate_commit"].as_string();
}

View File

@@ -19,9 +19,6 @@ struct pool_creator_t
bool force = false;
bool wait = false;
uint64_t block_size = 0, bitmap_granularity = 0;
uint32_t immediate_commit = 0;
int state = 0;
cli_result_t result;
@@ -190,23 +187,13 @@ resume_4:
if (cfg["pg_size"].uint64_value() > max_pg_size)
{
std::string pool_err = "Not enough matching OSDs to create pool."
" Change parameters or add --force to create a degraded pool."
"\n\nAt least "+std::to_string(cfg["pg_size"].uint64_value())+
" (pg_size="+std::to_string(cfg["pg_size"].uint64_value())+") OSDs should have:"
"\n- block_size "+format_size(block_size, false, true)+
"\n- bitmap_granularity "+format_size(bitmap_granularity, false, true);
if (immediate_commit == IMMEDIATE_ALL)
pool_err += "\n- immediate_commit all";
else if (immediate_commit == IMMEDIATE_SMALL)
pool_err += "\n- immediate_commit all or small";
if (cfg["osd_tags"].array_items().size())
pool_err += "\n- '"+implode("', '", cfg["osd_tags"])+(cfg["osd_tags"].array_items().size() > 1 ? "' tags" : "' tag");
if (failure_domain != "osd")
pool_err += "\n- different parent '"+failure_domain+"' nodes";
result = (cli_result_t){
.err = EINVAL,
.text =
"There are "+std::to_string(max_pg_size)+" \""+failure_domain+"\" failure domains with OSDs matching tags and"
" block_size/bitmap_granularity/immediate_commit parameters, but you want to create a"
" pool with "+cfg["pg_size"].as_string()+" OSDs from different failure domains in a PG."
" Change parameters or add --force if you want to create a degraded pool and add OSDs later."
};
state = 100;
return;
@@ -454,14 +441,14 @@ resume_8:
// List of accepted osds
std::vector<std::string> accepted_osds;
block_size = cfg["block_size"].uint64_value()
uint64_t p_block_size = cfg["block_size"].uint64_value()
? cfg["block_size"].uint64_value()
: parent->cli->st_cli.global_block_size;
uint64_t p_bitmap_granularity = cfg["bitmap_granularity"].uint64_value()
? cfg["bitmap_granularity"].uint64_value()
: parent->cli->st_cli.global_bitmap_granularity;
uint32_t p_immediate_commit = cfg["immediate_commit"].is_string()
? etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value())
: parent->cli->st_cli.global_immediate_commit;
for (size_t i = 0; i < osd_stats.size(); i++)
@@ -469,10 +456,10 @@ resume_8:
auto & os = osd_stats[i];
// Get osd number
auto osd_num = osds[i].as_string();
if (!os["data_block_size"].is_null() && os["data_block_size"] != block_size ||
!os["bitmap_granularity"].is_null() && os["bitmap_granularity"] != bitmap_granularity ||
if (!os["data_block_size"].is_null() && os["data_block_size"] != p_block_size ||
!os["bitmap_granularity"].is_null() && os["bitmap_granularity"] != p_bitmap_granularity ||
!os["immediate_commit"].is_null() &&
etcd_state_client_t::parse_immediate_commit(os["immediate_commit"].string_value(), IMMEDIATE_NONE) < immediate_commit)
etcd_state_client_t::parse_immediate_commit(os["immediate_commit"].string_value()) < p_immediate_commit)
{
accepted_nodes.erase(osd_num);
}

View File

@@ -214,10 +214,10 @@ resume_1:
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pgstats/"
parent->cli->st_cli.etcd_prefix+"/pg/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pgstats0"
parent->cli->st_cli.etcd_prefix+"/pg/stats0"
) },
} },
},
@@ -235,7 +235,7 @@ resume_1:
}
// Calculate recovery percent
std::map<pool_id_t, object_counts_t> counts;
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pgstats/",
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pg/stats/",
[&](pool_id_t pool_id, uint64_t pg_num, json11::Json value)
{
auto & cnt = counts[pool_id];

View File

@@ -25,7 +25,6 @@ struct rm_inode_t
uint64_t inode = 0;
pool_id_t pool_id = 0;
uint64_t min_offset = 0;
uint64_t max_offset = 0;
bool down_ok = false;
cli_tool_t *parent = NULL;
@@ -53,7 +52,7 @@ struct rm_inode_t
.obj_done = 0,
.synced = parent->cli->get_immediate_commit(inode),
});
if (min_offset == 0)
{
total_count += objects.size();
}
@@ -61,7 +60,7 @@ struct rm_inode_t
{
for (object_id oid: objects)
{
if (oid.stripe >= min_offset)
{
total_count++;
}
@@ -117,7 +116,7 @@ struct rm_inode_t
}
while (cur_list->in_flight < parent->iodepth && cur_list->obj_pos != cur_list->objects.end())
{
if (cur_list->obj_pos->stripe >= min_offset)
{
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
@@ -288,7 +287,6 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_rm_data(json11::Json cfg)
remover->down_ok = cfg["down_ok"].bool_value();
remover->pool_id = INODE_POOL(remover->inode);
remover->min_offset = cfg["min_offset"].uint64_value();
remover->max_offset = cfg["max_offset"].uint64_value();
return [remover](cli_result_t & result)
{
remover->loop();

View File

@@ -176,7 +176,7 @@ struct rm_osd_t
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pg/config"
parent->cli->st_cli.etcd_prefix+"/config/pgs"
) },
} },
},
@@ -229,7 +229,7 @@ struct rm_osd_t
}
if (!new_pgs.is_null())
{
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/pg/config");
auto pgs_key = base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pgs");
rm_items.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", pgs_key },
@@ -427,7 +427,7 @@ struct rm_osd_t
{ "target", "MOD" },
{ "key", history_key },
{ "result", "LESS" },
{ "mod_revision", parent->cli->st_cli.etcd_watch_revision_pg+1 },
{ "mod_revision", parent->cli->st_cli.etcd_watch_revision+1 },
});
}
}

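The last hunk above builds an etcd transaction guard: the update succeeds only while the PG history key's mod_revision stays LESS than the locally known watch revision plus one, i.e. nobody changed the key behind the CLI's back. A sketch of that compare object — the field names appear verbatim in the hunk, while the include path and the double cast are assumptions (vitastor's vendored json11 accepts 64-bit integers, a stock json11 build does not):

#include "json11/json11.hpp"
#include <cstdint>
#include <string>

// Builds the "key unmodified since our watch revision" compare for an etcd txn.
static json11::Json make_unmodified_guard(const std::string & key_b64, uint64_t watch_revision)
{
    return json11::Json::object {
        { "target", "MOD" },
        { "key", key_b64 },                             // base64-encoded etcd key
        { "result", "LESS" },
        { "mod_revision", (double)(watch_revision+1) }, // mod_revision must stay below this
    };
}
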
View File

@@ -5,7 +5,7 @@
#include "str_util.h"
static const char *help_text =
"Vitastor disk management tool " VITASTOR_VERSION "\n"
"Vitastor disk management tool " VERSION "\n"
"(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
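Only the name of the version macro differs here; in both variants the banner relies on compile-time concatenation of adjacent string literals, with the macro injected by the build system, which is why the rename must be mirrored in CMake. A hypothetical standalone rendering (the fallback #define exists only so the sketch compiles on its own):

// Normally supplied by the build, e.g. -DVITASTOR_VERSION="\"1.7.1\"".
#ifndef VITASTOR_VERSION
#define VITASTOR_VERSION "unknown"
#endif

static const char *help_text =
    "Vitastor disk management tool " VITASTOR_VERSION "\n"
    "(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n";
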

View File

@@ -383,7 +383,7 @@ int disk_tool_t::pre_exec_osd(std::string device)
int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
{
std::set<uint64_t> osd_numbers;
std::vector<uint64_t> osd_numbers;
json11::Json::array superblocks;
for (auto & device: devices)
{
@@ -391,11 +391,8 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
if (!sb.is_null())
{
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
if (osd_numbers.find(osd_num) == osd_numbers.end())
{
osd_numbers.insert(osd_num);
superblocks.push_back(sb);
}
osd_numbers.push_back(osd_num);
superblocks.push_back(sb);
}
}
if (!osd_numbers.size())

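The container choice in purge_devices() is the point of this hunk: the same OSD can be discovered through several devices (for example separate data and journal partitions), and keying by OSD number in a std::set keeps exactly one superblock per OSD, whereas a plain vector would purge the same OSD twice. A sketch of the dedup step, with a hypothetical superblock struct standing in for the json11 objects:

#include <cstdint>
#include <set>
#include <vector>

struct superblock_t { uint64_t osd_num; /* device path, params, ... */ };

// Keeps the first superblock seen for each OSD number.
static std::vector<superblock_t> dedup_by_osd(const std::vector<superblock_t> & found)
{
    std::set<uint64_t> seen;
    std::vector<superblock_t> unique;
    for (const auto & sb: found)
    {
        if (seen.insert(sb.osd_num).second) // .second is true only for new numbers
            unique.push_back(sb);
    }
    return unique;
}
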
View File

@@ -10,7 +10,7 @@ set_target_properties(vitastor_kv PROPERTIES PUBLIC_HEADER "kv/vitastor_kv.h")
target_link_libraries(vitastor_kv
vitastor_client
)
set_target_properties(vitastor_kv PROPERTIES VERSION ${VITASTOR_VERSION} SOVERSION 0)
set_target_properties(vitastor_kv PROPERTIES VERSION ${VERSION} SOVERSION 0)
# vitastor-kv
add_executable(vitastor-kv

View File

@@ -25,7 +25,7 @@ public:
std::map<std::string, std::string> cfg;
std::vector<std::string> cli_cmd;
vitastorkv_dbw_t *db = NULL;
kv_dbw_t *db = NULL;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
@@ -144,7 +144,7 @@ void kv_cli_t::run()
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
db = new vitastorkv_dbw_t(cli);
db = new kv_dbw_t(cli);
// Load image metadata
while (!cli->is_ready())
{
@@ -185,7 +185,7 @@ void kv_cli_t::run()
fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
try
{
epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
epmgr->tfd->set_fd_handler(0, EPOLLIN, [this](int fd, int events)
{
if (events & EPOLLIN)
{
@@ -193,7 +193,7 @@ void kv_cli_t::run()
}
if (events & EPOLLRDHUP)
{
epmgr->tfd->set_fd_handler(0, false, NULL);
epmgr->tfd->set_fd_handler(0, 0, NULL);
finished = true;
}
});
@@ -289,7 +289,7 @@ void kv_cli_t::next_cmd()
struct kv_cli_list_t
{
vitastorkv_dbw_t *db = NULL;
kv_dbw_t *db = NULL;
void *handle = NULL;
int format = 0;
int n = 0;

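These set_fd_handler() hunks correspond to the "Change bool wr to event mask in epoll_manager" commit in this compare: the second argument switches between a boolean write flag and an EPOLLIN/EPOLLOUT-style mask, and an empty mask with a NULL handler unregisters the descriptor. A compilable stand-in for the mask-based interface, inferred from the call sites above rather than taken from the real class:

#include <sys/epoll.h>
#include <functional>
#include <map>

// Minimal registry mimicking the mask-based interface; the real epoll_manager
// additionally translates the mask into epoll_ctl(EPOLL_CTL_ADD/MOD/DEL) calls.
struct fd_handler_registry_t
{
    std::map<int, std::function<void(int, int)>> handlers;
    void set_fd_handler(int fd, int events, std::function<void(int, int)> cb)
    {
        if (!events || !cb)
            handlers.erase(fd); // set_fd_handler(fd, 0, NULL) unregisters
        else
            handlers[fd] = cb;  // `events` selects EPOLLIN/EPOLLOUT/...
    }
};

// Usage mirroring the stdin handling in kv_cli above:
static void register_stdin(fd_handler_registry_t *tfd)
{
    tfd->set_fd_handler(0, EPOLLIN, [tfd](int fd, int events)
    {
        if (events & EPOLLIN)
        {
            // stdin is readable: consume a command
        }
        if (events & EPOLLRDHUP)
        {
            tfd->set_fd_handler(0, 0, NULL); // peer gone: unregister
        }
    });
}
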
View File

@@ -501,7 +501,7 @@ void kv_block_t::dump(int base_level)
void kv_db_t::open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb)
{
if (block_cache.size() > 0 || this->inode_id)
if (block_cache.size() > 0)
{
cb(-EINVAL);
return;
@@ -1958,38 +1958,38 @@ void kv_op_t::next_go_up()
}
}
vitastorkv_dbw_t::vitastorkv_dbw_t(cluster_client_t *cli)
kv_dbw_t::kv_dbw_t(cluster_client_t *cli)
{
db = new kv_db_t();
db->cli = cli;
}
vitastorkv_dbw_t::~vitastorkv_dbw_t()
kv_dbw_t::~kv_dbw_t()
{
delete db;
}
void vitastorkv_dbw_t::open(uint64_t inode_id, std::map<std::string, std::string> cfg, std::function<void(int)> cb)
void kv_dbw_t::open(uint64_t inode_id, std::map<std::string, std::string> cfg, std::function<void(int)> cb)
{
db->open(inode_id, cfg, cb);
}
void vitastorkv_dbw_t::set_config(std::map<std::string, std::string> cfg)
void kv_dbw_t::set_config(std::map<std::string, std::string> cfg)
{
db->set_config(cfg);
}
uint64_t vitastorkv_dbw_t::get_size()
uint64_t kv_dbw_t::get_size()
{
return db->next_free;
}
void vitastorkv_dbw_t::close(std::function<void()> cb)
void kv_dbw_t::close(std::function<void()> cb)
{
db->close(cb);
}
void vitastorkv_dbw_t::get(const std::string & key, std::function<void(int res, const std::string & value)> cb, bool cached)
void kv_dbw_t::get(const std::string & key, std::function<void(int res, const std::string & value)> cb, bool cached)
{
auto *op = new kv_op_t;
op->db = db;
@@ -2003,7 +2003,7 @@ void vitastorkv_dbw_t::get(const std::string & key, std::function<void(int res,
op->exec();
}
void vitastorkv_dbw_t::set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
void kv_dbw_t::set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
std::function<bool(int res, const std::string & value)> cas_compare)
{
auto *op = new kv_op_t;
@@ -2023,7 +2023,7 @@ void vitastorkv_dbw_t::set(const std::string & key, const std::string & value, s
op->exec();
}
void vitastorkv_dbw_t::del(const std::string & key, std::function<void(int res)> cb,
void kv_dbw_t::del(const std::string & key, std::function<void(int res)> cb,
std::function<bool(int res, const std::string & value)> cas_compare)
{
auto *op = new kv_op_t;
@@ -2042,7 +2042,7 @@ void vitastorkv_dbw_t::del(const std::string & key, std::function<void(int res)>
op->exec();
}
void* vitastorkv_dbw_t::list_start(const std::string & start)
void* kv_dbw_t::list_start(const std::string & start)
{
if (!db->inode_id || db->closing)
return NULL;
@@ -2055,7 +2055,7 @@ void* vitastorkv_dbw_t::list_start(const std::string & start)
return op;
}
void vitastorkv_dbw_t::list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb)
void kv_dbw_t::list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb)
{
kv_op_t *op = (kv_op_t*)handle;
if (cb)
@@ -2068,7 +2068,7 @@ void vitastorkv_dbw_t::list_next(void *handle, std::function<void(int res, const
op->next();
}
void vitastorkv_dbw_t::list_close(void *handle)
void kv_dbw_t::list_close(void *handle)
{
kv_op_t *op = (kv_op_t*)handle;
delete op;

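Both names in this file refer to the same public wrapper around the KV database; only the class name differs between the two sides (vitastorkv_dbw_t versus kv_dbw_t). A hedged usage sketch of that wrapper, built only from the method signatures visible in this diff — the header name follows the PUBLIC_HEADER from the CMake hunk earlier, cluster_client_t is assumed to be declared by it, and error handling is elided:

#include "vitastor_kv.h" // installed as kv/vitastor_kv.h per the CMake hunk above
#include <map>
#include <string>

void kv_demo(cluster_client_t *cli, uint64_t inode_id)
{
    auto *db = new vitastorkv_dbw_t(cli);
    db->open(inode_id, std::map<std::string, std::string>{}, [db](int res)
    {
        if (res < 0)
            return;
        db->set("greeting", "hello", [db](int res)
        {
            db->get("greeting", [db](int res, const std::string & value)
            {
                // value == "hello" when res == 0
                db->close([db]() { delete db; });
            }, false); // the bool is the `cached` flag from the signature above
        }, nullptr);   // nullptr = no CAS comparison callback
    });
}
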
Some files were not shown because too many files have changed in this diff.