Compare commits


41 Commits

Author SHA1 Message Date
Vitaliy Filippov c558db864b trace writes 2025-03-26 14:01:54 +03:00
Vitaliy Filippov 640926346a WIP Support separate OSD cluster network 2025-03-26 01:52:08 +03:00
Vitaliy Filippov 8d6ab2300f WIP Add force_rdma parameter 2025-03-23 16:59:10 +03:00
Vitaliy Filippov ccc32b9e68 Use TCP on RDMA connection failure 2025-03-23 12:04:23 +03:00
Vitaliy Filippov ebaf3fee79 Add an assertion to prevent sending message to TCP channel when switched to RDMA 2025-03-23 12:04:09 +03:00
Vitaliy Filippov 196d28e987 Fix typo 2025-03-23 12:00:20 +03:00
Vitaliy Filippov 8f243b2328 Fix qemu buster build and bullseye version 2025-03-23 02:46:52 +03:00
Vitaliy Filippov 7a835fcd8f Add allow_net_split parameter 2025-03-23 02:12:32 +03:00
Vitaliy Filippov 8b0389b4e8 Log RDMA ibv_modify_qp() errors 2025-03-22 15:58:13 +03:00
Vitaliy Filippov f544c350ba %l* -> %j* 2025-03-22 15:32:07 +03:00
Vitaliy Filippov 4eafb55b5c Add a patch for QEMU 9.2, fix debian bookworm QEMU build 2025-03-22 15:30:52 +03:00
Vitaliy Filippov 5030396f71 Clear QEMU eventfd handler on vitastor block driver destruction 2025-03-21 20:47:17 +03:00
Vitaliy Filippov be22c363ca Do not skip client_retry_interval on reconnecting OSDs to prevent OSD spam 2025-03-20 00:12:38 +03:00
Vitaliy Filippov 0f80c87b43 Add a minimum interval for etcd_state_client to reload state
(To prevent excessive load on etcd during outages)
2025-03-19 02:36:09 +03:00
Vitaliy Filippov e0953fd502 Wait for all "up" OSDs to be connected before starting PG 2025-03-19 02:36:09 +03:00
Vitaliy Filippov 6e0ae47938 Add Proxmox QEMU 9.2 patch 2025-03-19 02:36:02 +03:00
Vitaliy Filippov b8f19e85ad Fix pg state formatting in ls-pgs 2025-03-17 01:37:58 +03:00
Vitaliy Filippov b7636e595f Update version in docker docs 2025-03-16 16:53:57 +03:00
Vitaliy Filippov 48c026bfa0 Release 2.0.0
No breaking changes; it's 2.0.0 just because it includes S3 and because
there are already too many 1.x releases :).

New features:

- S3 is finally available: https://vitastor.io/docs/installation/s3.html
- node.js addon is now packaged as a Debian package
- Support listing PGs by OSDs in `vitastor-cli ls-pgs`
- Implement offline TRIM support: [vitastor-disk trim](https://vitastor.io/docs/usage/disk.html#trim),
  [discard_on_start](https://vitastor.io/docs/config/osd.html#discard_on_start)
- Change used_for_fs pool option to used_for_app

Bug fixes:

- Fix several bugs in the node.js addon (a memory leak, an incorrectly triggered event loop)
- Fix a client crash (vitastor-cli rm) during deletion when writeback is enabled
- Fix PG object count statistics on deletion of non-existing objects
- Fix vitastor-nbd crash when mapping by ID instead of inode name
- Fix a client memory leak with enabled immediate_commit and write-back cache
- Add seccomp=unconfined for vitastor docker OSDs to not break io_uring
- Add udev and systemd to vitastor docker image
- Fix upgrading from pre-0.7.1 (very old) systemd units O_o
- Fix total object count calculation in rm_data
2025-03-16 14:34:31 +03:00
Vitaliy Filippov a73b2a26b6 Fix blockstore initialization after moving clean_dyn_size calc to calc_lengths 2025-03-16 13:44:02 +03:00
Vitaliy Filippov f3192b610d Fix vitastor-disk in Docker installations 2025-03-16 13:44:01 +03:00
Vitaliy Filippov a950889976 Add missing docs for discard_on_start 2025-03-16 12:29:22 +03:00
Vitaliy Filippov ef5194d93c Add S3 installation docs 2025-03-16 01:17:09 +03:00
Vitaliy Filippov f904576ab1 Fix total calculation in rm_data 2025-03-15 17:01:10 +03:00
Vitaliy Filippov 4f9b1f2f62 Support listing PGs by OSDs 2025-03-15 16:42:57 +03:00
Vitaliy Filippov 1d94afbd51 Implement offline TRIM support 2025-03-14 01:37:16 +03:00
Vitaliy Filippov 3634f005f1 Fix upgrading from pre-0.7.1 systemd units O_o 2025-03-14 01:37:16 +03:00
Vitaliy Filippov 263a3b5ad6 Rename allocator to allocator_t 2025-03-13 00:53:34 +03:00
Vitaliy Filippov b760951aa7 Add seccomp=unconfined for vitastor docker OSDs to not break io_uring 2025-03-11 00:42:10 +03:00
Vitaliy Filippov c8321b8ed1 Add udev and systemd to vitastor docker image 2025-03-11 00:40:39 +03:00
Vitaliy Filippov 21066a095b Fix a memory leak with enabled immediate_commit and write-back cache
Remove dirty buffers after writing when immediate_commit is on instead
of saving them for repeating later
2025-03-11 00:40:18 +03:00
Vitaliy Filippov a96900b696 Explicitly destroy Nan::Persistents, otherwise it leaks memory 2025-03-09 16:45:10 +03:00
Vitaliy Filippov 8a6e461322 Fix license (VNPL 1.1, not 2.0) 2025-03-08 17:17:23 +03:00
Vitaliy Filippov 0b6a0463a4 Save a reference to the buffer during write 2025-03-08 16:00:26 +03:00
Vitaliy Filippov 35d4047f46 Fix vitastor-nbd crash when mapping by ID instead of inode name 2025-03-08 15:52:57 +03:00
Vitaliy Filippov 819f1125ae Support used_for_app instead of used_for_fs 2025-03-07 01:03:43 +03:00
Vitaliy Filippov 108df7329f Fix PG object count statistics on deletion of non-existing objects 2025-03-04 00:40:56 +03:00
Vitaliy Filippov d32edf6cdf Fix deletion writeback 2025-03-04 00:40:35 +03:00
Vitaliy Filippov dca436d7e6 Trigger event loop automatically in libvitastor_c 2025-03-03 00:57:09 +03:00
Vitaliy Filippov 8129a0b4e3 Loop once after registering eventfd to prevent skipping previous events 2025-03-03 00:57:00 +03:00
Vitaliy Filippov 704c87d512 Trigger initial epoll when adding an FD 2025-03-03 00:56:17 +03:00
112 changed files with 1898 additions and 394 deletions

View File

@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VITASTOR_VERSION "1.11.0")
set(VITASTOR_VERSION "2.0.0")
add_subdirectory(src)

View File

@ -6,7 +6,7 @@
Make clustered block storage fast again!
Vitastor is a distributed block and file SDS (software-defined storage), a direct analogue of Ceph RBD and CephFS,
Vitastor is a distributed block, file and object SDS (software-defined storage), a direct analogue of Ceph RBD, CephFS and RGW,
as well as of the internal storage systems of popular cloud providers. However, unlike them, Vitastor
is fast and simple at the same time. It's just still small so far :-).
@ -46,6 +46,7 @@ Vitastor supports a QEMU driver, the NBD and
- [OpenNebula](docs/installation/opennebula.ru.md)
- [OpenStack](docs/installation/openstack.ru.md)
- [Kubernetes CSI](docs/installation/kubernetes.ru.md)
- [S3](docs/installation/s3.ru.md)
- [Building from Source](docs/installation/source.ru.md)
- Configuration
- [Overview](docs/config.ru.md)

View File

@ -6,7 +6,7 @@
Make Clustered Block Storage Fast Again.
Vitastor is a distributed block and file SDS, direct replacement of Ceph RBD and CephFS,
Vitastor is a distributed block, file and object SDS, direct replacement of Ceph RBD, CephFS and RGW,
and also internal SDS's of public clouds. However, in contrast to them, Vitastor is fast
and simple at the same time. The only thing is it's slightly young :-).
@ -46,6 +46,7 @@ Read more details in the documentation. You can start from here: [Quick Start](d
- [OpenNebula](docs/installation/opennebula.en.md)
- [OpenStack](docs/installation/openstack.en.md)
- [Kubernetes CSI](docs/installation/kubernetes.en.md)
- [S3](docs/installation/s3.en.md)
- [Building from Source](docs/installation/source.en.md)
- Configuration
- [Overview](docs/config.en.md)

View File

@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v1.11.0
VITASTOR_VERSION ?= v2.0.0
all: build push

View File

@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.11.0
image: vitalif/vitastor-csi:v2.0.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.11.0
image: vitalif/vitastor-csi:v2.0.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.11.0"
vitastorCSIDriverVersion = "2.0.0"
)
// Config struct fills the parameters of request or user input

debian/changelog (vendored, 2 lines changed)
View File

@ -1,4 +1,4 @@
vitastor (1.11.0-1) unstable; urgency=medium
vitastor (2.0.0-1) unstable; urgency=medium
* Bugfixes

View File

@ -10,10 +10,14 @@ ARG REL=
WORKDIR /root
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
if [ "$REL" = "buster" ]; then \
echo "deb http://archive.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
else \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
fi; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
echo "Pin: release n=$REL-backports" >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
@ -56,7 +60,7 @@ RUN set -e; \
quilt add block/vitastor.c; \
cp /root/qemu_driver.c block/vitastor.c; \
quilt refresh; \
V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(\+deb\d+u\d+)?(~bpo[\d\+]*)?\).*$/$1/')+vitastor5; \
if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@ -3,7 +3,7 @@
FROM debian:bookworm
ADD etc/apt /etc/apt/
RUN apt-get update && apt-get -y install vitastor qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
RUN apt-get update && apt-get -y install vitastor udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
ADD sleep.sh /usr/bin/
ADD install.sh /usr/bin/
ADD scripts /opt/scripts/

View File

@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v1.11.0
VITASTOR_VERSION ?= v2.0.0
all: build push

View File

@ -7,8 +7,8 @@ PartOf=vitastor.target
[Service]
Restart=always
EnvironmentFile=/etc/vitastor/docker.conf
ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev \
--privileged --log-driver none --network host --name vitastor vitastor:$VITASTOR_VERSION \
ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev -v /run:/run \
--security-opt seccomp=unconfined --privileged --pid=host --log-driver none --network host --name vitastor vitastor:$VITASTOR_VERSION \
sleep.sh'
ExecStartPost=udevadm trigger
ExecStop=docker stop vitastor

View File

@ -12,7 +12,8 @@ EnvironmentFile=/etc/vitastor/docker.conf
SyslogIdentifier=vitastor-osd%i
ExecStart=bash -c 'docker run --rm -i -v /etc/vitastor:/etc/vitastor -v /dev:/dev \
$(for i in $(ls /dev/vitastor/osd%i-*); do echo --device $i:$i; done) \
--log-driver none --network host --ulimit nofile=1048576 --ulimit memlock=-1 $CONTAINER_OPTIONS --name vitastor-osd%i \
--log-driver none --network host --ulimit nofile=1048576 --ulimit memlock=-1 \
--security-opt seccomp=unconfined $CONTAINER_OPTIONS --name vitastor-osd%i \
vitastor:$VITASTOR_VERSION vitastor-disk exec-osd /dev/vitastor/osd%i-data'
ExecStartPre=+docker exec vitastor vitastor-disk pre-exec /dev/vitastor/osd%i-data
ExecStop=docker stop vitastor-etcd%i

View File

@ -4,7 +4,7 @@
#
# Desired Vitastor version
VITASTOR_VERSION=1.11.0
VITASTOR_VERSION=v2.0.0
# Additional arguments for all containers
# For example, you may want to specify a custom logging driver here

View File

@ -13,6 +13,7 @@ affect their interaction with the cluster.
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_wait_up_timeout](#client_wait_up_timeout)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -70,6 +71,19 @@ and clients are not blocked and just get EIO error code instead.
Retry writes on out of space errors to wait until some space is freed on
OSDs.
## client_wait_up_timeout
- Type: seconds
- Default: 16
- Can be changed online: yes
Wait for this number of seconds until PGs are up when doing operations
which require all PGs to be up. Currently only used by object listings
in delete and merge-based commands ([vitastor-cli rm](../usage/cli.en.md#rm), merge and so on).
The default value is calculated as `1 + OSD lease timeout`, which is
`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
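For instance, with the assumed defaults `etcd_report_interval=5`, `max_etcd_attempts=5`
and `etcd_quick_timeout=1000` ms (all three are this sketch's assumptions, not values
confirmed by this page), the formula works out to:

```
1 + 5 + 5*2*1 = 16 seconds
```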
## client_max_dirty_bytes
- Type: integer

View File

@ -13,6 +13,7 @@
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_wait_up_timeout](#client_wait_up_timeout)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -72,6 +73,19 @@ RDMA and want to raise peak performance
Retry write requests that fail with out-of-space errors, i.e. wait
until space is freed on OSDs.
## client_wait_up_timeout
- Type: seconds
- Default: 16
- Can be changed online: yes
Wait for this number of seconds until PGs are up when performing operations
that require all PGs to be up. Currently only used by object listings in
commands based on deletion and merging ([vitastor-cli rm](../usage/cli.ru.md#rm), merge and the like).
The default value is calculated as `1 + OSD lease time`, which equals
`1 + etcd_report_interval + max_etcd_attempts*2*etcd_quick_timeout`.
## client_max_dirty_bytes
- Type: integer

View File

@ -30,6 +30,7 @@ between clients, OSDs and etcd.
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
- [etcd_min_reload_interval](#etcd_min_reload_interval)
## tcp_header_buffer_size
@ -261,3 +262,13 @@ etcd_report_interval to guarantee that keepalive actually works.
etcd websocket ping interval required to keep the connection alive and
detect disconnections quickly.
## etcd_min_reload_interval
- Type: milliseconds
- Default: 1000
- Can be changed online: yes
Minimum interval for full etcd state reload. Introduced to prevent
excessive load on etcd during outages when etcd can't keep up with event
streams and cancels them.
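Since the parameter can be changed online, a minimal sketch of adjusting it at runtime,
assuming the default `/vitastor` etcd key prefix and a hypothetical etcd endpoint:

```
etcdctl --endpoints=http://10.0.0.1:2379 put /vitastor/config/global \
  '{"etcd_min_reload_interval": 2000}'
```

Note that `/vitastor/config/global` holds the whole global configuration as one JSON
object, so in practice you would merge this field into the existing value rather than
overwrite it.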

View File

@ -30,6 +30,7 @@
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
- [etcd_min_reload_interval](#etcd_min_reload_interval)
## tcp_header_buffer_size
@ -271,3 +272,13 @@ etcd_report_interval so that keepalive is guaranteed to work
- Can be changed online: yes
The liveness check interval for websocket connections to etcd.
## etcd_min_reload_interval
- Type: milliseconds
- Default: 1000
- Can be changed online: yes
Minimum interval for a full reload of the state from etcd. Added to prevent
excessive load on etcd during outages, when etcd can't keep up with event
streams and cancels them.

View File

@ -7,8 +7,8 @@
# Runtime OSD Parameters
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
initialization and can be changed - in /etc/vitastor/vitastor.conf or [vitastor-disk update-sb](../usage/disk.en.md#update-sb)
with an OSD restart or, for some of them, even without restarting by updating configuration in etcd.
- [osd_iothread_count](#osd_iothread_count)
- [etcd_report_interval](#etcd_report_interval)
@ -61,6 +61,9 @@ them, even without restarting by updating configuration in etcd.
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
- [discard_on_start](#discard_on_start)
- [min_discard_size](#min_discard_size)
- [allow_net_split](#allow_net_split)
## osd_iothread_count
@ -316,7 +319,7 @@ for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
decrease write performance for fast disks because page cache is an overhead
itself.
Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
(which requires disable_data_fsync) with drives having write-back cache
which can't be turned off, for example, Intel Optane. Also note that *some*
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
@ -629,3 +632,30 @@ are changed to 0.
Maximum possible value for auto-tuned recovery_sleep_us. Higher values
are treated as outliers and ignored in aggregation.
## discard_on_start
- Type: boolean
Discard (SSD TRIM) unused data device blocks on every OSD startup.
## min_discard_size
- Type: integer
- Default: 1048576
Minimum consecutive block size to TRIM it.
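A hedged sketch of enabling TRIM on an already initialized OSD, assuming that
[vitastor-disk update-sb](../usage/disk.en.md#update-sb) accepts superblock parameters
in the same `--<name> <value>` form as `prepare`, and that the data partition is named
`/dev/vitastor/osd1-data` (both assumptions):

```
vitastor-disk update-sb /dev/vitastor/osd1-data --discard_on_start 1 --min_discard_size 1048576
```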
## allow_net_split
- Type: boolean
- Default: false
Allow "safe" cases of network splits/partitions - allow to start PGs without
connections to some OSDs currently registered as alive in etcd, if the number
of actually connected PG OSDs is at least pg_minsize. That is, allow some OSDs to lose
connectivity with some other OSDs as long as it doesn't break pg_minsize guarantees.
The downside is that it increases the probability of writing data into just pg_minsize
OSDs during failover which can lead to PGs becoming incomplete after additional outages.
The old behaviour in versions up to 2.0.0 was equal to enabled allow_net_split.
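A minimal sketch of enabling it cluster-wide in `/etc/vitastor/vitastor.conf` (the
`etcd_address` value is a placeholder; weigh the reduced failover guarantees described
above first):

```
{
  "etcd_address": "10.0.0.1:2379",
  "allow_net_split": true
}
```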

View File

@ -8,8 +8,8 @@
These parameters are used only by OSDs but, unlike disk parameters,
are not fixed at the moment of OSD disk initialization and can be changed at any
moment with an OSD restart, and some of them even without a restart, via
a configuration change in etcd.
moment - in /etc/vitastor/vitastor.conf or [vitastor-disk update-sb](../usage/disk.ru.md#update-sb) with an OSD restart,
and some of them even without a restart, via a configuration change in etcd.
- [osd_iothread_count](#osd_iothread_count)
- [etcd_report_interval](#etcd_report_interval)
@ -62,6 +62,9 @@
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)
- [discard_on_start](#discard_on_start)
- [min_discard_size](#min_discard_size)
- [allow_net_split](#allow_net_split)
## osd_iothread_count
@ -660,3 +663,31 @@ EC (erasure codes) with more than 1 disk
Maximum possible value for the auto-tuned recovery_sleep_us.
Larger values are treated as random outliers and are ignored
in averaging.
## discard_on_start
- Type: boolean (yes/no)
Discard (SSD TRIM) unused data device blocks on every OSD startup.
## min_discard_size
- Type: integer
- Default: 1048576
Minimum consecutive data block size to free it via TRIM.
## allow_net_split
- Type: boolean (yes/no)
- Default: false
Allow "safe" cases of network splits - allow activating PGs without
connections to some OSDs marked as alive in etcd, if the total number of live
OSDs in the PG is at least pg_minsize. That is, allow some OSDs to lose
connections with some other OSDs as long as it doesn't break pg_minsize guarantees.
The downside of this permission is that it increases the probability of writing data
to exactly pg_minsize OSDs during failover, which may later lead to PGs becoming
incomplete if some more OSDs fail.
The old behaviour in versions up to 2.0.0 was identical to enabled allow_net_split.

View File

@ -43,7 +43,7 @@ Parameters:
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)
- [used_for_app](#used_for_app)
Examples:
@ -189,6 +189,9 @@ So, pg_minsize regulates the number of failures that a pool can tolerate
without temporary downtime for [osd_out_time](monitor.en.md#osd_out_time),
but at a cost of slightly reduced storage reliability.
See also [allow_net_split](osd.en.md#allow_net_split) and
[PG state descriptions](../usage/admin.en.md#pg-states).
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
read-only instead of deactivating them.
@ -377,24 +380,37 @@ of the OSDs containing a data chunk for a PG.
Automatic scrubbing interval for this pool. Overrides
[global scrub_interval setting](osd.en.md#scrub_interval).
## used_for_fs
## used_for_app
- Type: string
If non-empty, the pool is marked as used for VitastorFS with metadata stored
in block image (regular Vitastor volume) named as the value of this pool parameter.
If non-empty, the pool is marked as used for a separate application, for example,
VitastorFS or S3, which allocates Vitastor volume IDs by itself and does not use
image/inode metadata in etcd.
When a pool is marked as used for VitastorFS, regular block volume creation in it
When a pool is marked as used for such an app, regular block volume creation in it
is disabled (vitastor-cli refuses to create images without --force) to protect
the user from block volume and FS file ID collisions and data loss.
the user from block volume and FS/S3 volume ID collisions and data loss.
[vitastor-nfs](../usage/nfs.ru.md), in its turn, refuses to use pools not marked
Also such pools do not calculate per-inode space usage statistics in etcd, because
use by an external application implies that the pool may contain a very large
number of volumes and their statistics may take too much space in etcd.
Setting used_for_app to `fs:<name>` tells Vitastor that the pool is used for VitastorFS
with VitastorKV metadata base stored in a block image (regular Vitastor volume) named
`<name>`.
[vitastor-nfs](../usage/nfs.en.md), in its turn, refuses to use pools not marked
for the corresponding FS when starting. This also implies that you can use one
pool only for one VitastorFS.
The second thing that is disabled for VitastorFS pools is reporting per-inode space
usage statistics in etcd because a FS pool may store a very large number of files
and statistics for them all would take a lot of space in etcd.
If you plan to use the pool for S3, set its used_for_app to `s3:<name>`. `<name>` may
be basically anything you want (for example, `s3:standard`) - it's not validated
by Vitastor S3 components in any way.
All other values except prefixed with `fs:` or `s3:` may be used freely and don't
mean anything special for Vitastor core components. For now, you can use them as
you wish.
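For example, a short sketch of marking an existing pool for VitastorFS, with the pool
name `data-pool` and metadata image name `testfs` as placeholders (the same command
form appears in the VitastorFS docs):

```
vitastor-cli modify-pool --used-for-app fs:testfs data-pool
```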
# Examples

View File

@ -42,7 +42,7 @@
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)
- [used_for_app](#used_for_app)
Examples:
@ -256,7 +256,7 @@ PGs in Vitastor are ephemeral, which means you can change
## raw_placement
- Type: string
- Type: string
Low-level PG generation rules in the form of a DSL (domain-specific language).
Use it only if you really know why you need it :)
@ -383,26 +383,42 @@ OSDs with "all".
Scrub interval, i.e. the interval of automatic background data verification for this pool.
Overrides the [global scrub_interval setting](osd.ru.md#scrub_interval).
## used_for_fs
## used_for_app
- Type: string
- Type: string
If non-empty, the pool is marked as used for the VitastorFS file system with
metadata stored in a Vitastor block image whose name equals the value of
this parameter.
If non-empty, the pool is marked as used for a separate application, for example
VitastorFS or S3, which allocates image IDs in the pool by itself and does not use
image/inode metadata in etcd.
When a pool is marked as used for VitastorFS, creation of regular block
images in it is disabled (vitastor-cli refuses to create images without --force)
to protect the user from collisions between file IDs and block image IDs and,
thus, from data loss.
When a pool is marked as used for such an application, creation of regular block
images in it is prohibited (vitastor-cli refuses to create images without --force)
to protect the user from collisions between block image and FS/S3 volume IDs and,
thus, from data loss.
Also, per-inode statistics reporting to etcd is disabled for such pools,
because use for an external application implies that the pool may contain
very many volumes and their statistics could take too much space in etcd.
Setting used_for_app to `fs:<name>` tells Vitastor that the pool is used for
VitastorFS with a VitastorKV metadata base stored in a block image named
`<name>`.
[vitastor-nfs](../usage/nfs.ru.md), in its turn, refuses at startup
to use pools not allocated for the FS. This also means that one
pool can be used for only one VitastorFS.
to use pools not marked as used for the FS. This also
means that one pool can be used for only one VitastorFS.
Also, per-inode statistics reporting to etcd is disabled for FS pools,
because an FS pool may contain very many files and statistics for all
of them would take a lot of space in etcd.
If you plan to use a pool for S3 data, set its used_for_app
to `s3:<name>`, where `<name>` is any name of your choosing
(for example, `s3:standard`) - the exact content of `<name>` is not yet validated
by Vitastor S3 components in any way.
See also [allow_net_split](osd.ru.md#allow_net_split) and
[the PG state documentation](../usage/admin.ru.md#состояния-pg).
All other used_for_app values, except those starting with `fs:` or `s3:`, don't
mean anything special for Vitastor core components. So, for now, you
can use them freely in any way you like.
# Examples

View File

@ -306,3 +306,15 @@
detect disconnections quickly.
info_ru: |
The liveness check interval for websocket connections to etcd.
- name: etcd_min_reload_interval
type: ms
default: 1000
online: true
info: |
Minimum interval for full etcd state reload. Introduced to prevent
excessive load on etcd during outages when etcd can't keep up with event
streams and cancels them.
info_ru: |
Minimum interval for a full reload of the state from etcd. Added to
prevent excessive load on etcd during outages, when etcd can't keep up
with event streams and cancels them.

View File

@ -1,5 +1,5 @@
# Runtime OSD Parameters
These parameters only apply to OSDs, are not fixed at the moment of OSD drive
initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
initialization and can be changed - in /etc/vitastor/vitastor.conf or [vitastor-disk update-sb](../usage/disk.en.md#update-sb)
with an OSD restart or, for some of them, even without restarting by updating configuration in etcd.

View File

@ -2,5 +2,5 @@
These parameters are used only by OSDs but, unlike disk parameters,
are not fixed at the moment of OSD disk initialization and can be changed at any
moment with an OSD restart, and some of them even without a restart, via
a configuration change in etcd.
moment - in /etc/vitastor/vitastor.conf or [vitastor-disk update-sb](../usage/disk.ru.md#update-sb) with an OSD restart,
and some of them even without a restart, via a configuration change in etcd.

View File

@ -315,7 +315,7 @@
decrease write performance for fast disks because page cache is an overhead
itself.
Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
Choose "directsync" to use [immediate_commit](layout-cluster.en.md#immediate_commit)
(which requires disable_data_fsync) with drives having write-back cache
which can't be turned off, for example, Intel Optane. Also note that *some*
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
@ -765,3 +765,34 @@
Maximum possible value for the auto-tuned recovery_sleep_us.
Larger values are treated as random outliers and are ignored
in averaging.
- name: discard_on_start
type: bool
info: Discard (SSD TRIM) unused data device blocks on every OSD startup.
info_ru: Discard (SSD TRIM) unused data device blocks on every OSD startup.
- name: min_discard_size
type: int
default: 1048576
info: Minimum consecutive block size to TRIM it.
info_ru: Minimum consecutive data block size to free it via TRIM.
- name: allow_net_split
type: bool
default: false
info: |
Allow "safe" cases of network splits/partitions - allow to start PGs without
connections to some OSDs currently registered as alive in etcd, if the number
of actually connected PG OSDs is at least pg_minsize. That is, allow some OSDs to lose
connectivity with some other OSDs as long as it doesn't break pg_minsize guarantees.
The downside is that it increases the probability of writing data into just pg_minsize
OSDs during failover which can lead to PGs becoming incomplete after additional outages.
The old behaviour in versions up to 2.0.0 was equal to enabled allow_net_split.
info_ru: |
Allow "safe" cases of network splits - allow activating PGs without
connections to some OSDs marked as alive in etcd, if the total number of live
OSDs in the PG is at least pg_minsize. That is, allow some OSDs to lose
connections with some other OSDs as long as it doesn't break pg_minsize guarantees.
The downside of this permission is that it increases the probability of writing data
to exactly pg_minsize OSDs during failover, which may later lead to PGs becoming
incomplete if some more OSDs fail.
The old behaviour in versions up to 2.0.0 was identical to enabled allow_net_split.

View File

@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
The instruction is very simple.
1. Download a Docker image of the desired version: \
`docker pull vitastor:1.10.2`
`docker pull vitastor:2.0.0`
2. Install scripts to the host system: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:1.10.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:2.0.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`

View File

@ -25,9 +25,9 @@ Vitastor can be installed in Docker/Podman. In this case etcd,
The installation instructions are as simple as possible.
1. Download a Docker image of the desired version: \
`docker pull vitastor:1.10.2`
`docker pull vitastor:2.0.0`
2. Install the scripts into the host system with: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:1.10.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:2.0.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`

docs/installation/s3.en.md (new file, 191 lines)
View File

@ -0,0 +1,191 @@
[Documentation](../../README.md#documentation) → Installation → S3 for Vitastor
-----
[Читать на русском](s3.ru.md)
# S3 for Vitastor
The moment has come - Vitastor S3 implementation based on Zenko CloudServer is released.
## Highlights
- Zenko CloudServer is implemented in node.js.
- Object metadata is stored in MongoDB.
- Modified Zenko CloudServer version is used for Vitastor. It is slightly different from
the original, has an optimised build and unneeded dependencies are stripped off.
- Object data is stored in Vitastor block volumes, but the volume metadata is stored in
the same MongoDB, not in Vitastor etcd.
- Objects are written to volumes sequentially one after another. The space is allocated
with rounding to the sector size (4 KB), so each object takes at least 4 KB.
- An important property of such storage scheme is that small objects aren't chunked into
parts in Vitastor EC N+K pools and thus don't require reads from all N disks when
downloading.
- Deleted objects are marked as deleted, but the space is only actually freed during
asynchronously executed "defragmentation" process. Defragmentation runs automatically
in the background when a volume reaches the configured amount of "garbage" (20% by default).
Defragmentation copies actual objects to new volume(s) and then removes the old volume.
Defragmentation can be configured in locationConfig.json.
## Plans for future development
- User account storage in the DB instead of a static file. Original Zenko uses
a separate closed-source "Scality Vault" service for it, that's why we use
a static file for now.
- More detailed documentation.
- Support for other (and faster) key-value DBMS for object metadata storage.
- Other performance optimisations, for example, related to the used hash function -
MD5 used for Amazon compatibility purposes is relatively slow.
- Object Lifecycle support. There is a Lifecycle implementation for Zenko called
[Backbeat](https://github.com/scality/backbeat) but it's not adapted for Vitastor yet.
- Quota support. Original Zenko uses a separate "SCUBA" service for quotas, but
it's also proprietary and not available publicly.
## Installation
In a few words:
- Install MongoDB, create a user for S3 metadata DB.
- Create a Vitastor pool for S3 data.
- Download and setup the Docker container `vitalif/vitastor-zenko`.
### Setup MongoDB
You can setup MongoDB yourself, following the [MongoDB manual](https://www.mongodb.com/docs/manual/installation/).
Or you can follow the instructions below - it describes a simple example of MongoDB setup
in Docker (through docker-compose) with 3 replicas.
1. On each host, create a file `docker-compose.yml` with the content listed below.
Replace `<YOUR_PASSWORD>` with your future mongodb administrator password, and optionally
replace `0.0.0.0` with `localhost,<server_IP>`. It's recommended to either use a private IP
or [setup TLS](https://www.mongodb.com/docs/manual/tutorial/configure-ssl/) afterwards.
```
version: '3.1'
services:
mongo:
container_name: mongo
image: mongo:7-jammy
restart: always
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: <YOUR_PASSWORD>
network_mode: host
volumes:
- ./keyfile:/opt/keyfile
- ./mongo-data/db:/data/db
- ./mongo-data/configdb:/data/configdb
entrypoint: /bin/bash -c
command: [ "chown mongodb /opt/keyfile && chmod 600 /opt/keyfile && . /usr/local/bin/docker-entrypoint.sh mongod --replSet rs0 --keyFile /opt/keyfile --bind_ip 0.0.0.0" ]
```
2. Generate a shared cluster key using `openssl rand -base64 756 > ./keyfile` and copy
that `keyfile` to all hosts.
3. Start MongoDB on all hosts with `docker compose up -d mongo`.
4. Enter Mongo Shell with `docker exec -it mongo mongosh -u root -p <YOUR_PASSWORD> localhost/admin`
and execute the following command (replace IP addresses `10.10.10.{1,2,3}` with your host IPs):
`rs.initiate({ _id: 'rs0', members: [
{ _id: 1, host: '10.10.10.1:27017' },
{ _id: 2, host: '10.10.10.2:27017' },
{ _id: 3, host: '10.10.10.3:27017' }
] })`
5. Stay in Mongo Shell and create a user for the future S3 database:
`db.createUser({ user: 's3', pwd: '<YOUR_S3_PASSWORD>', roles: [
{ role: 'readWrite', db: 's3' },
{ role: 'dbAdmin', db: 's3' },
{ role: 'readWrite', db: 'vitastor' },
{ role: 'dbAdmin', db: 'vitastor' }
] })`
### Setup Vitastor
Create a pool in Vitastor for S3 object data, for example:
`vitastor-cli create-pool --ec 2+1 -n 512 s3-data --used_for_app s3:standard`
The `--used_for_app` option works as fool-proofing and prevents you from
accidentally creating a regular block volume in the S3 pool and overwriting some S3 data.
Also it hides inode space statistics from Vitastor etcd.
Retrieve the ID of your pool with `vitastor-cli ls-pools s3-data --detail`.
### Setup Vitastor S3
1. Add the following lines to `docker-compose.yml` (instead of `network_mode: host`,
you can use `ports: [ "8000:8000", "8002:8002" ]`):
```
zenko:
container_name: zenko
image: vitalif/vitastor-zenko
restart: always
security_opt:
- seccomp:unconfined
ulimits:
memlock: -1
network_mode: host
volumes:
- /etc/vitastor:/etc/vitastor
- /etc/vitastor/s3:/conf
```
2. Download Docker image: `docker pull vitalif/vitastor-zenko`
3. Extract configuration file examples from the Docker image:
```
docker run --rm -it -v /etc/vitastor:/etc/vitastor -v /etc/vitastor/s3:/conf vitalif/vitastor-zenko configure.sh
```
4. Edit configuration files in `/etc/vitastor/s3/`:
- `config.json` - common settings.
- `authdata.json` - user accounts and access keys.
- `locationConfig.json` - S3 storage class list with placement settings.
Note: it actually contains storage classes (like STANDARD, COLD, etc)
instead of "locations" (zones like us-east-1) as in the original Zenko CloudServer.
- Put your MongoDB connection data into `config.json` and `locationConfig.json`.
- Put your Vitastor pool ID into `locationConfig.json`.
- For now, the complete list of Vitastor backend settings is only available [in the code](https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts#L94).
### Start Zenko
Start the S3 server with:
```
docker run --restart always --security-opt seccomp:unconfined --ulimit memlock=-1 --network=host \
-v /etc/vitastor:/etc/vitastor -v /etc/vitastor/s3:/conf --name zenko vitalif/vitastor-zenko
```
If you use default settings, Zenko CloudServer starts on port 8000.
The default access key is `accessKey1` with a secret key of `verySecretKey1`.
Now you can access your S3 with, for example, [s3cmd](https://s3tools.org/s3cmd):
```
s3cmd --access_key=accessKey1 --secret_key=verySecretKey1 --host=http://localhost:8000 mb s3://testbucket
```
Or even mount it with [GeeseFS](https://github.com/yandex-cloud/geesefs):
```
AWS_ACCESS_KEY_ID=accessKey1 \
AWS_SECRET_ACCESS_KEY=verySecretKey1 \
geesefs --endpoint http://localhost:8000 testbucket mountdir
```
## Author & License
- [Zenko CloudServer](https://s3-server.readthedocs.io/en/latest/) author is Scality,
licensed under [Apache License, version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
- [Vitastor](https://git.yourcmc.ru/vitalif/vitastor/) and Zenko Vitastor backend author is
Vitaliy Filippov, licensed under [VNPL-1.1](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/VNPL-1.1.txt)
(a "network copyleft" license based on AGPL/SSPL, but worded in a better way)
- Vitastor S3 repository: https://git.yourcmc.ru/vitalif/zenko-cloudserver-vitastor
- Vitastor S3 backend code: https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts

docs/installation/s3.ru.md (new file, 171 lines)
View File

@ -0,0 +1,171 @@
[Documentation](../../README-ru.md#документация) → Installation → S3 for Vitastor
-----
[Read in English](s3.en.md)
# S3 for Vitastor
So, it has finally happened: the Vitastor S3 implementation based on Zenko CloudServer
has reached a state ready for publication and use.
## Highlights
- Zenko CloudServer is implemented in node.js.
- Object metadata is stored in MongoDB.
- A modified version of Zenko CloudServer is shipped: decoupled from unneeded dependencies,
with an optimised build and slightly different from the original.
- Object data is stored in Vitastor block volumes, but the information about the volumes
themselves is stored not in Vitastor etcd but in the same MongoDB-based database.
- Objects are written to volumes sequentially, one after another. Space is allocated with
rounding up to the sector size (4 kilobytes), so each object takes at least 4 KB.
- Thanks to this write scheme, small objects are not chunked into parts and therefore do not
require reads from N data disks in Vitastor EC N+K pools.
- On deletion, objects are marked as deleted, but the space is freed not immediately but by
an asynchronously launched "defragmentation" process. Defragmentation starts automatically
in the background when a volume reaches the configured amount of "garbage" (20% by default),
copies live objects to new volumes and then clears the old volume completely. Defragmentation
can be configured in locationConfig.json.
## Plans for future development
- Storage of user accounts in the DB instead of a static file (the original Zenko uses
a separate closed-source "Scality Vault" service for this).
- More detailed documentation.
- Support for other (and faster) key-value DBMS for metadata storage.
- Other performance optimisations, for example in the area of the hash function used
(the MD5 hash, used for compatibility purposes, is relatively slow).
- Object Lifecycle support. A Lifecycle implementation for Zenko exists and is called
[Backbeat](https://github.com/scality/backbeat), but it is not yet adapted for Vitastor.
- Quotas. The original Zenko uses a separate "SCUBA" service for this, but it is
also closed and not available for public use.
## Installation
In short:
- Install MongoDB and create a user for the S3 metadata DB.
- Create a pool in Vitastor for storing object data.
- Download and configure the Docker container `vitalif/vitastor-zenko`.
### Setup MongoDB
You can install MongoDB yourself, following the [official MongoDB manual](https://www.mongodb.com/docs/manual/installation/).
Or you can follow the instructions below - they describe the simplest example of
a MongoDB setup in Docker (docker-compose) in a 3-replica configuration.
1. On all 3 servers, create a file `docker-compose.yml`, replacing `<YOUR_PASSWORD>`
with your future mongodb administrator password and, if desired, replacing `0.0.0.0`
with `localhost,<server_IP>` - preferably either use a non-public IP,
or [set up TLS](https://www.mongodb.com/docs/manual/tutorial/configure-ssl/) afterwards.
```
version: '3.1'
services:
mongo:
container_name: mongo
image: mongo:7-jammy
restart: always
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: <YOUR_PASSWORD>
network_mode: host
volumes:
- ./keyfile:/opt/keyfile
- ./mongo-data/db:/data/db
- ./mongo-data/configdb:/data/configdb
entrypoint: /bin/bash -c
command: [ "chown mongodb /opt/keyfile && chmod 600 /opt/keyfile && . /usr/local/bin/docker-entrypoint.sh mongod --replSet rs0 --keyFile /opt/keyfile --bind_ip 0.0.0.0" ]
```
2. In the same directory, generate a shared cluster key with `openssl rand -base64 756 > ./keyfile`
and copy this file to all 3 servers.
3. On all 3 servers, start MongoDB with `docker compose up -d mongo`.
4. Enter the Mongo Shell with `docker exec -it mongo mongosh -u root -p <YOUR_PASSWORD> localhost/admin`
and execute the following command there (replacing the IP addresses `10.10.10.{1,2,3}` with those of your servers):
`rs.initiate({ _id: 'rs0', members: [
{ _id: 1, host: '10.10.10.1:27017' },
{ _id: 2, host: '10.10.10.2:27017' },
{ _id: 3, host: '10.10.10.3:27017' }
] })`
5. While still in the Mongo Shell, create a user with access to the future S3 database:
`db.createUser({ user: 's3', pwd: '<YOUR_S3_PASSWORD>', roles: [
{ role: 'readWrite', db: 's3' },
{ role: 'dbAdmin', db: 's3' },
{ role: 'readWrite', db: 'vitastor' },
{ role: 'dbAdmin', db: 'vitastor' }
] })`
### Setup Vitastor
Create a separate pool for S3 object data in Vitastor, for example:
`vitastor-cli create-pool --ec 2+1 -n 512 s3-data --used_for_app s3:standard`
The `--used_for_app` option works as fool-proofing: it keeps you from accidentally creating
a regular block volume in this pool and overwriting S3 data with it, and it also hides
per-S3-volume space usage statistics from etcd.
Retrieve the ID of your pool with `vitastor-cli ls-pools --detail`.
### Setup Vitastor S3
1. Add the following lines to `docker-compose.yml` (alternatively, instead of `network_mode: host`,
you can use `ports: [ "8000:8000", "8002:8002" ]`):
```
zenko:
container_name: zenko
image: vitalif/vitastor-zenko
restart: always
security_opt:
- seccomp:unconfined
ulimits:
memlock: -1
network_mode: host
volumes:
- /etc/vitastor:/etc/vitastor
- /etc/vitastor/s3:/conf
```
2. Extract the example configuration files from the Vitastor Docker image:
`docker run --rm -it -v /etc/vitastor:/etc/vitastor -v /etc/vitastor/s3:/conf vitalif/vitastor-zenko configure.sh`
3. Edit the configuration files in `/etc/vitastor/s3/`:
- `config.json` - common settings.
- `authdata.json` - user accounts and access keys.
- `locationConfig.json` - the list of S3 storage classes with placement settings.
Note: in this version it is really a list of S3 storage classes (STANDARD, COLD etc.),
not of zones (like us-east-1) as in the original Zenko CloudServer.
- Put your MongoDB connection data into `config.json` and `locationConfig.json`.
- Put the ID of the Vitastor pool for data storage into `locationConfig.json`.
- For now, the complete list of Vitastor backend settings can only be viewed [in the code](https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts#L94).
### Start
Start the S3 server: `docker-compose up -d zenko`
Done! You now have an S3 server running on port 8000.
You can try accessing it with, for example, [s3cmd](https://s3tools.org/s3cmd):
`s3cmd --host-bucket= --no-ssl --access_key=accessKey1 --secret_key=verySecretKey1 --host=http://localhost:8000 mb s3://testbucket`
Or mount it with [GeeseFS](https://github.com/yandex-cloud/geesefs):
`AWS_ACCESS_KEY_ID=accessKey1 AWS_SECRET_ACCESS_KEY=verySecretKey1 geesefs --endpoint http://localhost:8000 testbucket /mnt/geesefs`
## License
- The author of [Zenko CloudServer](https://s3-server.readthedocs.io/en/latest/) is Scality; license: [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
- The Vitastor S3 backend, like Vitastor itself, is licensed under [VNPL 1.1](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/VNPL-1.1.txt)
- Build repository: https://git.yourcmc.ru/vitalif/zenko-cloudserver-vitastor
- Data storage backend: https://git.yourcmc.ru/vitalif/zenko-arsenal/src/branch/master/lib/storage/data/vitastor/VitastorBackend.ts

View File

@ -37,6 +37,7 @@
- [Experimental internal etcd replacement - antietcd](../config/monitor.en.md#use_antietcd)
- [Built-in Prometheus metric exporter](../config/monitor.en.md#enable_prometheus)
- [NFS RDMA support](../usage/nfs.en.md#rdma) (probably also usable for GPUDirect)
- [S3](../installation/s3.en.md)
## Plugins and tools
@ -63,7 +64,6 @@ The following features are planned for the future:
- iSCSI and NVMeoF gateways
- Multi-threaded client
- Faster failover
- S3
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)

View File

@ -39,6 +39,7 @@
- [Experimental built-in etcd replacement - antietcd](../config/monitor.ru.md#use_antietcd)
- [Built-in Prometheus metric exporter](../config/monitor.ru.md#enable_prometheus)
- [NFS RDMA support](../usage/nfs.ru.md#rdma) (probably also suitable for GPUDirect)
- [S3](../installation/s3.ru.md)
## Drivers and tools
@ -63,7 +64,6 @@
- iSCSI and NVMeoF proxies
- Multi-threaded client
- Faster failover
- S3
- SSD caching support (tiered storage)
- NVDIMM support
- Possibly, compression

View File

@ -35,10 +35,19 @@ PG state consists of exactly 1 base state and an arbitrary number of additional
PG state always includes exactly 1 of the following base states:
- **active** — PG is active and handles user I/O.
- **incomplete** — Not enough OSDs are available to activate this PG. That is, more disks
are lost than it's allowed by the pool's redundancy scheme. For example, if the pool has
pg_size=3 and pg_minsize=1, part of the data may be written only to 1 OSD. If that exact
OSD is lost, PG will become **incomplete**.
- **incomplete** — Not enough OSDs are available to activate this PG. More exactly, that
means one of the following:
- Less than pg_minsize current target OSDs are available for the PG. I.e. more disks
are lost than allowed by the pool's redundancy scheme.
- All OSDs of some of PG's history records are unavailable, or, for EC pools, less
than (pg_size-parity_chunks) OSDs are available in one of the history records.
In other words it means that some data in this PG was written to an OSD set such that
it's currently impossible to read it back because these OSDs are down. For example,
if the pool has pg_size=3 and pg_minsize=1, part of the data may be written only to
1 OSD. If that exact OSD is lost, PG becomes **incomplete**.
- [allow_net_split](../config/osd.en.md#allow_net_split) is disabled (default) and
primary OSD of the PG can't connect to some secondary OSDs marked as alive in etcd.
I.e. a network partition happened: OSDs can talk to etcd, but not to some other OSDs.
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
or the primary OSD refuses to start this PG (for example, because of wrong block_size),

View File

@ -35,10 +35,20 @@
The PG state always includes exactly 1 of the following flags:
- **active** — The PG is active and handles user I/O requests.
- **incomplete** — There are not enough live OSDs to activate this PG.
That is, more disks are lost than allowed by the pool's redundancy scheme and pg_minsize.
For example, if the pool has pg_size=3 and pg_minsize=1, part of the data may be written to just 1 OSD.
If that exact OSD then fails, the PG becomes **incomplete**.
- **incomplete** — There are not enough live OSDs to activate this PG. More precisely,
it means one of the following:
- Less than pg_minsize of this PG's current target OSDs are available. In other words,
more disks are lost than the pool's redundancy scheme allows.
- All OSDs of one of the PG's history records are unavailable, or, for EC pools, less
than (pg_size-parity_chunks) OSDs are available in one of the PG's history records. In other
words, this means that some data of this PG was written to a set of OSDs from which
it is currently impossible to read it back, because those OSDs are down. For example,
if the pool has pg_size=3 and pg_minsize=1, part of the data may be written to just 1 OSD.
If that exact OSD then fails, the PG becomes **incomplete**.
- [allow_net_split](../config/osd.ru.md#allow_net_split) is disabled (the default) and
the primary OSD of this PG can't connect to some of its secondary OSDs marked
as alive in etcd. This means a network partition happened: the OSDs can talk to etcd,
but not to some other OSDs.
- **offline** — The PG isn't activated by any OSD at all. Either no primary OSD is assigned at all
(if the pool has just been created), or an unavailable OSD is assigned as primary, or
the assigned OSD refuses to start this PG (for example, because of a block_size mismatch),

View File

@ -355,7 +355,7 @@ Set OSD reweight, tags or noout flag. See detail description in [OSD config docu
## pg-list
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs|pgs [OPTIONS] [state1+state2] [^state3] [...]`
List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:
@ -363,6 +363,7 @@ List PGs with any of listed state filters (^ or ! in the beginning is negation).
--pool <pool name or number> Only list PGs of the given pool.
--min <min pg number> Only list PGs with number >= min.
--max <max pg number> Only list PGs with number <= max.
--osd 1,2,... Only list PGs with some data on specified OSD(s).
```
Examples:
@ -398,7 +399,8 @@ Optional parameters:
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
| `--used_for_app fs:<name>` | Mark pool as used for VitastorFS with metadata in image `<name>` |
| `--used_for_app s3:<name>` | Mark pool as used for S3 location with name `<name>` |
| `--pg_stripe_size <number>` | Increase object grouping stripe |
| `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input |
| `--wait` | Wait for the new pool to come online |

View File

@ -22,6 +22,8 @@ vitastor-cli - command line interface for administering Vitastor
- [flatten](#flatten)
- [rm-data](#rm-data)
- [merge-data](#merge-data)
- [describe](#describe)
- [fix](#fix)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
- [osd-tree](#osd-tree)
@ -375,9 +377,10 @@ OSD PARENT UP SIZE USED% TAGS WEIGHT BLOCK BITMAP
at the beginning of a filter means negation). Options:
```
--pool <pool name or number> Only list PGs of the given pool.
--min <min pg number> Only list PGs with number >= min.
--max <max pg number> Only list PGs with number <= max.
--pool <pool name or number> Only list PGs of the given pool.
--min <min pg number> Only list PGs with number >= min.
--max <max pg number> Only list PGs with number <= max.
--osd 1,2,... Only list PGs with data on the specified OSDs.
```
Examples:

View File

@ -14,6 +14,7 @@ It supports the following commands:
- [upgrade-simple](#upgrade-simple)
- [resize](#resize)
- [raw-resize](#raw-resize)
- [trim](#trim)
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
- [purge](#purge)
- [read-sb](#read-sb)
@ -97,6 +98,9 @@ Options (both modes):
--data_device_block 4k Override data device block size
--meta_device_block 4k Override metadata device block size
--journal_device_block 4k Override journal device block size
--discard_on_start 0 TRIM unused data device blocks on every OSD start (default off)
--min_discard_size 1M Minimum TRIM block size
--json Enable JSON output
```
[immediate_commit](../config/layout-cluster.en.md#immediate_commit) setting is
@ -179,6 +183,19 @@ parameters from OSD command line (i.e. from systemd unit or superblock).
SIZE may include k/m/g/t suffixes. If any of the new layout parameter
options are not specified, old values will be used.
## trim
`vitastor-disk trim <osd_num>|<osd_device> [<osd_num>|<osd_device>...]`
Try to discard unused blocks (SSD TRIM) on the data device of each of the OSD(s).
May only be used on stopped OSDs. Options:
```
--min_discard_size 1M Minimum TRIM block size
--discard_granularity 0 Override device's discard granularity
```
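For example, assuming OSD 1 is stopped and its data partition is `/dev/vitastor/osd1-data`
(a placeholder name), either of the following should be equivalent:

```
vitastor-disk trim 1
vitastor-disk trim /dev/vitastor/osd1-data
```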
## start/stop/restart/enable/disable
`vitastor-disk start|stop|restart|enable|disable [--now] <device> [device2 device3 ...]`

View File

@ -99,6 +99,9 @@ vitastor-disk - a command line tool for managing
--data_device_block 4k Set the data device block size
--meta_device_block 4k Set the metadata device block size
--journal_device_block 4k Set the journal device block size
--discard_on_start 0 TRIM unused data blocks on OSD start (off by default)
--min_discard_size 1M Minimum block size for TRIM
--json Enable JSON output
```
The [immediate_commit](../config/layout-cluster.ru.md#immediate_commit) setting
@ -182,6 +185,20 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
`SIZE` may be specified with k/m/g/t suffixes. If any of the new layout
parameters are not specified, they are assumed equal to the old values.
## trim
`vitastor-disk trim <osd_num>|<osd_device> [<osd_num>|<osd_device>...]`
Try to mark unused blocks of the data devices of all the specified OSDs as free
(issue the SSD TRIM command).
May only be used with stopped OSDs. Options:
```
--min_discard_size 1M Minimum block size for TRIM
--discard_granularity 0 TRIM block size granularity
```
## start/stop/restart/enable/disable
`vitastor-disk start|stop|restart|enable|disable [--now] <device> [device2 device3 ...]`

View File

@ -58,7 +58,7 @@ To use VitastorFS:
2. Create an image for FS metadata, preferably in a faster (SSD or replica-HDD) pool,
but you can create it in the data pool too if you want (image size doesn't matter):
`vitastor-cli create -s 10G -p fastpool testfs`
3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-app fs:testfs data-pool`
4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`

View File

@ -60,7 +60,7 @@ JSON-формате :-). Для инспекции содержимого БД
или по крайней мере на HDD, но без EC), но можно и в том же пуле, что данные
(размер образа значения не имеет):
`vitastor-cli create -s 10G -p fastpool testfs`
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-app fs:testfs data-pool`
4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`

View File

@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.11.0",
"version": "2.0.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@ -19,11 +19,16 @@
class NodeVitastorRequest: public Nan::AsyncResource
{
public:
NodeVitastorRequest(NodeVitastor *cli, v8::Local<v8::Function> cb): Nan::AsyncResource("NodeVitastorRequest")
NodeVitastorRequest(NodeVitastor *cli, const v8::Local<v8::Function> & cb): Nan::AsyncResource("NodeVitastorRequest")
{
this->cli = cli;
callback.Reset(cb);
}
~NodeVitastorRequest()
{
callback.Reset();
buffer_ref.Reset();
}
iovec iov;
std::vector<iovec> iov_list;
@ -33,6 +38,7 @@ public:
uint64_t offset = 0, len = 0, version = 0;
bool with_parents = false;
Nan::Persistent<v8::Function> callback;
Nan::Persistent<v8::Value> buffer_ref;
};
static uint64_t get_ui64(const v8::Local<v8::Value> & val)
@ -129,8 +135,8 @@ NodeVitastorRequest* NodeVitastor::get_read_request(const Nan::FunctionCallbackI
Nan::ThrowError("failed to allocate memory");
return NULL;
}
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
auto req = new NodeVitastorRequest(this, callback);
auto req = new NodeVitastorRequest(this, info[argpos+2].As<v8::Function>());
req->offset = offset;
req->len = len;
@ -155,7 +161,9 @@ NAN_METHOD(NodeVitastor::Read)
self->Ref();
vitastor_c_read(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, &req->iov, 1, on_read_finish, req);
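// Client library versions below C API v5 don't drive io_uring themselves,
// so the event loop must be kicked manually after each submission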
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(self->c);
#endif
}
NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
@ -175,11 +183,11 @@ NodeVitastorRequest* NodeVitastor::get_write_request(const Nan::FunctionCallback
argpos++;
}
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
auto req = new NodeVitastorRequest(this, callback);
auto req = new NodeVitastorRequest(this, info[argpos+2].As<v8::Function>());
req->offset = offset;
req->version = version;
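// Hold a persistent reference to the source buffer so V8 GC can't reclaim it
// while the write is in flight (released in ~NodeVitastorRequest)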
req->buffer_ref.Reset(bufarg);
if (bufarg->IsArray())
{
@ -224,7 +232,9 @@ NAN_METHOD(NodeVitastor::Write)
req->iov_list.size() ? req->iov_list.data() : &req->iov,
req->iov_list.size() ? req->iov_list.size() : 1,
on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(self->c);
#endif
}
NodeVitastorRequest* NodeVitastor::get_delete_request(const Nan::FunctionCallbackInfo<v8::Value> & info, int argpos)
@ -244,8 +254,7 @@ NodeVitastorRequest* NodeVitastor::get_delete_request(const Nan::FunctionCallbac
argpos++;
}
v8::Local<v8::Function> callback = info[argpos+2].As<v8::Function>();
auto req = new NodeVitastorRequest(this, callback);
auto req = new NodeVitastorRequest(this, info[argpos+2].As<v8::Function>());
req->offset = offset;
req->len = len;
@ -271,7 +280,9 @@ NAN_METHOD(NodeVitastor::Delete)
self->Ref();
vitastor_c_delete(self->c, ((pool << (64-POOL_ID_BITS)) | inode), req->offset, req->len, req->version,
on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(self->c);
#endif
}
// sync(callback(err))
@ -283,12 +294,13 @@ NAN_METHOD(NodeVitastor::Sync)
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(self, callback);
auto req = new NodeVitastorRequest(self, info[0].As<v8::Function>());
self->Ref();
vitastor_c_sync(self->c, on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(self->c);
#endif
}
// read_bitmap(pool, inode, offset, length, with_parents, callback(err, bitmap_buffer))
@ -305,12 +317,13 @@ NAN_METHOD(NodeVitastor::ReadBitmap)
uint64_t offset = get_ui64(info[2]);
uint64_t len = get_ui64(info[3]);
bool with_parents = Nan::To<bool>(info[4]).FromJust();
v8::Local<v8::Function> callback = info[5].As<v8::Function>();
auto req = new NodeVitastorRequest(self, callback);
auto req = new NodeVitastorRequest(self, info[5].As<v8::Function>());
self->Ref();
vitastor_c_read_bitmap(self->c, ((pool << (64-POOL_ID_BITS)) | inode), offset, len, with_parents, on_read_bitmap_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(self->c);
#endif
}
static void on_error(NodeVitastorRequest *req, Nan::Callback & nanCallback, long retval)
@ -331,11 +344,12 @@ NAN_METHOD(NodeVitastor::OnReady)
if (info.Length() < 1)
Nan::ThrowError("Not enough arguments to on_ready(callback(err))");
NodeVitastor* self = Nan::ObjectWrap::Unwrap<NodeVitastor>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(self, callback);
auto req = new NodeVitastorRequest(self, info[0].As<v8::Function>());
self->Ref();
vitastor_c_on_ready(self->c, on_ready_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(self->c);
#endif
}
void NodeVitastor::on_ready_finish(void *opaque, long retval)
@ -480,7 +494,9 @@ NAN_METHOD(NodeVitastorImage::Create)
img->Ref();
cli->Ref();
vitastor_c_watch_inode(cli->c, (char*)img->name.c_str(), on_watch_start, img);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(cli->c);
#endif
info.GetReturnValue().Set(info.This());
}
@ -552,8 +568,7 @@ NAN_METHOD(NodeVitastorImage::Sync)
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(img->cli, callback);
auto req = new NodeVitastorRequest(img->cli, info[0].As<v8::Function>());
req->img = img;
req->op = NODE_VITASTOR_SYNC;
@ -572,9 +587,8 @@ NAN_METHOD(NodeVitastorImage::ReadBitmap)
uint64_t offset = get_ui64(info[0]);
uint64_t len = get_ui64(info[1]);
bool with_parents = Nan::To<bool>(info[2]).FromJust();
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
auto req = new NodeVitastorRequest(img->cli, callback);
auto req = new NodeVitastorRequest(img->cli, info[3].As<v8::Function>());
req->img = img;
req->op = NODE_VITASTOR_READ_BITMAP;
req->offset = offset;
@ -593,8 +607,7 @@ NAN_METHOD(NodeVitastorImage::GetInfo)
NodeVitastorImage* img = Nan::ObjectWrap::Unwrap<NodeVitastorImage>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(img->cli, callback);
auto req = new NodeVitastorRequest(img->cli, info[0].As<v8::Function>());
req->img = img;
req->op = NODE_VITASTOR_GET_INFO;
@ -621,7 +634,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
uint64_t ino = vitastor_c_inode_get_num(watch);
cli->Ref();
vitastor_c_read(cli->c, ino, req->offset, req->len, &req->iov, 1, NodeVitastor::on_read_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(cli->c);
#endif
}
else if (req->op == NODE_VITASTOR_WRITE)
{
@ -631,7 +646,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
req->iov_list.size() ? req->iov_list.data() : &req->iov,
req->iov_list.size() ? req->iov_list.size() : 1,
NodeVitastor::on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(cli->c);
#endif
}
else if (req->op == NODE_VITASTOR_DELETE)
{
@ -639,7 +656,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
cli->Ref();
vitastor_c_delete(cli->c, ino, req->offset, req->len, req->version,
NodeVitastor::on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(cli->c);
#endif
}
else if (req->op == NODE_VITASTOR_SYNC)
{
@ -649,7 +668,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
if (imm != IMMEDIATE_ALL)
{
vitastor_c_sync(cli->c, NodeVitastor::on_write_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(cli->c);
#endif
}
else
{
@ -661,7 +682,9 @@ void NodeVitastorImage::exec_request(NodeVitastorRequest *req)
uint64_t ino = vitastor_c_inode_get_num(watch);
cli->Ref();
vitastor_c_read_bitmap(cli->c, ino, req->offset, req->len, req->with_parents, NodeVitastor::on_read_bitmap_finish, req);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(cli->c);
#endif
}
else if (req->op == NODE_VITASTOR_GET_INFO)
{
@ -779,8 +802,7 @@ NAN_METHOD(NodeVitastorKV::Open)
cfg[std::string(*Nan::Utf8String(key))] = std::string(*Nan::Utf8String(Nan::Get(jsParams, key).ToLocalChecked()));
}
v8::Local<v8::Function> callback = info[3].As<v8::Function>();
auto req = new NodeVitastorRequest(kv->cli, callback);
auto req = new NodeVitastorRequest(kv->cli, info[3].As<v8::Function>());
kv->Ref();
kv->dbw->open(inode_id, cfg, [kv, req](int res)
@ -793,7 +815,9 @@ NAN_METHOD(NodeVitastorKV::Open)
delete req;
kv->Unref();
});
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(kv->cli->c);
#endif
}
// close(callback(err))
@ -805,8 +829,7 @@ NAN_METHOD(NodeVitastorKV::Close)
NodeVitastorKV* kv = Nan::ObjectWrap::Unwrap<NodeVitastorKV>(info.This());
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
auto req = new NodeVitastorRequest(kv->cli, callback);
auto req = new NodeVitastorRequest(kv->cli, info[0].As<v8::Function>());
kv->Ref();
kv->dbw->close([kv, req]()
@ -817,7 +840,9 @@ NAN_METHOD(NodeVitastorKV::Close)
delete req;
kv->Unref();
});
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(kv->cli->c);
#endif
}
// set_config({ ...config })
@ -861,8 +886,7 @@ void NodeVitastorKV::get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info,
// FIXME: Handle Buffer too
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
v8::Local<v8::Function> callback = info[1].As<v8::Function>();
auto req = new NodeVitastorRequest(kv->cli, callback);
auto req = new NodeVitastorRequest(kv->cli, info[1].As<v8::Function>());
kv->Ref();
kv->dbw->get(key, [kv, req](int res, const std::string & value)
@ -876,7 +900,9 @@ void NodeVitastorKV::get_impl(const Nan::FunctionCallbackInfo<v8::Value> & info,
delete req;
kv->Unref();
}, allow_cache);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(kv->cli->c);
#endif
}
// get(key, callback(err, value))
@ -925,14 +951,12 @@ NAN_METHOD(NodeVitastorKV::Set)
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
std::string value(*Nan::Utf8String(info[1].As<v8::String>()));
v8::Local<v8::Function> callback = info[2].As<v8::Function>();
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, info[2].As<v8::Function>()), *cas_req = NULL;
std::function<bool(int, const std::string &)> cas_cb;
if (info.Length() > 3 && info[3]->IsObject())
{
v8::Local<v8::Function> cas_callback = info[3].As<v8::Function>();
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
cas_req = new NodeVitastorRequest(kv->cli, info[3].As<v8::Function>());
cas_cb = make_cas_callback(cas_req);
}
@ -949,7 +973,9 @@ NAN_METHOD(NodeVitastorKV::Set)
delete cas_req;
kv->Unref();
}, cas_cb);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(kv->cli->c);
#endif
}
// del(key, callback(err), cas_compare(old_value)?)
@ -964,14 +990,12 @@ NAN_METHOD(NodeVitastorKV::Del)
// FIXME: Handle Buffer too
std::string key(*Nan::Utf8String(info[0].As<v8::String>()));
v8::Local<v8::Function> callback = info[1].As<v8::Function>();
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, callback), *cas_req = NULL;
NodeVitastorRequest *req = new NodeVitastorRequest(kv->cli, info[1].As<v8::Function>()), *cas_req = NULL;
std::function<bool(int, const std::string &)> cas_cb;
if (info.Length() > 2 && info[2]->IsObject())
{
v8::Local<v8::Function> cas_callback = info[2].As<v8::Function>();
cas_req = new NodeVitastorRequest(kv->cli, cas_callback);
cas_req = new NodeVitastorRequest(kv->cli, info[2].As<v8::Function>());
cas_cb = make_cas_callback(cas_req);
}
@ -988,7 +1012,9 @@ NAN_METHOD(NodeVitastorKV::Del)
delete cas_req;
kv->Unref();
}, cas_cb);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(kv->cli->c);
#endif
}
// list(start_key?)
@ -1068,12 +1094,11 @@ NAN_METHOD(NodeVitastorKVListing::Next)
if (info.Length() > 0)
{
v8::Local<v8::Function> callback = info[0].As<v8::Function>();
if (list->iter)
{
delete list->iter;
}
list->iter = new NodeVitastorRequest(list->kv->cli, callback);
list->iter = new NodeVitastorRequest(list->kv->cli, info[0].As<v8::Function>());
}
if (!list->handle)
{
@ -1109,7 +1134,9 @@ NAN_METHOD(NodeVitastorKVListing::Next)
list->iter = req;
list->kv->Unref();
});
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_c_uring_handle_events(list->kv->cli->c);
#endif
}
// close()

View File

@ -1,6 +1,6 @@
{
"name": "vitastor",
"version": "1.11.0",
"version": "2.0.0",
"description": "Low-level native bindings to Vitastor client library",
"main": "index.js",
"keywords": [
@ -16,7 +16,7 @@
"build": "node-gyp rebuild"
},
"author": "Vitaliy Filippov",
"license": "VNPL-2.0",
"license": "VNPL-1.1",
"dependencies": {
"bindings": "1.5.0",
"nan": "^2.19.0"

View File

@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VITASTOR_VERSION = '1.11.0'
VITASTOR_VERSION = '2.0.0'
LOG = logging.getLogger(__name__)

View File

@ -0,0 +1,172 @@
Index: pve-qemu-kvm-9.2.0/block/meson.build
===================================================================
--- pve-qemu-kvm-9.2.0.orig/block/meson.build
+++ pve-qemu-kvm-9.2.0/block/meson.build
@@ -126,6 +126,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-9.2.0/meson.build
===================================================================
--- pve-qemu-kvm-9.2.0.orig/meson.build
+++ pve-qemu-kvm-9.2.0/meson.build
@@ -1590,6 +1590,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2478,6 +2498,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4789,6 +4810,7 @@ summary_info += {'fdt support': fd
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-9.2.0/meson_options.txt
===================================================================
--- pve-qemu-kvm-9.2.0.orig/meson_options.txt
+++ pve-qemu-kvm-9.2.0/meson_options.txt
@@ -200,6 +200,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-9.2.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-9.2.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-9.2.0/qapi/block-core.json
@@ -3481,7 +3481,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4592,6 +4592,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -5054,6 +5076,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5501,6 +5524,20 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5722,6 +5759,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-9.2.0/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-9.2.0.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-9.2.0/scripts/meson-buildoptions.sh
@@ -174,6 +174,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rust Rust support'
@@ -455,6 +456,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -0,0 +1,172 @@
diff --git a/block/meson.build b/block/meson.build
index f1262ec2ba..3cf3e23f16 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 147097c652..2486b3aeb5 100644
--- a/meson.build
+++ b/meson.build
@@ -1590,6 +1590,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2474,6 +2494,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4778,6 +4799,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 5eeaf3eee5..b04eda29f9 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -200,6 +200,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index fd3bcc1c17..41571ac3f9 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3212,7 +3212,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4295,6 +4295,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4757,6 +4779,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5198,6 +5221,20 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5420,6 +5457,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index a8066aab03..12e650e7d4 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -174,6 +174,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rust Rust support'
@@ -455,6 +456,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;
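With this patch applied and QEMU rebuilt, the `vitastor` driver becomes selectable via `-blockdev`. A minimal invocation sketch (the node name, image name and etcd address are hypothetical; option names follow the `BlockdevOptionsVitastor` struct above):
```
qemu-system-x86_64 \
  -blockdev '{"node-name":"vd0","driver":"vitastor","image":"testimg","etcd-host":"127.0.0.1:2379"}' \
  -device virtio-blk-pci,drive=vd0
```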

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.11.0
Version: 2.0.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.11.0.el7.tar.gz
Source0: vitastor-2.0.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.11.0
Version: 2.0.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.11.0.el8.tar.gz
Source0: vitastor-2.0.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.11.0
Version: 2.0.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.11.0.el9.tar.gz
Source0: vitastor-2.0.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVITASTOR_VERSION="1.11.0")
add_definitions(-DVITASTOR_VERSION="2.0.0")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

View File

@ -8,6 +8,7 @@
#include "blockstore_impl.h"
#include "blockstore_disk.h"
#include "str_util.h"
#include "allocator.h"
static uint32_t is_power_of_two(uint64_t value)
{
@ -83,6 +84,12 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
throw std::runtime_error("data_csum_type="+config["data_csum_type"]+" is unsupported, only \"crc32c\" and \"none\" are supported");
}
csum_block_size = parse_size(config["csum_block_size"]);
discard_on_start = config.find("discard_on_start") != config.end() &&
(config["discard_on_start"] == "true" || config["discard_on_start"] == "1" || config["discard_on_start"] == "yes");
min_discard_size = parse_size(config["min_discard_size"]);
if (!min_discard_size)
min_discard_size = 1024*1024;
discard_granularity = parse_size(config["discard_granularity"]);
// Validate
if (!data_block_size)
{
@ -172,10 +179,6 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
}
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
@ -224,9 +227,13 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
}
// required metadata size
block_count = data_len / data_block_size;
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
if (meta_format == BLOCKSTORE_META_FORMAT_V1 ||
!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type)
bool new_doesnt_fit = (!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type);
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || new_doesnt_fit)
{
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
@ -234,7 +241,11 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
{
// Old metadata fits.
printf("Warning: Using old metadata format without checksums because the new format doesn't fit into provided area\n");
if (new_doesnt_fit)
{
printf("Warning: Using old metadata format without checksums because the new format"
" doesn't fit into provided area (%ju bytes required, %ju bytes available)\n", meta_len, meta_area_size);
}
clean_entry_size = clean_entry_v0_size;
meta_len = meta_v0_len;
meta_format = BLOCKSTORE_META_FORMAT_V1;
@ -246,7 +257,7 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
meta_format = BLOCKSTORE_META_FORMAT_V2;
if (!skip_meta_check && meta_area_size < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
}
// requested journal size
if (!skip_meta_check && cfg_journal_size > journal_len)
@ -415,3 +426,44 @@ void blockstore_disk_t::close_all()
close(journal_fd);
data_fd = meta_fd = journal_fd = -1;
}
// Sadly DISCARD only works through ioctl(), but it seems to always block the device queue,
// so it's not a big deal that we can only run it synchronously.
int blockstore_disk_t::trim_data(allocator_t *alloc)
{
int r = 0;
uint64_t j = 0, i = 0;
uint64_t discarded = 0;
for (; i <= block_count; i++)
{
if (i >= block_count || alloc->get(i))
{
if (i > j && (i-j)*data_block_size >= min_discard_size)
{
uint64_t range[2] = { data_offset + j*data_block_size, (i-j)*data_block_size };
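// Shrink the range inward to discard_granularity boundaries: e.g. with granularity 4096,
// a range at offset 6144 of length 10240 becomes offset 8192, length 8192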
if (discard_granularity)
{
range[1] += range[0];
if (range[1] % discard_granularity)
range[1] = range[1] - (range[1] % discard_granularity);
if (range[0] % discard_granularity)
range[0] = range[0] + discard_granularity - (range[0] % discard_granularity);
if (range[0] >= range[1])
{
// nothing remains after alignment; still advance j past this allocated block
j = i+1;
continue;
}
range[1] -= range[0];
}
r = ioctl(data_fd, BLKDISCARD, &range);
if (r != 0)
{
fprintf(stderr, "Failed to execute BLKDISCARD %ju+%ju on %s: %s (code %d)\n",
range[0], range[1], data_device.c_str(), strerror(errno), errno);
return -errno;
}
discarded += range[1];
}
j = i+1;
}
}
fprintf(stderr, "%s (%ju bytes) of unused data discarded on %s\n", format_size(discarded).c_str(), discarded, data_device.c_str());
return 0;
}

View File

@ -12,6 +12,8 @@
// Lower byte of checksum type is its length
#define BLOCKSTORE_CSUM_CRC32C 0x104
class allocator_t;
struct blockstore_disk_t
{
std::string data_device, meta_device, journal_device;
@ -34,14 +36,18 @@ struct blockstore_disk_t
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
std::string data_io, meta_io, journal_io;
// Data discard granularity and minimum size (for the sake of performance)
bool discard_on_start = false;
uint64_t min_discard_size = 1024*1024;
uint64_t discard_granularity = 0;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
uint32_t block_order;
uint64_t block_count;
uint32_t block_order = 0;
uint64_t block_count = 0;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
void parse_config(std::map<std::string, std::string> & config);
@ -50,6 +56,7 @@ struct blockstore_disk_t
void open_journal();
void calc_lengths(bool skip_meta_check = false);
void close_all();
int trim_data(allocator_t *alloc);
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
{

View File

@ -12,15 +12,15 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ringloop->register_consumer(&ring_consumer);
initialized = 0;
parse_config(config, true);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
try
{
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
calc_lengths();
data_alloc = new allocator(dsk.block_count);
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
data_alloc = new allocator_t(dsk.block_count);
}
catch (std::exception & e)
{
@ -34,7 +34,8 @@ blockstore_impl_t::~blockstore_impl_t()
{
delete data_alloc;
delete flusher;
free(zero_object);
if (zero_object)
free(zero_object);
ringloop->unregister_consumer(&ring_consumer);
dsk.close_all();
if (metadata_buffer)
@ -83,14 +84,20 @@ void blockstore_impl_t::loop()
{
delete journal_init_reader;
journal_init_reader = NULL;
if (journal.flush_journal)
initialized = 3;
else
initialized = 10;
initialized = 3;
ringloop->wakeup();
}
}
if (initialized == 3)
{
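// Init stage 3: optionally discard unallocated data blocks, then proceed to
// journal flushing (stage 4) or normal operation (stage 10)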
if (!readonly && dsk.discard_on_start)
dsk.trim_data(data_alloc);
if (journal.flush_journal)
initialized = 4;
else
initialized = 10;
}
if (initialized == 4)
{
if (readonly)
{

View File

@ -279,9 +279,9 @@ class blockstore_impl_t
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_queued_ops = 0;
allocator *data_alloc = NULL;
allocator_t *data_alloc = NULL;
uint64_t used_blocks = 0;
uint8_t *zero_object;
uint8_t *zero_object = NULL;
void *metadata_buffer = NULL;

View File

@ -138,7 +138,11 @@ resume_1:
exit(1);
}
hdr->header_csum = csum;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;
if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V2)
{
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;
bs->dsk.calc_lengths();
}
}
else if (hdr->version == BLOCKSTORE_META_FORMAT_V1)
{
@ -146,11 +150,15 @@ resume_1:
hdr->csum_block_size = 0;
hdr->header_csum = 0;
// Enable compatibility mode - entries without checksums
bs->dsk.clean_entry_size = sizeof(clean_disk_entry) + bs->dsk.clean_entry_bitmap_size*2;
bs->dsk.meta_len = (1 + (bs->dsk.block_count - 1 + bs->dsk.meta_block_size / bs->dsk.clean_entry_size)
/ (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V1;
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V1 ||
bs->dsk.data_csum_type != 0 || bs->dsk.csum_block_size != 0)
{
bs->dsk.data_csum_type = 0;
bs->dsk.csum_block_size = 0;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V1;
bs->dsk.calc_lengths();
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
}
}
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
{
@ -338,7 +346,7 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
{
printf("Metadata entry %ju is corrupt (checksum mismatch), skipping\n", done_cnt+i);
printf("Metadata entry %ju is corrupt (checksum mismatch: %08x vs %08x), skipping\n", done_cnt+i, *entry_csum, crc32c(0, entry, bs->dsk.clean_entry_size - 4));
// zero out the invalid entry, otherwise we'll hit "tried to overwrite non-zero metadata entry" later
if (bs->inmemory_meta)
{

View File

@ -29,8 +29,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
{
// peer_osd just connected
// retry operations waiting for connection immediately
continue_ops(client_retry_interval);
continue_ops();
continue_lists();
continue_raw_ops(peer_osd);
}

View File

@ -83,6 +83,9 @@ class writeback_cache_t;
// FIXME: Split into public and private interfaces
class cluster_client_t
{
#ifdef __MOCK__
public:
#endif
timerfd_manager_t *tfd;
ring_loop_t *ringloop;
@ -144,8 +147,6 @@ public:
bool get_immediate_commit(uint64_t inode);
void continue_ops(int time_passed = 0);
void list_inode(inode_t inode, uint64_t min_offset, uint64_t max_offset, int max_parallel_pgs, std::function<void(
int status, int pgs_left, pg_num_t pg_num, std::set<object_id>&& objects)> pg_callback);
@ -153,6 +154,11 @@ public:
//inline uint64_t get_bs_block_size() { return st_cli.global_block_size; }
uint64_t next_op_id();
#ifndef __MOCK__
protected:
#endif
void continue_ops(int time_passed = 0);
protected:
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
bool affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num);

View File

@ -52,6 +52,7 @@ public:
bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
void mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
void delete_flush(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
void fsync_start();
void fsync_error();
void fsync_ok();

View File

@ -9,7 +9,7 @@ writeback_cache_t::~writeback_cache_t()
{
for (auto & bp: dirty_buffers)
{
if (!--(*bp.second.refcnt))
if (bp.second.buf && !--(*bp.second.refcnt))
{
free(bp.second.refcnt); // refcnt is allocated with the buffer
}
@ -115,7 +115,10 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
.flush_id = dirty_it->second.flush_id,
.refcnt = dirty_it->second.refcnt,
});
(*dirty_it->second.refcnt)++;
if (dirty_it->second.buf)
{
(*dirty_it->second.refcnt)++;
}
if (dirty_it->second.state == CACHE_DIRTY)
{
if (dirty_it->second.buf)
@ -193,7 +196,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
writeback_queue_size++;
}
}
if (!--(*dirty_it->second.refcnt))
if (dirty_it->second.buf && !--(*dirty_it->second.refcnt))
{
free(dirty_it->second.refcnt);
}
@ -204,7 +207,10 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flu
bool is_del = op->opcode == OSD_OP_DELETE;
uint64_t *refcnt = is_del ? NULL : (uint64_t*)malloc_or_die(sizeof(uint64_t) + op->len);
uint8_t *buf = is_del ? NULL : ((uint8_t*)refcnt + sizeof(uint64_t));
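// Deletions carry no payload: buf and refcnt stay NULL, hence the buf != NULL guards around all refcnt manipulation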
*refcnt = 1;
if (!is_del)
{
*refcnt = 1;
}
dirty_it = dirty_buffers.emplace_hint(dirty_it, (object_id){
.inode = op->inode,
.stripe = op->offset,
@ -326,7 +332,14 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
flushed_buffers.erase(fl_it++);
}
mark_flush_written(op->inode, op->offset, op->len, flush_id);
if (op->flags & OP_IMMEDIATE_COMMIT)
{
delete_flush(op->inode, op->offset, op->len, flush_id);
}
else
{
mark_flush_written(op->inode, op->offset, op->len, flush_id);
}
delete op;
writebacks_active--;
// We can't call execute_internal because it affects an invalid copy of the list here
@ -344,6 +357,25 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
}
void writeback_cache_t::delete_flush(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
{
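// Buffers written with immediate commit are already durable: drop them right away
// instead of keeping them until an fsync marks them flushed (cf. mark_flush_written)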
for (auto dirty_it = find_dirty(inode, offset);
dirty_it != dirty_buffers.end() && dirty_it->first.inode == inode &&
dirty_it->first.stripe < offset+len; )
{
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
{
if (dirty_it->second.buf && !--(*dirty_it->second.refcnt))
{
free(dirty_it->second.refcnt);
}
dirty_buffers.erase(dirty_it++);
}
else
dirty_it++;
}
}
void writeback_cache_t::mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
{
for (auto dirty_it = find_dirty(inode, offset);
@ -560,8 +592,10 @@ void writeback_cache_t::fsync_ok()
{
if (uw_it->second.state == CACHE_FLUSHING)
{
if (!--(*uw_it->second.refcnt))
if (uw_it->second.buf && !--(*uw_it->second.refcnt))
{
free(uw_it->second.refcnt);
}
dirty_buffers.erase(uw_it++);
}
else

View File

@ -31,6 +31,11 @@ etcd_state_client_t::~etcd_state_client_t()
keepalive_client = NULL;
}
#endif
if (load_pgs_timer_id >= 0)
{
tfd->clear_timer(load_pgs_timer_id);
load_pgs_timer_id = -1;
}
}
#ifndef __MOCK__
@ -143,6 +148,7 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
}
if (interval > 0)
{
// FIXME: Prevent destruction of etcd_state_client if timers or requests are active
tfd->set_timer(interval, false, [this, api, payload, timeout, retries, interval, callback](int)
{
etcd_call(api, payload, timeout, retries, interval, callback);
@ -176,7 +182,7 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
exit(1);
}
if (!local_ips.size())
local_ips = getifaddr_list(std::vector<std::string>(), true);
local_ips = getifaddr_list(std::vector<addr_mask_t>(), true);
std::string check_addr;
int pos = addr.find('/');
int pos2 = addr.find(':');
@ -271,6 +277,11 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
{
this->etcd_quick_timeout = 1000;
}
this->etcd_min_reload_interval = config["etcd_min_reload_interval"].uint64_value();
if (this->etcd_min_reload_interval <= 0)
{
this->etcd_min_reload_interval = 50;
}
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
{
#ifndef __MOCK__
@ -603,6 +614,23 @@ void etcd_state_client_t::load_global_config()
void etcd_state_client_t::load_pgs()
{
timespec tv;
clock_gettime(CLOCK_REALTIME, &tv);
uint64_t ms_passed = (tv.tv_sec-etcd_last_reload.tv_sec)*1000 + (tv.tv_nsec-etcd_last_reload.tv_nsec)/1000000;
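// Throttle reloads: if load_pgs() is called again within etcd_min_reload_interval ms,
// coalesce the calls into a single deferred reload scheduled via a timer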
if (ms_passed < etcd_min_reload_interval)
{
if (load_pgs_timer_id < 0)
{
load_pgs_timer_id = tfd->set_timer(etcd_min_reload_interval+50-ms_passed, false, [this](int) { load_pgs(); });
}
return;
}
etcd_last_reload = tv;
if (load_pgs_timer_id >= 0)
{
tfd->clear_timer(load_pgs_timer_id);
load_pgs_timer_id = -1;
}
json11::Json::array txn = {
json11::Json::object {
{ "request_range", json11::Json::object {
@ -889,7 +917,11 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
if (!pc.scrub_interval)
pc.scrub_interval = 0;
// Mark pool as VitastorFS pool (disable per-inode stats and block volume creation)
pc.used_for_fs = pool_item.second["used_for_fs"].as_string();
pc.used_for_app = pool_item.second["used_for_fs"].as_string();
if (pc.used_for_app != "")
pc.used_for_app = "fs:"+pc.used_for_app;
else
pc.used_for_app = pool_item.second["used_for_app"].as_string();
// Immediate Commit Mode
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
? parse_immediate_commit(pool_item.second["immediate_commit"].string_value(), IMMEDIATE_ALL)

View File

@ -61,7 +61,7 @@ struct pool_config_t
uint64_t pg_stripe_size;
std::map<pg_num_t, pg_config_t> pg_config;
uint64_t scrub_interval;
std::string used_for_fs;
std::string used_for_app;
int backfillfull;
};
@ -108,6 +108,7 @@ public:
int max_etcd_attempts = 5;
int etcd_quick_timeout = 1000;
int etcd_slow_timeout = 5000;
int etcd_min_reload_interval = 1000;
bool infinite_start = true;
uint64_t global_block_size = DEFAULT_BLOCK_SIZE;
uint32_t global_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
@ -122,6 +123,8 @@ public:
uint64_t etcd_watch_revision_config = 0;
uint64_t etcd_watch_revision_osd = 0;
uint64_t etcd_watch_revision_pg = 0;
timespec etcd_last_reload = {};
int load_pgs_timer_id = -1;
std::map<pool_id_t, pool_config_t> pool_config;
std::map<osd_num_t, json11::Json> peer_states;
std::set<osd_num_t> seen_peers;

View File

@ -377,7 +377,7 @@ static void io_callback(void *opaque, long retval)
bsd->completed.push_back(io);
if (bsd->trace)
{
printf("--- %s 0x%jx retval=%ld\n", io->ddir == DDIR_READ ? "READ" :
printf("--- %s 0x%jx retval=%jd\n", io->ddir == DDIR_READ ? "READ" :
(io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), (uint64_t)io, retval);
}
}

View File

@ -117,15 +117,22 @@ void msgr_iothread_t::run()
void osd_messenger_t::init()
{
// FIXME: Support multiple RDMA networks?!
#ifdef WITH_RDMA
if (use_rdma)
{
rdma_context = msgr_rdma_context_t::create(
osd_networks, rdma_device != "" ? rdma_device.c_str() : NULL,
osd_num && osd_cluster_network_masks.size() ? osd_cluster_network_masks : osd_network_masks,
rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
);
if (!rdma_context)
{
if (force_rdma)
{
fprintf(stderr, "[OSD %ju] Couldn't initialize RDMA, force_rdma is enabled, exiting\n", osd_num);
exit(1);
}
if (log_level > 0)
fprintf(stderr, "[OSD %ju] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
}
@ -262,6 +269,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
// RDMA is on by default in RDMA-enabled builds
this->use_rdma = config["use_rdma"].bool_value() || config["use_rdma"].uint64_value() != 0;
}
if (!config["force_rdma"].is_null())
{
this->force_rdma = config["force_rdma"].bool_value() || config["force_rdma"].uint64_value() != 0;
}
this->rdma_device = config["rdma_device"].string_value();
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
if (!this->rdma_port_num)
@ -282,15 +293,6 @@ void osd_messenger_t::parse_config(const json11::Json & config)
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value();
std::vector<std::string> mask;
if (config["bind_address"].is_string())
mask.push_back(config["bind_address"].string_value());
else if (config["osd_network"].is_string())
mask.push_back(config["osd_network"].string_value());
else
for (auto v: config["osd_network"].array_items())
mask.push_back(v.string_value());
this->osd_networks = mask;
#endif
if (!osd_num)
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();
@ -314,23 +316,83 @@ void osd_messenger_t::parse_config(const json11::Json & config)
if (!this->osd_ping_timeout)
this->osd_ping_timeout = 5;
this->log_level = config["log_level"].uint64_value();
// OSD public & cluster networks
this->osd_networks.clear();
if (config["bind_address"].is_string())
this->osd_networks.push_back(config["bind_address"].string_value());
else if (config["osd_network"].is_string())
this->osd_networks.push_back(config["osd_network"].string_value());
else
for (auto v: config["osd_network"].array_items())
this->osd_networks.push_back(v.string_value());
this->osd_cluster_networks.clear();
if (config["osd_cluster_network"].is_string())
this->osd_cluster_networks.push_back(config["osd_cluster_network"].string_value());
else
for (auto v: config["osd_cluster_network"].array_items())
this->osd_cluster_networks.push_back(v.string_value());
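// A subnet listed as a cluster network must not also remain in the public list, so deduplicate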
if (this->osd_cluster_networks.size())
for (auto & net: this->osd_cluster_networks)
for (int i = this->osd_networks.size()-1; i >= 0; i--)
if (this->osd_networks[i] == net)
this->osd_networks.erase(this->osd_networks.begin()+i, this->osd_networks.begin()+i+1);
this->osd_network_masks.clear();
for (auto & netstr: this->osd_networks)
this->osd_network_masks.push_back(cidr_parse(netstr));
this->osd_cluster_network_masks.clear();
for (auto & netstr: this->osd_cluster_networks)
this->osd_cluster_network_masks.push_back(cidr_parse(netstr));
this->all_osd_networks.insert(this->all_osd_networks.end(), this->osd_networks.begin(), this->osd_networks.end());
this->all_osd_networks.insert(this->all_osd_networks.end(), this->osd_cluster_networks.begin(), this->osd_cluster_networks.end());
if (!this->osd_networks.size())
{
this->osd_networks = this->osd_cluster_networks;
this->osd_network_masks = this->osd_cluster_network_masks;
}
}
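// For illustration, a /etc/vitastor/vitastor.conf sketch that the parsing above accepts
// (the subnets are hypothetical): { "osd_network": "10.0.0.0/24", "osd_cluster_network": "10.1.0.0/24" }
// Addresses matching osd_cluster_network are then preferred for OSD-to-OSD connections.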
void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
{
if (wanted_peers.find(peer_osd) == wanted_peers.end())
if (wanted_peers[peer_osd].raw_address_list != peer_state["addresses"])
{
wanted_peers[peer_osd] = (osd_wanted_peer_t){
.address_list = peer_state["addresses"],
.port = (int)peer_state["port"].int64_value(),
};
wanted_peers[peer_osd].raw_address_list = peer_state["addresses"];
if (osd_cluster_networks.size())
{
json11::Json::array address_list, cluster_address_list;
for (auto json_addr: peer_state["addresses"].array_items())
{
struct sockaddr_storage addr;
auto ok = string_to_addr(json_addr.string_value(), false, 0, &addr);
if (ok)
{
bool is_cluster = false;
for (auto & mask: osd_cluster_network_masks)
{
if (mask.family == addr.ss_family && (mask.family == AF_INET
? cidr_match(*(in_addr*)&addr, mask.ipv4, mask.bits)
: cidr6_match(*(in6_addr*)&addr, mask.ipv6, mask.bits)))
{
is_cluster = true;
break;
}
}
if (is_cluster)
cluster_address_list.push_back(json_addr);
else
address_list.push_back(json_addr);
}
}
auto n_cluster = this->osd_num ? cluster_address_list.size() : 0;
if (this->osd_num)
address_list.insert(address_list.begin(), cluster_address_list.begin(), cluster_address_list.end());
wanted_peers[peer_osd].address_list = address_list;
wanted_peers[peer_osd].n_cluster_addr = n_cluster;
}
else
wanted_peers[peer_osd].address_list = peer_state["addresses"];
wanted_peers[peer_osd].address_changed = true;
}
else
{
wanted_peers[peer_osd].address_list = peer_state["addresses"];
wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
}
wanted_peers[peer_osd].address_changed = true;
wanted_peers[peer_osd].port = (int)peer_state["port"].int64_value();
try_connect_peer(peer_osd);
}
@ -582,29 +644,23 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
{
osd_num_t peer_osd = cl->osd_num;
stop_client(op->peer_fd);
on_connect_peer(peer_osd, -1);
on_connect_peer(peer_osd, -EINVAL);
delete op;
return;
}
#ifdef WITH_RDMA
if (config["rdma_address"].is_string())
if (cl->rdma_conn && config["rdma_address"].is_string())
{
msgr_rdma_address_t addr;
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
cl->rdma_conn->connect(&addr) != 0)
{
fprintf(
stderr, "Failed to connect to OSD %ju (address %s) using RDMA\n",
stderr, "Failed to connect to OSD %ju (address %s) using RDMA, switching back to TCP\n",
cl->osd_num, config["rdma_address"].string_value().c_str()
);
delete cl->rdma_conn;
cl->rdma_conn = NULL;
// FIXME: Keep TCP connection in this case
osd_num_t peer_osd = cl->osd_num;
stop_client(cl->peer_fd);
on_connect_peer(peer_osd, -1);
delete op;
return;
}
else
{

View File

@ -16,6 +16,7 @@
#include "json11/json11.hpp"
#include "msgr_op.h"
#include "timerfd_manager.h"
#include "addr_util.h"
#include <ringloop.h>
#define CL_READ_HDR 1
@ -93,13 +94,15 @@ struct osd_client_t
struct osd_wanted_peer_t
{
json11::Json raw_address_list;
json11::Json address_list;
int port;
time_t last_connect_attempt;
bool connecting, address_changed;
int address_index;
int n_cluster_addr = 0;
int port = 0;
time_t last_connect_attempt = 0;
bool connecting = false, address_changed = false;
int address_index = 0;
std::string cur_addr;
int cur_port;
int cur_port = 0;
};
struct osd_op_stats_t
@ -165,7 +168,7 @@ protected:
#ifdef WITH_RDMA
bool use_rdma = true;
std::vector<std::string> osd_networks;
bool force_rdma = false;
std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_mtu = 0;
int rdma_gid_index = -1;
@ -190,6 +193,12 @@ public:
std::map<int, osd_client_t*> clients;
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
std::map<uint64_t, int> osd_peer_fds;
std::vector<std::string> osd_networks;
std::vector<addr_mask_t> osd_network_masks;
std::vector<std::string> osd_cluster_networks;
std::vector<addr_mask_t> osd_cluster_network_masks;
std::vector<std::string> all_osd_networks;
std::vector<addr_mask_t> all_osd_network_masks;
// op statistics
osd_op_stats_t stats, recovery_stats;

View File

@ -3,7 +3,6 @@
#include <stdio.h>
#include <stdlib.h>
#include "addr_util.h"
#include "msgr_rdma.h"
#include "messenger.h"
@ -77,7 +76,7 @@ static bool is_ipv4_gid(ibv_gid_entry *gidx)
((uint32_t*)gidx->gid.raw)[2] == 0xffff0000);
}
static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
static bool match_gid(ibv_gid_entry *gidx, const addr_mask_t *networks, int nnet)
{
if (gidx->gid_type != IBV_GID_TYPE_ROCE_V1 &&
gidx->gid_type != IBV_GID_TYPE_ROCE_V2 ||
@ -125,7 +124,7 @@ static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, i
);
}
static matched_dev match_device(ibv_device **dev_list, addr_mask_t *networks, int nnet, int log_level)
static matched_dev match_device(ibv_device **dev_list, const addr_mask_t *networks, int nnet, int log_level)
{
matched_dev best;
ibv_device_attr attr;
@ -201,7 +200,7 @@ cleanup:
}
#endif
msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, int gid_index, uint32_t mtu, bool odp, int log_level)
msgr_rdma_context_t *msgr_rdma_context_t::create(const std::vector<addr_mask_t> & osd_network_masks, const char *ib_devname, uint8_t ib_port, int gid_index, uint32_t mtu, bool odp, int log_level)
{
int res;
ibv_device **dev_list = NULL;
@ -242,14 +241,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
}
}
#ifdef IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
else if (osd_networks.size())
else if (osd_network_masks.size())
{
std::vector<addr_mask_t> nets;
for (auto & netstr: osd_networks)
{
nets.push_back(cidr_parse(netstr));
}
auto best = match_device(dev_list, nets.data(), nets.size(), log_level);
auto best = match_device(dev_list, osd_network_masks.data(), osd_network_masks.size(), log_level);
if (best.dev == -2)
{
best.dev = 0;
@ -469,9 +463,10 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
.port_num = ctx->ib_port,
};
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS))
int r = 0;
if ((r = ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) != 0)
{
fprintf(stderr, "Failed to switch RDMA queue pair to INIT state\n");
fprintf(stderr, "Failed to switch RDMA queue pair to INIT state: %s (code %d)\n", strerror(r), r);
delete conn;
return NULL;
}
@ -522,18 +517,19 @@ int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
.rnr_retry = 7,
};
// FIXME No idea if ibv_modify_qp is a blocking operation or not. No idea if it has a timeout and what it is.
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER))
int r;
if ((r = ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) != 0)
{
fprintf(stderr, "Failed to switch RDMA queue pair to RTR (ready-to-receive) state\n");
return 1;
fprintf(stderr, "Failed to switch RDMA queue pair to RTR (ready-to-receive) state: %s (code %d)\n", strerror(r), r);
return -r;
}
attr.qp_state = IBV_QPS_RTS;
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC))
if ((r = ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) != 0)
{
fprintf(stderr, "Failed to switch RDMA queue pair to RTS (ready-to-send) state\n");
return 1;
fprintf(stderr, "Failed to switch RDMA queue pair to RTS (ready-to-send) state: %s (code %d)\n", strerror(r), r);
return -r;
}
return 0;
}
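A note on the error handling introduced here: libibverbs' ibv_modify_qp() returns 0 on success and a positive errno value on failure (it does not return -1 and set errno), so the return value can be passed straight to strerror() and propagated negated, as the hunks above do. The idiom in isolation:

    #include <stdio.h>
    #include <string.h>
    #include <infiniband/verbs.h>

    // Sketch: perform one QP state transition and report errors errno-style.
    static int modify_qp_checked(ibv_qp *qp, ibv_qp_attr *attr, int mask, const char *what)
    {
        int r = ibv_modify_qp(qp, attr, mask);
        if (r != 0)
            fprintf(stderr, "Failed to switch RDMA queue pair to %s: %s (code %d)\n", what, strerror(r), r);
        return -r; // 0 on success, negative errno on failure
    }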

View File

@ -5,6 +5,7 @@
#include <infiniband/verbs.h>
#include <string>
#include <vector>
#include "addr_util.h"
struct msgr_rdma_address_t
{
@ -36,7 +37,7 @@ struct msgr_rdma_context_t
int max_cqe = 0;
int used_max_cqe = 0;
static msgr_rdma_context_t *create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, int gid_index, uint32_t mtu, bool odp, int log_level);
static msgr_rdma_context_t *create(const std::vector<addr_mask_t> & osd_network_masks, const char *ib_devname, uint8_t ib_port, int gid_index, uint32_t mtu, bool odp, int log_level);
~msgr_rdma_context_t();
};

View File

@ -187,6 +187,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
return true;
}
assert(cl->peer_state != PEER_RDMA);
if (ringloop && !use_sync_send_recv)
{
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;

View File

@ -1194,7 +1194,7 @@ protected:
}
else
{
if (cur_op->opcode == OSD_OP_WRITE && watch->cfg.readonly)
if (cur_op->opcode == OSD_OP_WRITE && !inode && watch->cfg.readonly)
{
cur_op->retval = -EROFS;
std::function<void(cluster_op_t*)>(cur_op->callback)(cur_op);

View File

@ -294,7 +294,9 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
qemu_mutex_lock(&client->mutex);
vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_schedule_uring_handler(client);
#endif
qemu_mutex_unlock(&client->mutex);
while (!task->complete)
@ -566,6 +568,22 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
static void vitastor_close(BlockDriverState *bs)
{
VitastorClient *client = bs->opaque;
if (client->uring_eventfd >= 0)
{
// clear the eventfd handler
universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, NULL, NULL, NULL);
int wait_bh = 0;
qemu_mutex_lock(&client->mutex);
// clear uring_eventfd itself to prevent future scheduling of new B/H
client->uring_eventfd = -1;
wait_bh = client->bh_uring_scheduled;
qemu_mutex_unlock(&client->mutex);
if (wait_bh)
{
// wait until the already scheduled B/H has run
BDRV_POLL_WHILE(bs, client->bh_uring_scheduled);
}
}
vitastor_c_destroy(client->proxy);
if (client->fds)
{
@ -749,7 +767,9 @@ static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
qemu_mutex_lock(&client->mutex);
vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_schedule_uring_handler(client);
#endif
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
@ -783,7 +803,9 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
qemu_mutex_lock(&client->mutex);
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_schedule_uring_handler(client);
#endif
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
@ -863,7 +885,9 @@ static int coroutine_fn vitastor_co_block_status(
task.bitmap = client->last_bitmap = NULL;
qemu_mutex_lock(&client->mutex);
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_schedule_uring_handler(client);
#endif
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
{
@ -950,7 +974,9 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
qemu_mutex_lock(&client->mutex);
vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
#if !defined VITASTOR_C_API_VERSION || VITASTOR_C_API_VERSION < 5
vitastor_schedule_uring_handler(client);
#endif
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
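The vitastor_close() hunk above addresses a destruction race: a bottom half scheduled from the uring eventfd could still run after the driver state is freed. The ordering the new code enforces, schematically (names as in the hunk):

    // 1. universal_aio_set_fd_handler(ctx, uring_eventfd, NULL, NULL, NULL); // stop fd events
    // 2. under client->mutex: set uring_eventfd = -1 (no new B/H gets scheduled)
    //    and remember whether one is still in flight (bh_uring_scheduled)
    // 3. if so, drain it: BDRV_POLL_WHILE(bs, client->bh_uring_scheduled)
    // 4. only then call vitastor_c_destroy(client->proxy)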

View File

@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 1.11.0
Version: 2.0.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@ -127,6 +127,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
self->ringloop = ringloop;
self->cli = new cluster_client_t(self->ringloop, self->tfd, cfg_json);
ringloop->loop();
return self;
}
@ -150,6 +151,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
self->ringloop = ringloop;
self->epmgr = new epoll_manager_t(self->ringloop);
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
ringloop->loop();
return self;
}
@ -183,6 +185,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
self->ringloop = ringloop;
self->epmgr = new epoll_manager_t(self->ringloop);
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
ringloop->loop();
return self;
}
@ -228,6 +231,10 @@ void vitastor_c_on_ready(vitastor_c *client, VitastorIOHandler cb, void *opaque)
{
cb(opaque, 0);
});
if (client->ringloop)
{
client->ringloop->loop();
}
}
void vitastor_c_uring_wait_ready(vitastor_c *client)
@ -284,6 +291,10 @@ void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64
delete op;
};
client->cli->execute(op);
if (client->ringloop)
{
client->ringloop->loop();
}
}
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
@ -305,6 +316,10 @@ void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint6
delete op;
};
client->cli->execute(op);
if (client->ringloop)
{
client->ringloop->loop();
}
}
void vitastor_c_delete(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
@ -322,6 +337,10 @@ void vitastor_c_delete(vitastor_c *client, uint64_t inode, uint64_t offset, uint
delete op;
};
client->cli->execute(op);
if (client->ringloop)
{
client->ringloop->loop();
}
}
void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
@ -344,6 +363,10 @@ void vitastor_c_read_bitmap(vitastor_c *client, uint64_t inode, uint64_t offset,
delete op;
};
client->cli->execute(op);
if (client->ringloop)
{
client->ringloop->loop();
}
}
void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
@ -356,6 +379,10 @@ void vitastor_c_sync(vitastor_c *client, VitastorIOHandler cb, void *opaque)
delete op;
};
client->cli->execute(op);
if (client->ringloop)
{
client->ringloop->loop();
}
}
void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler cb, void *opaque)
@ -365,6 +392,10 @@ void vitastor_c_watch_inode(vitastor_c *client, char *image, VitastorIOHandler c
auto watch = client->cli->st_cli.watch_inode(std::string(image));
cb(opaque, (long)watch);
});
if (client->ringloop)
{
client->ringloop->loop();
}
}
void vitastor_c_close_watch(vitastor_c *client, void *handle)

View File

@ -7,7 +7,7 @@
#define VITASTOR_QEMU_PROXY_H
// C API wrapper version
#define VITASTOR_C_API_VERSION 4
#define VITASTOR_C_API_VERSION 5
#ifndef POOL_ID_BITS
#define POOL_ID_BITS 16

View File

@ -160,11 +160,12 @@ static const char* help_text =
"vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>\n"
" Set OSD reweight, tags or noout flag.\n"
"\n"
"vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]\n"
"vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs|pgs [OPTIONS] [state1+state2] [^state3] [...]\n"
" List PGs with any of listed state filters (^ or ! in the beginning is negation). Options:\n"
" --pool <pool name or number> Only list PGs of the given pool.\n"
" --min <min pg number> Only list PGs with number >= min.\n"
" --max <max pg number> Only list PGs with number <= max.\n"
" --osd 1,2,... Only list PGs with some data on specified OSD(s).\n"
" Examples:\n"
" vitastor-cli pg-list active+degraded\n"
" vitastor-cli pg-list ^active\n"
@ -186,7 +187,8 @@ static const char* help_text =
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
" --used_for_app fs:<name> Mark pool as used for VitastorFS with metadata in image <name>\n"
" --used_for_app s3:<name> Mark pool as used for S3 location with name <name>\n"
" --pg_stripe_size <number> Increase object grouping stripe\n"
" --max_osd_combinations 10000 Maximum number of random combinations for LP solver input\n"
" --wait Wait for the new pool to come online\n"
@ -198,7 +200,7 @@ static const char* help_text =
"vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]\n"
" Modify an existing pool. Modifiable parameters:\n"
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_app <type>:<name>]\n"
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
" [--level_placement <rules>] [--raw_placement <rules>]\n"
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
@ -482,7 +484,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
cfg["osd_num"] = cmd[1];
action_cb = p->start_modify_osd(cfg);
}
else if (cmd[0] == "pg-list" || cmd[0] == "pg-ls" || cmd[0] == "list-pg" || cmd[0] == "ls-pg" || cmd[0] == "ls-pgs")
else if (cmd[0] == "pg-list" || cmd[0] == "pg-ls" || cmd[0] == "list-pg" || cmd[0] == "ls-pg" || cmd[0] == "ls-pgs" || cmd[0] == "pgs")
{
// List PGs
if (cmd.size() > 1)

View File

@ -98,5 +98,3 @@ std::string format_lat(uint64_t lat);
std::string format_q(double depth);
bool stupid_glob(const std::string str, const std::string glob);
std::string implode(const std::string & sep, json11::Json array);

View File

@ -92,12 +92,12 @@ struct image_creator_t
{
new_pool_id = pools.begin()->first;
}
if (new_pool_id && !pools.at(new_pool_id).used_for_fs.empty() && !force)
if (new_pool_id && !pools.at(new_pool_id).used_for_app.empty() && !force)
{
result = (cli_result_t){
.err = EINVAL,
.text = "Pool "+pools.at(new_pool_id).name+
" is used for VitastorFS "+pools.at(new_pool_id).used_for_fs+
" is used for application "+pools.at(new_pool_id).used_for_app+
". Use --force if you really know what you are doing",
};
state = 100;

View File

@ -7,6 +7,7 @@
#include "epoll_manager.h"
#include "pg_states.h"
#include "str_util.h"
#include "json_util.h"
struct placement_osd_t
{

View File

@ -5,12 +5,14 @@
#include "cluster_client.h"
#include "pg_states.h"
#include "str_util.h"
#include "json_util.h"
struct pg_lister_t
{
cli_tool_t *parent;
uint64_t pool_id = 0;
std::set<osd_num_t> osd_nums;
std::string pool_name;
std::vector<std::string> pg_state;
uint64_t min_pg_num = 0;
@ -103,7 +105,7 @@ resume_1:
{
if (pg_state_names[i] == bit)
{
mask |= (uint64_t)1 << i;
mask |= pg_state_bits[i];
found = true;
break;
}
@ -137,6 +139,22 @@ resume_1:
{
continue;
}
if (osd_nums.size())
{
bool found = false;
for (int i = 0; !found && i < pgp.second.target_set.size(); i++)
if (osd_nums.find(pgp.second.target_set[i]) != osd_nums.end())
found = true;
for (int i = 0; !found && i < pgp.second.target_history.size(); i++)
for (int j = 0; !found && j < pgp.second.target_history[i].size(); j++)
if (osd_nums.find(pgp.second.target_history[i][j]) != osd_nums.end())
found = true;
for (int i = 0; !found && i < pgp.second.all_peers.size(); i++)
if (osd_nums.find(pgp.second.all_peers[i]) != osd_nums.end())
found = true;
if (!found)
continue;
}
if (masks.size())
{
bool found = false;
@ -156,7 +174,7 @@ resume_1:
json11::Json::array state_names;
for (int i = 0; i < pg_state_bit_count; i++)
{
if (pgp.second.cur_state & (1 << i))
if (pgp.second.cur_state & pg_state_bits[i])
{
state_names.push_back(std::string(pg_state_names[i]));
}
@ -274,6 +292,14 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_pg_list(json11::Json cfg)
pg_lister->pg_state.push_back(cfg["pg_state"].string_value());
pg_lister->min_pg_num = cfg["min"].uint64_value();
pg_lister->max_pg_num = cfg["max"].uint64_value();
if (cfg["osd"].is_array())
for (auto & osd_num_json: cfg["osd"].array_items())
pg_lister->osd_nums.insert(osd_num_json.uint64_value());
else if (cfg["osd"].is_string())
for (auto & osd_num_str: explode(",", cfg["osd"].string_value(), true))
pg_lister->osd_nums.insert(stoull_full(osd_num_str));
else if (cfg["osd"].uint64_value())
pg_lister->osd_nums.insert(cfg["osd"].uint64_value());
return [pg_lister](cli_result_t & result)
{
pg_lister->loop();
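The --osd filter parsed above accepts a JSON array, a comma-separated string, or a single number, and a PG matches if any listed OSD appears in its target_set, in any target_history set, or in all_peers. The same file also switches state-mask construction from (1 << i) to pg_state_bits[i], so filters stay correct where a state's bit value does not equal its index in pg_state_names. Example invocations (OSD numbers and pool name are illustrative):

    vitastor-cli pg-list --osd 1,2
    vitastor-cli pg-list --pool testpool --osd 5 ^active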

View File

@ -90,8 +90,8 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
value = sz;
}
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
key == "raw_placement")
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_app" ||
key == "used_for_fs" || key == "raw_placement")
{
if (!value.is_string())
{
@ -156,8 +156,13 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
{
new_cfg.erase("parity_chunks");
}
if (new_cfg.find("used_for_fs") != new_cfg.end() && new_cfg["used_for_fs"].string_value() == "")
if (new_cfg.find("used_for_app") != new_cfg.end() && new_cfg["used_for_app"].string_value() == "")
{
new_cfg.erase("used_for_app");
}
if (new_cfg.find("used_for_app") == new_cfg.end() && new_cfg.find("used_for_fs") != new_cfg.end())
{
new_cfg["used_for_app"] = "fs:"+new_cfg["used_for_fs"].string_value();
new_cfg.erase("used_for_fs");
}
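This normalization keeps old pool configs valid: an empty used_for_app is dropped, and a legacy used_for_fs value is rewritten into the new combined key. Illustrative before/after values:

    { "used_for_fs": "fsmeta" }        -> { "used_for_app": "fs:fsmeta" }
    { "used_for_app": "s3:somename" }  -> kept as-is
    { "used_for_app": "" }             -> key removed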

View File

@ -10,6 +10,7 @@
#include "epoll_manager.h"
#include "pg_states.h"
#include "str_util.h"
#include "json_util.h"
struct pool_creator_t
{

View File

@ -5,6 +5,7 @@
#include "cli.h"
#include "cluster_client.h"
#include "str_util.h"
#include "json_util.h"
#include "pg_states.h"
// List pools with space statistics
@ -199,7 +200,9 @@ resume_1:
auto & st = pool_stats[pool_id];
for (auto & kv: pp.second.object_items())
{
if (st.find(kv.first) == st.end())
if (kv.first == "used_for_fs" && st.find("used_for_app") == st.end())
st["used_for_app"] = "fs:"+kv.second.string_value();
else if (st.find(kv.first) == st.end())
st[kv.first] = kv.second;
}
}
@ -493,7 +496,7 @@ resume_3:
{ "name", "Name" },
{ "id", "ID" },
{ "scheme_name", "Scheme" },
{ "used_for_fs", "Used for VitastorFS" },
{ "used_for_app", "Used for app" },
{ "status", "Status" },
{ "backfillfull_fmt", "Backfillfull" },
{ "pg_count_fmt", "PGs" },
@ -663,19 +666,3 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_pool_ls(json11::Json cfg)
return false;
};
}
std::string implode(const std::string & sep, json11::Json array)
{
if (array.is_number() || array.is_bool() || array.is_string())
{
return array.as_string();
}
std::string res;
bool first = true;
for (auto & item: array.array_items())
{
res += (first ? item.as_string() : sep+item.as_string());
first = false;
}
return res;
}

View File

@ -112,19 +112,21 @@ resume_1:
return;
}
if (new_cfg.find("used_for_fs") != new_cfg.end() && !force)
if (new_cfg.find("used_for_app") != new_cfg.end() && !force)
{
// Check that pool doesn't have images
auto img_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(pool_id, 0));
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id &&
img_it->second.name == new_cfg["used_for_fs"].string_value())
if (img_it != parent->cli->st_cli.inode_config.end() &&
INODE_POOL(img_it->first) == pool_id &&
new_cfg["used_for_app"].string_value().substr(0, 3) == "fs:" &&
img_it->second.name == new_cfg["used_for_app"].string_value().substr(3))
{
// Only allow metadata image to exist in the FS pool
img_it++;
}
if (img_it != parent->cli->st_cli.inode_config.end() && INODE_POOL(img_it->first) == pool_id)
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+pool_name+" has block images, delete them before using it for VitastorFS" };
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+pool_name+" has block images, delete them before using it for VitastorFS, S3 or another app" };
state = 100;
return;
}

View File

@ -69,11 +69,11 @@ struct rm_inode_t
});
if (min_offset == 0 && max_offset == 0)
{
total_count += objects.size();
total_count += rm->objects.size();
}
else
{
for (object_id oid: objects)
for (object_id oid: rm->objects)
{
if (oid.stripe >= min_offset && (!max_offset || oid.stripe < max_offset))
{

View File

@ -138,7 +138,7 @@ struct wildcard_remover_t
if (inode_it != parent->cli->st_cli.inode_config.end())
fprintf(stderr, "Warning: image %s modified by someone else during deletion, restarting wildcard deletion\n", inode_it->second.name.c_str());
else
fprintf(stderr, "Warning: inode %lx modified by someone else during deletion, retrying wildcard deletion\n", irev.inode_num);
fprintf(stderr, "Warning: inode %jx modified by someone else during deletion, retrying wildcard deletion\n", irev.inode_num);
goto resume_0;
}
}
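The %lx to %jx switch here (and the %lu to %ju one in the OSD main() hunk below) is a portability fix: %l* takes a long, which is 32 bits wide on 32-bit platforms, while %j* takes intmax_t/uintmax_t and always fits a 64-bit value. A minimal illustration:

    #include <inttypes.h>
    #include <stdio.h>

    int main()
    {
        uint64_t inode_num = 0x1000000000001; // hypothetical inode id
        printf("inode %jx\n", (uintmax_t)inode_num); // correct on 32- and 64-bit
        return 0;
    }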

View File

@ -5,7 +5,7 @@ project(vitastor)
# vitastor-disk
add_executable(vitastor-disk
disk_tool.cpp disk_simple_offsets.cpp
disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp
disk_tool_discard.cpp disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp
disk_tool_resize_auto.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
)

View File

@ -68,6 +68,8 @@ static const char *help_text =
" --data_device_block 4k Override data device block size\n"
" --meta_device_block 4k Override metadata device block size\n"
" --journal_device_block 4k Override journal device block size\n"
" --discard_on_start 0 TRIM unused data device blocks every OSD start (default off)\n"
" --min_discard_size 1M Minimum TRIM block size\n"
" --json Enable JSON output\n"
" \n"
" immediate_commit setting is automatically derived from \"disable fsync\" options.\n"
@ -128,6 +130,12 @@ static const char *help_text =
" SIZE may include k/m/g/t suffixes. If any of the new layout parameter\n"
" options are not specified, old values will be used.\n"
"\n"
"vitastor-disk trim <osd_num>|<osd_device> [<osd_num>|<osd_device>...]\n"
" Try to discard unused blocks (SSD TRIM) on the data device of each of the OSD(s).\n"
" May only be used on stopped OSDs. Options:\n"
" --min_discard_size 1M Minimum TRIM block size\n"
" --discard_granularity 0 Override device's discard granularity\n"
"\n"
"vitastor-disk start|stop|restart|enable|disable [--now] <device> [device2 device3 ...]\n"
" Manipulate Vitastor OSDs using systemd by their device paths.\n"
" Commands are passed to systemctl with vitastor-osd@<num> units as arguments.\n"
@ -428,6 +436,19 @@ int main(int argc, char *argv[])
disk_tool_simple_offsets(self.options, self.json);
return 0;
}
else if (!strcmp(cmd[0], "trim"))
{
if (cmd.size() < 2)
{
fprintf(stderr, "OSD number(s) or device path(s) are required\n");
return 1;
}
for (int i = 1; i < cmd.size(); i++)
{
self.trim_data(cmd[i]);
}
return 0;
}
else if (!strcmp(cmd[0], "udev"))
{
if (cmd.size() != 2)
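Example invocations of the new command, matching the help text above (OSD numbers and device paths are interchangeable, values are illustrative):

    vitastor-disk trim 1 2 3
    vitastor-disk trim /dev/vitastor/osd1-data --min_discard_size 1M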

View File

@ -60,7 +60,7 @@ struct disk_tool_t
bool first_block, first_entry;
allocator *data_alloc;
allocator_t *data_alloc;
std::map<uint64_t, uint64_t> data_remap;
std::map<uint64_t, uint64_t>::iterator remap_it;
ring_loop_t *ringloop;
@ -86,10 +86,10 @@ struct disk_tool_t
int dump_journal();
void dump_journal_entry(int num, journal_entry *je, bool json);
int process_journal(std::function<int(void*)> block_fn);
int process_journal(std::function<int(void*)> block_fn, bool do_open = true);
int process_journal_block(void *buf, std::function<void(int, journal_entry*)> iter_fn);
int process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn);
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn, bool do_open = true);
int dump_meta();
void dump_meta_header(blockstore_meta_header_v2_t *hdr);
@ -123,6 +123,7 @@ struct disk_tool_t
int pre_exec_osd(std::string device);
int purge_devices(const std::vector<std::string> & devices);
int clear_osd_superblock(const std::string & dev);
int trim_data(std::string device);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true, bool ignore_nonref = false);
uint32_t write_osd_superblock(std::string device, json11::Json params);

View File

@ -0,0 +1,89 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "disk_tool.h"
#include "str_util.h"
#include "json_util.h"
int disk_tool_t::trim_data(std::string device)
{
int r;
// Parse parameters
if (stoull_full(device))
device = "/dev/vitastor/osd"+device+"-data";
json11::Json sb = read_osd_superblock(device, true, false);
if (sb.is_null())
return 1;
auto sb_params = json_to_string_map(sb["params"].object_items());
if (options["discard_granularity"] != "")
sb_params["discard_granularity"] = options["discard_granularity"];
if (options["min_discard_size"] != "")
sb_params["min_discard_size"] = options["min_discard_size"];
try
{
dsk.parse_config(sb_params);
}
catch (std::exception & e)
{
fprintf(stderr, "Error: %s\n", e.what());
return 1;
}
if (!dsk.discard_granularity && sb["real_data_device"].string_value().substr(0, 5) == "/dev/")
{
auto dg = read_file("/sys/block/"+sb["real_data_device"].string_value().substr(5)+"/queue/discard_granularity", true);
if (dg != "")
dsk.discard_granularity = parse_size(trim(dg));
}
// Open devices
try
{
if (options["io"] != "")
dsk.data_io = dsk.meta_io = dsk.journal_io = options["io"];
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths();
}
catch (std::exception & e)
{
dsk.close_all();
fprintf(stderr, "Error: %s\n", e.what());
return 1;
}
// Fill allocator
fprintf(stderr, "Reading metadata\n");
data_alloc = new allocator_t(dsk.block_count);
r = process_meta(
[this](blockstore_meta_header_v2_t *hdr) {},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
data_alloc->set(block_num, true);
},
false
);
if (r != 0)
{
dsk.close_all();
return r;
}
fprintf(stderr, "Reading journal\n");
r = process_journal([this](void *buf)
{
return process_journal_block(buf, [this](int num, journal_entry *je)
{
if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
{
data_alloc->set(je->big_write.location / dsk.data_block_size, true);
}
});
}, false);
if (r != 0)
{
dsk.close_all();
return r;
}
// Trim
r = dsk.trim_data(data_alloc);
dsk.close_all();
return r == 0;
}
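trim_data() marks every block referenced by the metadata area or by big-write journal entries as used, then delegates the actual discard to dsk.trim_data(), whose implementation is not part of this diff. Presumably it walks the allocator and discards the unused runs; a minimal sketch of that step under that assumption, using the standard Linux BLKDISCARD ioctl (used(i) stands in for the allocator lookup):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h> // BLKDISCARD

    // Sketch: discard every contiguous run of unused data blocks.
    static int discard_unused(int fd, uint64_t data_offset, uint64_t block_size,
        uint64_t block_count, bool (*used)(uint64_t))
    {
        for (uint64_t i = 0; i < block_count; )
        {
            if (used(i)) { i++; continue; }
            uint64_t start = i;
            while (i < block_count && !used(i))
                i++;
            uint64_t range[2] = { data_offset + start*block_size, (i-start)*block_size };
            if (ioctl(fd, BLKDISCARD, &range) < 0) // offset, length in bytes
            {
                perror("BLKDISCARD");
                return 1;
            }
        }
        return 0;
    }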

View File

@ -119,13 +119,21 @@ int disk_tool_t::dump_journal()
return 0;
}
int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
int disk_tool_t::process_journal(std::function<int(void*)> block_fn, bool do_open)
{
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.journal_fd < 0)
if (do_open)
{
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
return 1;
if (dsk.journal_fd >= 0)
{
fprintf(stderr, "Bug: journal device is already opened\n");
return 1;
}
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.journal_fd < 0)
{
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
return 1;
}
}
void *data = memalign_or_die(MEM_ALIGNMENT, dsk.journal_block_size);
journal_pos = 0;
@ -170,8 +178,11 @@ int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
break;
}
}
close(dsk.journal_fd);
dsk.journal_fd = -1;
if (do_open)
{
close(dsk.journal_fd);
dsk.journal_fd = -1;
}
free(data);
return r;
}

View File

@ -7,18 +7,26 @@
#include "json_util.h"
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn)
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn, bool do_open)
{
if (dsk.meta_block_size % DIRECT_IO_ALIGNMENT)
{
fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
return 1;
}
dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.meta_fd < 0)
if (do_open)
{
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
return 1;
if (dsk.meta_fd >= 0)
{
fprintf(stderr, "Bug: Metadata device is already opened\n");
return 1;
}
dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.meta_fd < 0)
{
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
return 1;
}
}
int buf_size = 1024*1024;
if (buf_size % dsk.meta_block_size)
@ -47,8 +55,11 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
{
fprintf(stderr, "I don't know checksum format %u, the only supported format is crc32c = %u.\n", hdr->data_csum_type, BLOCKSTORE_CSUM_CRC32C);
free(data);
close(dsk.meta_fd);
dsk.meta_fd = -1;
if (do_open)
{
close(dsk.meta_fd);
dsk.meta_fd = -1;
}
return 1;
}
}
@ -57,8 +68,11 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
// Unsupported version
fprintf(stderr, "Metadata format is too new for me (stored version is %ju, max supported %u).\n", hdr->version, BLOCKSTORE_META_FORMAT_V2);
free(data);
close(dsk.meta_fd);
dsk.meta_fd = -1;
if (do_open)
{
close(dsk.meta_fd);
dsk.meta_fd = -1;
}
return 1;
}
if (hdr->meta_block_size != dsk.meta_block_size)
@ -145,8 +159,11 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
}
}
free(data);
close(dsk.meta_fd);
dsk.meta_fd = -1;
if (do_open)
{
close(dsk.meta_fd);
dsk.meta_fd = -1;
}
return 0;
}

View File

@ -26,6 +26,9 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
"throttle_target_mbs",
"throttle_target_parallelism",
"throttle_threshold_us",
"discard_on_start",
"min_discard_size",
"discard_granularity",
};
if (options.find("force") == options.end())
{

View File

@ -25,9 +25,9 @@ int disk_tool_t::raw_resize()
r = resize_parse_params();
if (r != 0)
return r;
// Check parameters and fill allocator
// Fill allocator
fprintf(stderr, "Reading metadata\n");
data_alloc = new allocator((new_data_len < dsk.data_len ? dsk.data_len : new_data_len) / dsk.data_block_size);
data_alloc = new allocator_t((new_data_len < dsk.data_len ? dsk.data_len : new_data_len) / dsk.data_block_size);
r = process_meta(
[this](blockstore_meta_header_v2_t *hdr)
{

View File

@ -79,28 +79,59 @@ int disk_tool_t::upgrade_simple_unit(std::string unit)
{
// Resize data
uint64_t blk = stoull_full(options["block_size"]);
blk = blk ? blk : 128*1024;
blk = blk ? blk : (1 << DEFAULT_DATA_BLOCK_ORDER);
std::map<std::string, uint64_t> resize;
if (d_o < 4096 || m_is_d && m_o < 4096 && m_o < d_o || j_is_d && j_o < 4096 && j_o < d_o)
{
resize["new_data_offset"] = d_o+blk;
d_o += blk;
if (m_is_d && m_o < d_o)
resize["new_meta_offset"] = m_o+blk;
m_o += blk;
if (j_is_d && j_o < d_o)
resize["new_journal_offset"] = j_o+blk;
j_o += blk;
}
if (!m_is_d && m_o < 4096)
{
resize["new_meta_offset"] = m_o+4096;
m_o += 4096;
if (j_is_m && m_o < j_o)
resize["new_journal_offset"] = j_o+4096;
j_o += 4096;
}
if (!j_is_d && !j_is_m && j_o < 4096)
resize["new_journal_offset"] = j_o+4096;
j_o += 4096;
if (options["meta_format"] == "" || options["meta_format"] == "1")
{
blockstore_disk_t dsk;
options["meta_format"] = std::to_string(BLOCKSTORE_META_FORMAT_V2);
try
{
dsk.parse_config(options);
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
dsk.close_all();
}
catch (std::exception & e)
{
dsk.close_all();
fprintf(stderr, "Error: %s\n", e.what());
return 1;
}
options.erase("meta_format");
if (m_is_d && m_o < d_o && d_o-m_o < dsk.meta_len)
d_o += ((dsk.meta_len - (d_o-m_o) + blk-1) / blk) * blk;
}
resize["new_data_offset"] = d_o;
resize["new_meta_offset"] = m_o;
resize["new_journal_offset"] = j_o;
disk_tool_t resizer;
resizer.options = options;
for (auto & kv: resize)
resizer.options[kv.first] = std::to_string(kv.second);
std::string cmd;
for (auto & kv: resizer.options)
if (kv.second != "")
cmd += " "+kv.first+" = "+kv.second+"\n";
fprintf(stderr, "Running resize:\n%s", cmd.c_str());
if (resizer.raw_resize() != 0)
{
// FIXME: Resize with backup or journal

View File

@ -245,10 +245,10 @@ void kv_fs_state_t::init(nfs_proxy_t *proxy, json11::Json cfg)
if (proxy->cli->st_cli.inode_config.find(fs_kv_inode) != proxy->cli->st_cli.inode_config.end())
{
auto & name = proxy->cli->st_cli.inode_config.at(fs_kv_inode).name;
if (pool_cfg.used_for_fs != name)
if (pool_cfg.used_for_app != "fs:"+name)
{
fprintf(stderr, "Please mark pool as used for this file system with `vitastor-cli modify-pool --used-for-fs %s %s`\n",
name.c_str(), cfg["fs"].string_value().c_str());
fprintf(stderr, "Please mark pool as used for this file system with `vitastor-cli modify-pool --used-for-app fs:%s %s`\n",
name.c_str(), pool_cfg.name.c_str());
exit(1);
}
}

View File

@ -120,7 +120,9 @@ osd_t::~osd_t()
delete epmgr;
if (bs)
delete bs;
close(listen_fd);
for (auto listen_fd: listen_fds)
close(listen_fd);
listen_fds.clear();
free(zero_buffer);
}
@ -162,9 +164,6 @@ void osd_t::parse_config(bool init)
else
immediate_commit = IMMEDIATE_NONE;
// Bind address
bind_address = config["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = config["bind_port"].uint64_value();
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
@ -186,6 +185,7 @@ void osd_t::parse_config(bool init)
no_recovery = json_is_true(config["no_recovery"]);
auto old_no_scrub = no_scrub;
no_scrub = json_is_true(config["no_scrub"]);
allow_net_split = json_is_true(config["allow_net_split"]);
auto old_autosync_interval = autosync_interval;
if (!config["autosync_interval"].is_null())
{
@ -321,41 +321,30 @@ void osd_t::parse_config(bool init)
void osd_t::bind_socket()
{
if (config["osd_network"].is_string() ||
config["osd_network"].is_array())
if (msgr.all_osd_network_masks.size())
{
std::vector<std::string> mask;
if (config["osd_network"].is_string())
mask.push_back(config["osd_network"].string_value());
else
for (auto v: config["osd_network"].array_items())
mask.push_back(v.string_value());
auto matched_addrs = getifaddr_list(mask);
if (matched_addrs.size() > 1)
bind_addresses = getifaddr_list(msgr.all_osd_network_masks);
if (!bind_addresses.size())
{
fprintf(stderr, "More than 1 address matches requested network(s): %s\n", json11::Json(matched_addrs).dump().c_str());
force_stop(1);
}
if (!matched_addrs.size())
{
std::string nets;
for (auto v: mask)
nets += (nets == "" ? v : ","+v);
auto nets = implode(", ", msgr.all_osd_networks);
fprintf(stderr, "Addresses matching osd_network(s) %s not found\n", nets.c_str());
force_stop(1);
}
bind_address = matched_addrs[0];
}
// FIXME Support multiple listening sockets
listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
else
{
msgr.accept_connections(listen_fd);
});
bind_addresses.push_back("0.0.0.0");
}
for (auto & bind_address: bind_addresses)
{
int listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
{
msgr.accept_connections(fd);
});
listen_fds.push_back(listen_fd);
}
}
bool osd_t::shutdown()
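With this change the OSD opens one listening socket per local address matched by the configured networks instead of aborting when more than one address matches, and it falls back to 0.0.0.0 when nothing is configured. An illustrative config fragment (CIDR values are hypothetical):

    { "osd_network": [ "10.0.0.0/24", "10.1.0.0/24" ] }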

View File

@ -106,7 +106,7 @@ class osd_t
bool no_rebalance = false;
bool no_recovery = false;
bool no_scrub = false;
std::string bind_address;
bool allow_net_split = false;
int bind_port, listen_backlog = 128;
// FIXME: Implement client queue depth limit
int client_queue_depth = 128;
@ -199,7 +199,8 @@ class osd_t
epoll_manager_t *epmgr = NULL;
int listening_port = 0;
int listen_fd = 0;
std::vector<std::string> bind_addresses;
std::vector<int> listen_fds;
ring_consumer_t consumer;
// op statistics

View File

@ -165,8 +165,8 @@ json11::Json osd_t::get_osd_state()
hostname.resize(strnlen(hostname.data(), hostname.size()));
json11::Json::object st;
st["state"] = "up";
if (bind_address != "0.0.0.0")
st["addresses"] = json11::Json::array { bind_address };
if (bind_addresses.size() != 1 || bind_addresses[0] != "0.0.0.0")
st["addresses"] = bind_addresses;
else
st["addresses"] = getifaddr_list();
st["host"] = std::string(hostname.data(), hostname.size());
@ -673,7 +673,7 @@ void osd_t::apply_no_inode_stats()
std::vector<uint64_t> no_inode_stats;
for (auto & pool_item: st_cli.pool_config)
{
if (!pool_item.second.used_for_fs.empty())
if (!pool_item.second.used_for_app.empty())
{
no_inode_stats.push_back(pool_item.first);
}

View File

@ -58,7 +58,7 @@ int main(int narg, char *args[])
return 1;
}
char osdname[16] = { 0 };
snprintf(osdname, 16, "osd%lu", config["osd_num"].uint64_value());
snprintf(osdname, 16, "osd%ju", config["osd_num"].uint64_value());
prctl(PR_SET_NAME, (unsigned long)osdname, 0, 0, 0);
signal(SIGINT, handle_sigint);
signal(SIGTERM, handle_sigint);

View File

@ -199,6 +199,7 @@ void osd_t::start_pg_peering(pg_t & pg)
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
// Try to connect with current peers if they're up, but we don't have connections to them
// Otherwise we may erroneously decide that the pg is incomplete :-)
bool all_connected = true;
for (auto pg_osd: pg.all_peers)
{
if (pg_osd != this->osd_num &&
@ -206,8 +207,17 @@ void osd_t::start_pg_peering(pg_t & pg)
msgr.wanted_peers.find(pg_osd) == msgr.wanted_peers.end())
{
msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
if (!st_cli.peer_states[pg_osd].is_null())
all_connected = false;
}
}
if (!all_connected && !allow_net_split)
{
// Wait until all OSDs are either connected or their /osd/state disappears from etcd
pg.state = PG_INCOMPLETE;
report_pg_state(pg);
return;
}
// Calculate current write OSD set
pg.pg_cursize = 0;
pg.cur_set.resize(pg.target_set.size());
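This is where the new allow_net_split option takes effect: by default the PG is now parked in PG_INCOMPLETE until every alive peer from all_peers is actually connected (or its /osd/state entry disappears from etcd), so a partial network split cannot make the primary treat data on reachable-but-unconnected OSDs as lost. Setting the option restores the old behaviour of starting peering immediately; illustrative config:

    { "allow_net_split": true }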

View File

@ -722,54 +722,58 @@ resume_3:
cur_op->reply.rw.version = op_data->fact_ver;
goto continue_others;
}
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Submit deletes
op_data->fact_ver++;
submit_primary_del_subops(cur_op, NULL, 0, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
// If not already deleted:
if (op_data->fact_ver)
{
// Save version override for parallel reads
pg.ver_override[op_data->oid] = op_data->fact_ver;
// Submit deletes
op_data->fact_ver++;
submit_primary_del_subops(cur_op, NULL, 0, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set);
resume_4:
op_data->st = 4;
return;
resume_5:
if (op_data->errors > 0)
{
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
op_data->st = 4;
return;
}
// Remove version override
pg.ver_override.erase(op_data->oid);
// Adjust PG stats after "instant stabilize", because we need object_state above
if (!op_data->object_state)
{
pg.clean_count--;
}
else
{
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
deref_object_state(pg, &op_data->object_state, true);
}
// Mark PG and OSDs as dirty
for (auto & chunk: (op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set))
{
this->dirty_osds.insert(chunk.osd_num);
}
for (auto cl_it = msgr.clients.find(cur_op->peer_fd); cl_it != msgr.clients.end(); )
{
cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
break;
}
dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (immediate_commit == IMMEDIATE_NONE)
{
unstable_write_count++;
if (unstable_write_count >= autosync_writes)
resume_5:
if (op_data->errors > 0)
{
unstable_write_count = 0;
autosync();
deref_object_state(pg, &op_data->object_state, true);
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
return;
}
// Remove version override
pg.ver_override.erase(op_data->oid);
// Adjust PG stats after "instant stabilize", because we need object_state above
if (!op_data->object_state)
{
pg.clean_count--;
}
else
{
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
deref_object_state(pg, &op_data->object_state, true);
}
// Mark PG and OSDs as dirty
for (auto & chunk: (op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set))
{
this->dirty_osds.insert(chunk.osd_num);
}
for (auto cl_it = msgr.clients.find(cur_op->peer_fd); cl_it != msgr.clients.end(); )
{
cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
break;
}
dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (immediate_commit == IMMEDIATE_NONE)
{
unstable_write_count++;
if (unstable_write_count >= autosync_writes)
{
unstable_write_count = 0;
autosync();
}
}
pg.total_count--;
}
pg.total_count--;
cur_op->reply.hdr.retval = 0;
// indicate possibly unfinished (left_on_dead) deletions
cur_op->reply.del.flags = OSD_DEL_SUPPORT_LEFT_ON_DEAD;

View File

@ -46,6 +46,10 @@ void osd_t::autosync()
void osd_t::finish_op(osd_op_t *cur_op, int retval)
{
inflight_ops--;
if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
printf("%jx %jx+%jx v%jx r=%d\n", cur_op->req.rw.inode, cur_op->req.rw.offset, cur_op->req.rw.len, cur_op->reply.rw.version, retval);
}
if (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_DELETE)

Some files were not shown because too many files have changed in this diff.