Release 1.9.3

- Support custom hybrid OSD creation (`vitastor-disk prepare --hybrid --fast-devices /dev/xxx,/dev/yyy`)
- Auto-change partition paths to /dev/disk/by-partuuid/ in `vitastor-disk prepare`
- Allow selecting cached I/O in vitastor-disk commands
- Fix multiple bugs in vitastor-disk resize and add tests for them
- Fix vitastor-disk write-meta/write-journal writing to an incorrect device in superblock-based mode
- Fix vitastor-disk prepare once again sometimes not seeing new partitions
- Clean up PG history and stats of deleted pools
- Fix "is already mounted" checks in CSI
Dynamic device size in test
2024-11-07 01:28:31 +03:00 · 2024-11-06 14:16:58 +03:00 · 2024-11-06 13:52:25 +03:00 · 2024-11-06 13:30:51 +03:00 · 2024-11-06 13:30:12 +03:00 · 2024-11-06 02:58:51 +03:00
405 changed files with 19649 additions and 3538 deletions
--- a/.gitea/workflows/buildenv.Dockerfile
+++ b/.gitea/workflows/buildenv.Dockerfile
@@ -22,7 +22,7 @@ RUN apt-get update
 RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
    liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
 RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
-RUN apt-get -y install jq lp-solve sudo nfs-common
+RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted
 RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`

 RUN set -ex; \
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@@ -16,6 +16,7 @@ env:
  BUILDENV_IMAGE: git.yourcmc.ru/vitalif/vitastor/buildenv
  TEST_IMAGE: git.yourcmc.ru/vitalif/vitastor/test
  OSD_ARGS: '--etcd_quick_timeout 2000'
+  USE_RAMDISK: 1

 concurrency:
  group: ci-${{ github.ref }}
@@ -197,6 +198,24 @@ jobs:
          echo ""
        done

+  test_etcd_fail_antietcd:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: ANTIETCD=1 /root/vitastor/tests/test_etcd_fail.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
  test_interrupted_rebalance:
    runs-on: ubuntu-latest
    needs: build
@@ -539,6 +558,24 @@ jobs:
          echo ""
        done

+  test_dd:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_dd.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
  test_root_node:
    runs-on: ubuntu-latest
    needs: build
@@ -665,6 +702,24 @@ jobs:
          echo ""
        done

+  test_heal_antietcd:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 10
+      run: ANTIETCD=1 /root/vitastor/tests/test_heal.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
  test_heal_csum_32k_dmj:
    runs-on: ubuntu-latest
    needs: build
@@ -773,6 +828,60 @@ jobs:
          echo ""
        done

+  test_resize:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_resize.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_resize_auto:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_resize_auto.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_snapshot_pool2:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_snapshot_pool2.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
  test_osd_tags:
    runs-on: ubuntu-latest
    needs: build
--- a/.gitea/workflows/tests-to-yaml.pl
+++ b/.gitea/workflows/tests-to-yaml.pl
@@ -34,6 +34,10 @@ for my $line (<>)
            {
                $test_name .= '_imm';
            }
+            elsif ($1 eq 'ANTIETCD')
+            {
+                $test_name .= '_antietcd';
+            }
            else
            {
                $test_name .= '_'.lc($1).'_'.$2;
--- a/.gitignore
+++ b/.gitignore
@@ -3,16 +3,3 @@
 package-lock.json
 fio
 qemu
-osd
-stub_osd
-stub_uring_osd
-stub_bench
-osd_test
-osd_peering_pg_test
-dump_journal
-nbd_proxy
-rm_inode
-test_allocator
-test_blockstore
-test_shit
-osd_rmw_test
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "1.6.1")
+set(VITASTOR_VERSION "1.9.3")

 add_subdirectory(src)
--- a/README-ru.md
+++ b/README-ru.md
@@ -1,4 +1,4 @@
-## Vitastor
+# Vitastor

 [Read English version](README.md)

@@ -19,10 +19,10 @@ Vitastor нацелен в первую очередь на SSD и SSD+HDD кл
 TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
 что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.

-Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, Proxmox, Kubernetes.
+Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
 Другие драйверы могут также быть легко реализованы.

-Подробности смотрите в документации по ссылкам ниже.
+Подробности смотрите в документации по ссылкам. Можете начать отсюда: [Быстрый старт](docs/intro/quickstart.ru.md).

 ## Презентации и записи докладов

@@ -42,6 +42,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
 - Установка
  - [Пакеты](docs/installation/packages.ru.md)
  - [Proxmox](docs/installation/proxmox.ru.md)
+  - [OpenNebula](docs/installation/opennebula.ru.md)
  - [OpenStack](docs/installation/openstack.ru.md)
  - [Kubernetes CSI](docs/installation/kubernetes.ru.md)
  - [Сборка из исходных кодов](docs/installation/source.ru.md)
@@ -50,7 +51,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
  - Параметры
    - [Общие](docs/config/common.ru.md)
    - [Сетевые](docs/config/network.ru.md)
-    - [Клиентский код](docs/config/client.en.md)
+    - [Клиентский код](docs/config/client.ru.md)
    - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
    - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
    - [Прочие параметры OSD](docs/config/osd.ru.md)
--- a/README.md
+++ b/README.md
@@ -19,10 +19,10 @@ supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1
 with proper hardware which is ~10 times faster than other popular SDS's like Ceph
 or internal systems of public clouds.

-Vitastor supports QEMU, NBD, NFS protocols, OpenStack, Proxmox, Kubernetes drivers.
+Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
 More drivers may be created easily.

-Read more details below in the documentation.
+Read more details in the documentation. You can start from here: [Quick Start](docs/intro/quickstart.en.md).

 ## Talks and presentations

@@ -42,6 +42,7 @@ Read more details below in the documentation.
 - Installation
  - [Packages](docs/installation/packages.en.md)
  - [Proxmox](docs/installation/proxmox.en.md)
+  - [OpenNebula](docs/installation/opennebula.en.md)
  - [OpenStack](docs/installation/openstack.en.md)
  - [Kubernetes CSI](docs/installation/kubernetes.en.md)
  - [Building from Source](docs/installation/source.en.md)
--- a/copy-fio-includes.sh
+++ b/copy-fio-includes.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-gcc -I. -E -o fio_headers.i src/fio_headers.h
+gcc -I. -E -o fio_headers.i src/util/fio_headers.h

 rm -rf fio-copy
 for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do
--- a/copy-qemu-includes.sh
+++ b/copy-qemu-includes.sh
@@ -5,7 +5,7 @@
 #cd b/qemu; make qapi

 gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-    -I qemu/include -E -o qemu_driver.i src/qemu_driver.c
+    -I qemu/include -E -o qemu_driver.i src/client/qemu_driver.c

 rm -rf qemu-copy
 for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,9 +1,9 @@
-VERSION ?= v1.6.1
+VITASTOR_VERSION ?= v1.9.3

 all: build push

 build:
-	@docker build --rm -t vitalif/vitastor-csi:$(VERSION) .
+	@docker build --rm -t vitalif/vitastor-csi:$(VITASTOR_VERSION) .

 push:
-	@docker push vitalif/vitastor-csi:$(VERSION)
+	@docker push vitalif/vitastor-csi:$(VITASTOR_VERSION)
--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.6.1
+          image: vitalif/vitastor-csi:v1.9.3
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -121,7 +121,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.6.1
+          image: vitalif/vitastor-csi:v1.9.3
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/go.mod
+++ b/csi/go.mod
@@ -3,10 +3,10 @@ module vitastor.io/csi
 go 1.15

 require (
-	github.com/container-storage-interface/spec v1.4.0
+	github.com/container-storage-interface/spec v1.8.0
 	github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
 	github.com/kubernetes-csi/csi-lib-utils v0.9.1
-	golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
+	golang.org/x/net v0.7.0
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
 	google.golang.org/grpc v1.33.1
 	google.golang.org/protobuf v1.24.0
--- a/csi/go.sum
+++ b/csi/go.sum
@@ -41,8 +41,8 @@ github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWR
 github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
 github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
 github.com/container-storage-interface/spec v1.2.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
-github.com/container-storage-interface/spec v1.4.0 h1:ozAshSKxpJnYUfmkpZCTYyF/4MYeYlhdXbAvPvfGmkg=
-github.com/container-storage-interface/spec v1.4.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
+github.com/container-storage-interface/spec v1.8.0 h1:D0vhF3PLIZwlwZEf2eNbpujGCNwspwTYf2idJRJx4xI=
+github.com/container-storage-interface/spec v1.8.0/go.mod h1:ROLik+GhPslwwWRNFF1KasPzroNARibH2rfz1rkg4H0=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -182,6 +182,7 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
 github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
 github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
 go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
 go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
 go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
@@ -195,6 +196,7 @@ golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8U
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@@ -213,6 +215,7 @@ golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCc
 golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc=
 golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
 golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
 golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -228,8 +231,10 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
-golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb h1:eBmm0M9fYhWpKZLjQUUKka/LtIxf46G4fxeEz5KJr9U=
-golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -240,6 +245,7 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -259,13 +265,22 @@ golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
-golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
-golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
@@ -286,8 +301,10 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
 golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- a/csi/src/config.go
+++ b/csi/src/config.go
@@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.6.1"
+    vitastorCSIDriverVersion = "1.9.3"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@@ -8,11 +8,9 @@ import (
    "encoding/json"
    "fmt"
    "strings"
-    "bytes"
    "strconv"
    "time"
    "os"
-    "os/exec"
    "io/ioutil"

    "github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
@@ -114,22 +112,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, error)
    return ctxVars, nil
 }

-func system(program string, args ...string) ([]byte, []byte, error)
-{
-    klog.Infof("Running "+program+" "+strings.Join(args, " "))
-    c := exec.Command(program, args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout, c.Stderr = &stdout, &stderr
-    err := c.Run()
-    if (err != nil)
-    {
-        stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
-        klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
-        return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
-    }
-    return stdout.Bytes(), stderr.Bytes(), nil
-}
-
 func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
 {
    if (ctxVars["configPath"] != "")
@@ -158,6 +140,12 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
        return nil, status.Error(codes.InvalidArgument, "volume capabilities is a required field")
    }

+    err := cs.checkCaps(volumeCapabilities)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
    etcdVolumePrefix := req.Parameters["etcdVolumePrefix"]
    poolId, _ := strconv.ParseUint(req.Parameters["poolId"], 10, 64)
    if (poolId == 0)
@@ -301,13 +289,44 @@ func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req
        return nil, status.Error(codes.InvalidArgument, "volumeCapabilities is nil")
    }

+    err := cs.checkCaps(volumeCapabilities)
+    if (err != nil)
+    {
+        return nil, err
+    }
+
+    return &csi.ValidateVolumeCapabilitiesResponse{
+        Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
+            VolumeCapabilities: req.VolumeCapabilities,
+        },
+    }, nil
+}
+
+func (cs *ControllerServer) checkCaps(volumeCapabilities []*csi.VolumeCapability) error
+{
    var volumeCapabilityAccessModes []*csi.VolumeCapability_AccessMode
    for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
        csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
-        csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
+        csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY,
+        csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY,
+        csi.VolumeCapability_AccessMode_SINGLE_NODE_SINGLE_WRITER,
+        csi.VolumeCapability_AccessMode_SINGLE_NODE_MULTI_WRITER,
    } {
        volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
    }
+    for _, capability := range volumeCapabilities
+    {
+        if (capability.GetBlock() != nil)
+        {
+            for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
+                csi.VolumeCapability_AccessMode_MULTI_NODE_SINGLE_WRITER,
+                csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
+            } {
+                volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
+            }
+            break
+        }
+    }

    capabilitySupport := false
    for _, capability := range volumeCapabilities
@@ -323,14 +342,10 @@ func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req

    if (!capabilitySupport)
    {
-        return nil, status.Errorf(codes.NotFound, "%v not supported", req.GetVolumeCapabilities())
+        return status.Errorf(codes.NotFound, "%v not supported", volumeCapabilities)
    }

-    return &csi.ValidateVolumeCapabilitiesResponse{
-        Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
-            VolumeCapabilities: req.VolumeCapabilities,
-        },
-    }, nil
+    return nil
 }

 // ListVolumes returns a list of volumes
--- a/csi/src/nodeserver.go
+++ b/csi/src/nodeserver.go
@@ -227,7 +227,32 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
    isBlock := req.GetVolumeCapability().GetBlock() != nil

    // Check that it's not already mounted
-    _, err = mount.IsNotMountPoint(ns.mounter, targetPath)
+    notmnt, err := mount.IsNotMountPoint(ns.mounter, targetPath)
+    if (err == nil)
+    {
+        if (!notmnt)
+        {
+            klog.Errorf("target path %s is already mounted", targetPath)
+            return nil, fmt.Errorf("target path %s is already mounted", targetPath)
+        }
+        var finfo os.FileInfo
+        finfo, err = os.Stat(targetPath)
+        if (err != nil)
+        {
+            klog.Errorf("failed to stat %s: %v", targetPath, err)
+            return nil, err
+        }
+        if (finfo.IsDir() != (!isBlock))
+        {
+            err = os.Remove(targetPath)
+            if (err != nil)
+            {
+                klog.Errorf("failed to remove %s (to recreate it with correct type): %v", targetPath, err)
+                return nil, err
+            }
+            err = os.ErrNotExist
+        }
+    }
    if (err != nil)
    {
        if (os.IsNotExist(err))
@@ -280,6 +305,7 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
    if (isBlock)
    {
+        klog.Infof("bind-mounting %s to %s", devicePath, targetPath)
        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
    }
    else
@@ -309,39 +335,40 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
        readOnly := Contains(opt, "ro")
        if (existingFormat == "" && !readOnly)
        {
-            var cmdOut []byte
            switch fsType
            {
                case "ext4":
                    args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
-                    cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
+                    _, err = systemCombined("mkfs.ext4", args...)
                case "xfs":
-                    cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
+                    _, err = systemCombined("mkfs.xfs", "-K", devicePath)
            }
            if (err != nil)
            {
-                klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
                goto unmap
            }
        }

+        klog.Infof("formatting and mounting %s to %s with FS %s, options: %v", devicePath, targetPath, fsType, opt)
        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
+        if (err == nil)
+        {
+            klog.Infof("successfully mounted %s to %s", devicePath, targetPath)
+        }

        // Try to run online resize on mount.
        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
        if (err == nil && existingFormat != "" && !readOnly)
        {
-            var cmdOut []byte
            switch (fsType)
            {
                case "ext4":
-                    cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
+                    _, err = systemCombined("resize2fs", devicePath)
                case "xfs":
-                    cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
+                    _, err = systemCombined("xfs_growfs", devicePath)
            }
            if (err != nil)
            {
-                klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
                goto unmap
            }
        }
@@ -385,7 +412,7 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)

    targetPath := req.GetStagingTargetPath()
-    devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
+    devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
    if (err != nil)
    {
        if (os.IsNotExist(err))
@@ -402,6 +429,16 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
        return &csi.NodeUnstageVolumeResponse{}, nil
    }

+    refList, err := ns.mounter.GetMountRefs(targetPath)
+    if (err != nil)
+    {
+        return nil, err
+    }
+    if (len(refList) > 0)
+    {
+        klog.Warningf("%s is still referenced: %v", targetPath, refList)
+    }
+
    // unmount
    err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
    if (err != nil)
@@ -410,7 +447,7 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
    }

    // unmap device
-    if (refCount == 1)
+    if (len(refList) == 0)
    {
        if (!ns.useVduse)
        {
@@ -451,15 +488,20 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
    isBlock := req.GetVolumeCapability().GetBlock() != nil

    // Check that stagingTargetPath is mounted
-    _, err = mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
+    notmnt, err := mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
    if (err != nil)
    {
-        klog.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
-        return nil, fmt.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
+        klog.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err)
+        return nil, fmt.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err)
+    }
+    else if (notmnt)
+    {
+        klog.Errorf("staging path %v is not mounted", stagingTargetPath)
+        return nil, fmt.Errorf("staging path %v is not mounted", stagingTargetPath)
    }

    // Check that targetPath is not already mounted
-    _, err = mount.IsNotMountPoint(ns.mounter, targetPath)
+    notmnt, err = mount.IsNotMountPoint(ns.mounter, targetPath)
    if (err != nil)
    {
        if (os.IsNotExist(err))
@@ -494,6 +536,11 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
            return nil, err
        }
    }
+    else if (!notmnt)
+    {
+        klog.Errorf("target path %s is already mounted", targetPath)
+        return nil, fmt.Errorf("target path %s is already mounted", targetPath)
+    }

    execArgs := []string{"--bind", stagingTargetPath, targetPath}
    if (req.GetReadonly())
--- a/csi/src/utils.go
+++ b/csi/src/utils.go
@@ -4,6 +4,7 @@
 package vitastor

 import (
+    "bytes"
    "errors"
    "encoding/json"
    "fmt"
@@ -15,6 +16,8 @@ import (
    "syscall"

    "k8s.io/klog"
+    "google.golang.org/grpc/codes"
+    "google.golang.org/grpc/status"
 )

 func Contains(list []string, s string) bool
@@ -73,6 +76,10 @@ func checkVduseSupport() bool
            " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
        )
    }
+    else
+    {
+        klog.Infof("VDUSE support enabled successfully")
+    }
    return vduse
 }

@@ -97,6 +104,7 @@ func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, e
    {
        return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
    }
+    klog.Infof("Attached volume %s via NBD as %s", volName, dev)
    return dev, err
 }

@@ -217,6 +225,7 @@ func mapVduse(stateDir string, volName string, ctxVars map[string]string, readon
                    err = os.WriteFile(stateFile, stateJSON, 0600)
                    if (err == nil)
                    {
+                        klog.Infof("Attached volume %s via VDUSE as %s (VDPA ID %s)", volName, blockdev, vdpaId)
                        return blockdev, vdpaId, nil
                    }
                }
@@ -299,3 +308,35 @@ func unmapVduseById(stateDir, vdpaId string)
        os.Remove(pidFile)
    }
 }
+
+// system runs program with the given args and captures its stdout and stderr
+// in separate buffers.
+//
+// On success it returns (stdout, stderr, nil). If the command fails to start
+// or exits with an error, the failure is logged and (nil, nil, err) is
+// returned, where err is a gRPC status error with code Internal whose message
+// contains the captured stdout followed by stderr and the exec error text.
+func system(program string, args ...string) ([]byte, []byte, error)
+{
+    cmdline := program+" "+strings.Join(args, " ")
+    // The command line is passed as an argument, never as the format string:
+    // otherwise a '%' in program or args would be parsed as a formatting verb.
+    klog.Infof("Running %s", cmdline)
+    c := exec.Command(program, args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout, c.Stderr = &stdout, &stderr
+    err := c.Run()
+    if (err != nil)
+    {
+        stdoutStr, stderrStr := stdout.String(), stderr.String()
+        klog.Errorf("%s failed: %s\nOutput:\n%s", cmdline, err, stdoutStr+stderrStr)
+        return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
+    }
+    return stdout.Bytes(), stderr.Bytes(), nil
+}
+
+// systemCombined runs program with the given args and captures stdout and
+// stderr interleaved into a single buffer.
+//
+// On success it returns (output, nil). If the command fails to start or exits
+// with an error, the failure is logged and (nil, err) is returned, where err
+// is a gRPC status error with code Internal whose message contains the
+// combined output and the exec error text.
+func systemCombined(program string, args ...string) ([]byte, error)
+{
+    cmdline := program+" "+strings.Join(args, " ")
+    // The command line is passed as an argument, never as the format string:
+    // otherwise a '%' in program or args would be parsed as a formatting verb.
+    klog.Infof("Running %s", cmdline)
+    c := exec.Command(program, args...)
+    var out bytes.Buffer
+    c.Stdout, c.Stderr = &out, &out
+    err := c.Run()
+    if (err != nil)
+    {
+        outStr := out.String()
+        klog.Errorf("%s failed: %s, status %s\n", cmdline, outStr, err)
+        return nil, status.Error(codes.Internal, outStr+" (status "+err.Error()+")")
+    }
+    return out.Bytes(), nil
+}
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-vitastor (1.6.1-1) unstable; urgency=medium
+vitastor (1.9.3-1) unstable; urgency=medium

  * Bugfixes

--- a/debian/control
+++ b/debian/control
@@ -53,3 +53,9 @@ Architecture: amd64
 Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client (= ${binary:Version})
 Description: Vitastor Proxmox Virtual Environment storage plugin
 Vitastor storage plugin for Proxmox Virtual Environment.
+
+Package: vitastor-opennebula
+Architecture: amd64
+Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client, patch, python3, jq
+Description: Vitastor OpenNebula storage plugin
+ Vitastor storage plugin for OpenNebula.
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@@ -27,7 +27,7 @@ RUN apt-get -y build-dep qemu
 RUN apt-get --download-only source qemu

 ADD patches /root/vitastor/patches
-ADD src/qemu_driver.c /root/vitastor/src/qemu_driver.c
+ADD src/client/qemu_driver.c /root/qemu_driver.c

 #RUN set -e; \
 #    apt-get install -y wget; \
@@ -52,7 +52,7 @@ RUN set -e; \
    cd /root/packages/qemu-$REL/qemu-*/; \
    quilt push -a; \
    quilt add block/vitastor.c; \
-    cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
+    cp /root/qemu_driver.c block/vitastor.c; \
    quilt refresh; \
    V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
    if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
--- a/debian/vitastor-mon.install
+++ b/debian/vitastor-mon.install
@@ -1,2 +1,3 @@
-mon usr/lib/vitastor
-mon/vitastor-mon.service /lib/systemd/system
+mon usr/lib/vitastor/
+mon/scripts/make-etcd usr/lib/vitastor/mon
+mon/scripts/vitastor-mon.service /lib/systemd/system
--- a/debian/vitastor-mon.postinst
+++ b/debian/vitastor-mon.postinst
@@ -6,4 +6,6 @@ if [ "$1" = "configure" ]; then
 	addgroup --system --quiet vitastor
 	adduser --system --quiet --ingroup vitastor --no-create-home --home /nonexistent vitastor
 	mkdir -p /etc/vitastor
+	mkdir -p /var/lib/vitastor
+	chown vitastor:vitastor /var/lib/vitastor
 fi
--- a/debian/vitastor-opennebula.install
+++ b/debian/vitastor-opennebula.install
@@ -0,0 +1,3 @@
+opennebula/remotes var/lib/one/
+opennebula/sudoers.d etc/
+opennebula/install.sh var/lib/one/remotes/datastore/vitastor/
--- a/debian/vitastor-opennebula.postinst
+++ b/debian/vitastor-opennebula.postinst
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+set -e
+
+if [ "$1" = "configure" ]; then
+	/var/lib/one/remotes/datastore/vitastor/install.sh
+fi
--- a/debian/vitastor-opennebula.triggers
+++ b/debian/vitastor-opennebula.triggers
@@ -0,0 +1,4 @@
+interest /var/lib/one/remotes/datastore/downloader.sh
+interest /etc/one/oned.conf
+interest /etc/one/vmm_exec/vmm_execrc
+interest /etc/apparmor.d/local/abstractions/libvirt-qemu
--- a/debian/vitastor-osd.install
+++ b/debian/vitastor-osd.install
@@ -1,6 +1,6 @@
 usr/bin/vitastor-osd
 usr/bin/vitastor-disk
 usr/bin/vitastor-dump-journal
-mon/vitastor-osd@.service /lib/systemd/system
-mon/vitastor.target /lib/systemd/system
-mon/90-vitastor.rules /lib/udev/rules.d
+mon/scripts/vitastor-osd@.service /lib/systemd/system
+mon/scripts/vitastor.target /lib/systemd/system
+mon/scripts/90-vitastor.rules /lib/udev/rules.d
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@@ -9,12 +9,12 @@ ARG REL=

 WORKDIR /root

-RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
-        echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
-        echo >> /etc/apt/preferences; \
-        echo 'Package: *' >> /etc/apt/preferences; \
-        echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
-        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
+RUN set -e -x; \
+    if [ "$REL" = "buster" ]; then \
+        apt-get update; \
+        apt-get -y install wget; \
+        wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
+        echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
    fi; \
    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
    perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
@@ -22,10 +22,9 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf

 RUN apt-get update
-RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
+RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl
 RUN apt-get -y build-dep fio
 RUN apt-get --download-only source fio
-RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev

 ADD . /root/vitastor
 RUN set -e -x; \
@@ -37,8 +36,10 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.6.1; \
-    cd vitastor-1.6.1; \
+    FULLVER=$(head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
+    VER=${FULLVER%%-*}; \
+    cp -r /root/vitastor vitastor-$VER; \
+    cd vitastor-$VER; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -50,10 +51,14 @@ RUN set -e -x; \
    echo fio-headers.patch >> debian/patches/series; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
+    cd /root/packages/vitastor-$REL/vitastor-$VER; \
+    mkdir mon/node_modules; \
+    cd mon/node_modules; \
+    curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx; \
+    curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.6.1.orig.tar.xz vitastor-1.6.1; \
-    cd vitastor-1.6.1; \
-    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
-    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER; \
+    cd vitastor-$VER; \
+    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
    rm -rf /root/packages/vitastor-$REL/vitastor-*/
--- a/docs/config/client.en.md
+++ b/docs/config/client.en.md
@@ -9,6 +9,7 @@
 These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
 affect their interaction with the cluster.

+- [client_iothread_count](#client_iothread_count)
 - [client_retry_interval](#client_retry_interval)
 - [client_eio_retry_interval](#client_eio_retry_interval)
 - [client_retry_enospc](#client_retry_enospc)
@@ -23,6 +24,23 @@ affect their interaction with the cluster.
 - [nbd_max_part](#nbd_max_part)
 - [osd_nearfull_ratio](#osd_nearfull_ratio)

+## client_iothread_count
+
+- Type: integer
+- Default: 0
+
+Number of separate threads for handling TCP network I/O on the client library
+side. Enabling 4 threads usually increases the peak performance of each
+client from approx. 2-3 to 7-8 GByte/s linear read/write and from approx.
+100-150 to 400 thousand iops, but at the same time it increases latency.
+Latency increase depends on CPU: with CPU power saving disabled latency
+only increases by ~10 us (equivalent to Q=1 iops decrease from 10500 to 9500),
+with CPU power saving enabled it may be as high as 500 us (equivalent to Q=1
+iops decrease from 2000 to 1000). RDMA isn't affected by this option.
+
+It's recommended to enable client I/O threads if you don't use RDMA and want
+to increase peak client performance.
+
 ## client_retry_interval

 - Type: milliseconds
--- a/docs/config/client.ru.md
+++ b/docs/config/client.ru.md
@@ -9,6 +9,7 @@
 Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
 затрагивают логику их работы с кластером.

+- [client_iothread_count](#client_iothread_count)
 - [client_retry_interval](#client_retry_interval)
 - [client_eio_retry_interval](#client_eio_retry_interval)
 - [client_retry_enospc](#client_retry_enospc)
@@ -23,6 +24,24 @@
 - [nbd_max_part](#nbd_max_part)
 - [osd_nearfull_ratio](#osd_nearfull_ratio)

+## client_iothread_count
+
+- Тип: целое число
+- Значение по умолчанию: 0
+
+Число отдельных потоков для обработки ввода-вывода через TCP сеть на стороне
+клиентской библиотеки. Включение 4 потоков обычно позволяет поднять пиковую
+производительность каждого клиента примерно с 2-3 до 7-8 Гбайт/с линейного
+чтения/записи и примерно с 100-150 до 400 тысяч операций ввода-вывода в
+секунду, но ухудшает задержку. Увеличение задержки зависит от процессора:
+при отключённом энергосбережении CPU это всего ~10 микросекунд (равносильно
+падению iops с Q=1 с 10500 до 9500), а при включённом это может быть
+и 500 микросекунд (равносильно падению iops с Q=1 с 2000 до 1000). На работу
+RDMA данная опция не влияет.
+
+Рекомендуется включать клиентские потоки ввода-вывода, если вы не используете
+RDMA и хотите повысить пиковую производительность клиентов.
+
 ## client_retry_interval

 - Тип: миллисекунды
--- a/docs/config/layout-cluster.en.md
+++ b/docs/config/layout-cluster.en.md
@@ -56,14 +56,24 @@ Can't be smaller than the OSD data device sector.
 ## immediate_commit

 - Type: string
- Default: false
+- Default: all

-Another parameter which is really important for performance.
+One of "none", "all" or "small". Global value, may be overridden [at pool level](pool.en.md#immediate_commit).
+
+This parameter is also really important for performance.
+
+TLDR: default "all" is optimal for server-grade SSDs with supercapacitor-based
+power loss protection (nonvolatile write-through cache) and also for most HDDs.
+"none" or "small" should be only selected if you use desktop SSDs without
+capacitors or drives with slow write-back cache that can't be disabled. Check
+immediate_commit of your OSDs in [ls-osd](../usage/cli.en.md#ls-osd).
+
+Detailed explanation:

 Desktop SSDs are very fast (100000+ iops) for simple random writes
 without cache flush. However, they are really slow (only around 1000 iops)
-if you try to fsync() each write, that is, when you want to guarantee that
-each change gets immediately persisted to the physical media.
+if you try to fsync() each write, that is, if you want to guarantee that
+each change gets actually persisted to the physical media.

 Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
 "Supercapacitor-based Power Loss Protection", on the other hand, are equally
@@ -75,8 +85,8 @@ really slow when used with desktop SSDs. Vitastor, however, can also
 efficiently utilize desktop SSDs by postponing fsync until the client calls
 it explicitly.

-This is what this parameter regulates. When it's set to "all" the whole
-Vitastor cluster commits each change to disks immediately and clients just
+This is what this parameter regulates. When it's set to "all" Vitastor
+cluster commits each change to disks immediately and clients just
 ignore fsyncs because they know for sure that they're unneeded. This reduces
 the amount of network roundtrips performed by clients and improves
 performance. So it's always better to use server grade SSDs with
@@ -96,12 +106,8 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
 it (they have internal SSD cache even though it's not stated in datasheets).

 Setting this parameter to "all" or "small" in OSD parameters requires enabling
-[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
-[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
-"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
-
-TLDR: For optimal performance, set immediate_commit to "all" if you only use
-SSDs with supercapacitor-based power loss protection (nonvolatile
-write-through cache) for both data and journals in the whole Vitastor
-cluster. Set it to "small" if you only use such SSDs for journals. Leave
-empty if your drives have write-back cache.
+[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
+[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
+"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
+vitastor-disk tries to do that by default, first checking/disabling drive cache.
+If it can't disable drive cache, OSDs get initialized with "none".
--- a/docs/config/layout-cluster.ru.md
+++ b/docs/config/layout-cluster.ru.md
@@ -57,9 +57,18 @@ amplification) и эффективность распределения нагр
 ## immediate_commit

 - Тип: строка
- Значение по умолчанию: false
+- Значение по умолчанию: all

-Ещё один важный для производительности параметр.
+Одно из значений "none", "small" или "all". Глобальное значение, может быть
+переопределено [на уровне пула](pool.ru.md#immediate_commit).
+
+Данный параметр тоже важен для производительности.
+
+Вкратце: значение по умолчанию "all" оптимально для всех серверных SSD с
+суперконденсаторами и также для большинства HDD. "none" и "small" имеет смысл
+устанавливать только при использовании SSD настольного класса без
+суперконденсаторов или дисков с медленным неотключаемым кэшем записи.
+Проверьте настройку immediate_commit своих OSD в выводе команды [ls-osd](../usage/cli.ru.md#ls-osd).

 Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
 секунду) при простой случайной записи без сбросов кэша. Однако они очень
@@ -80,7 +89,7 @@ Power Loss Protection" - одинаково быстрые и со сбросо
 эффективно утилизировать настольные SSD.

 Данный параметр влияет как раз на это. Когда он установлен в значение "all",
-весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
+кластер Vitastor мгновенно фиксирует каждое изменение на физические
 носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
 знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
 по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
@@ -103,13 +112,6 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
 указано в спецификациях).

 Указание "all" или "small" в настройках / командной строке OSD требует
-включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
-[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
-также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
-
-Итого, вкратце: для оптимальной производительности установите
-immediate_commit в значение "all", если вы используете в кластере только SSD
-с суперконденсаторами и для данных, и для журналов. Если вы используете
-такие SSD для всех журналов, но не для данных - можете установить параметр
-в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
-оставьте параметр пустым.
+включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
+[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
+также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).
--- a/docs/config/layout-osd.en.md
+++ b/docs/config/layout-osd.en.md
@@ -118,12 +118,13 @@ Physical block size of the journal device. Must be a multiple of
 - Type: boolean
 - Default: false

-Do not issue fsyncs to the data device, i.e. do not flush its cache.
-Safe ONLY if your data device has write-through cache. If you disable
-the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
-that the cache disable command is run every time before starting Vitastor
-OSD, for example, in the systemd unit. See also `immediate_commit` option
-for the instructions to disable cache and how to benefit from it.
+Do not issue fsyncs to the data device, i.e. do not force it to flush cache.
+Safe ONLY if your data device has write-through cache or if write-back
+cache is disabled. If you disable drive cache manually with `hdparm` or
+by writing to `/sys/.../scsi_disk/cache_type`, then make sure that you do it
+every time before starting Vitastor OSD (vitastor-disk does it automatically).
+See also [immediate_commit](layout-cluster.en.md#immediate_commit)
+for information about how to benefit from disabled cache.

 ## disable_meta_fsync

@@ -171,8 +172,7 @@ size, it actually has to write the whole 4 KB sector.

 Because of this it can actually be beneficial to use SSDs which work well
 with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
-and meta_block_size. But the only SSD that may fit into this category is
-Intel Optane (probably, not tested yet).
+and meta_block_size. But at the moment, no such SSDs are known...

 Clients don't need to be aware of disk_alignment, so it's not required to
 put a modified value into etcd key /vitastor/config/global.
--- a/docs/config/layout-osd.ru.md
+++ b/docs/config/layout-osd.ru.md
@@ -122,13 +122,14 @@ SSD-диске, иначе производительность пострада
 - Тип: булево (да/нет)
 - Значение по умолчанию: false

-Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
+Не отправлять fsync-и устройству данных, т.е. не заставлять его сбрасывать кэш.
 Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
-записью (write-through). Если вы отключаете кэш через `hdparm` или
-`scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
-выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
-Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
-и о том, как из этого извлечь выгоду.
+записью (write-through) или если кэш с отложенной записью (write-back) отключён.
+Если вы отключаете кэш вручную через `hdparm` или запись в `/sys/.../scsi_disk/cache_type`,
+то удостоверьтесь, что вы делаете это каждый раз перед запуском Vitastor OSD
+(vitastor-disk делает это автоматически). Смотрите также опцию
+[immediate_commit](layout-cluster.ru.md#immediate_commit) для информации о том,
+как извлечь выгоду из отключённого кэша.

 ## disable_meta_fsync

@@ -179,9 +180,8 @@ SSD и HDD диски используют 4 КБ физические сект

 Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
 меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
-journal_block_size и meta_block_size. Однако единственные SSD, которые
-теоретически могут попасть в эту категорию - это Intel Optane (но и это
-пока не проверялось автором).
+journal_block_size и meta_block_size. Однако на данный момент такие SSD
+не известны...

 Клиентам не обязательно знать про disk_alignment, так что помещать значение
 этого параметра в etcd в /vitastor/config/global не нужно.
--- a/docs/config/monitor.en.md
+++ b/docs/config/monitor.en.md
@@ -8,6 +8,14 @@

 These parameters only apply to Monitors.

+- [use_antietcd](#use_antietcd)
+- [enable_prometheus](#enable_prometheus)
+- [mon_http_port](#mon_http_port)
+- [mon_http_ip](#mon_http_ip)
+- [mon_https_cert](#mon_https_cert)
+- [mon_https_key](#mon_https_key)
+- [mon_https_client_auth](#mon_https_client_auth)
+- [mon_https_ca](#mon_https_ca)
 - [etcd_mon_ttl](#etcd_mon_ttl)
 - [etcd_mon_timeout](#etcd_mon_timeout)
 - [etcd_mon_retries](#etcd_mon_retries)
@@ -17,6 +25,87 @@ These parameters only apply to Monitors.
 - [placement_levels](#placement_levels)
 - [use_old_pg_combinator](#use_old_pg_combinator)

+## use_antietcd
+
+- Type: boolean
+- Default: false
+
+Enable experimental built-in etcd replacement (clustered key-value database):
+[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
+
+When set to true, monitor runs internal antietcd automatically if it finds
+a network interface with an IP address matching one of addresses in the
+`etcd_address` configuration option (in `/etc/vitastor/vitastor.conf` or in
+the monitor command line). If there are multiple matching addresses, it also
+checks `antietcd_port` and antietcd is started for the address with the matching port.
+By default, antietcd accepts connections on the selected IP address, but it
+can also be overridden manually in the `antietcd_ip` option.
+
+When antietcd is started, monitor stores cluster metadata itself and exposes
+an etcd-compatible REST API. On disk, these metadata are stored in
+`/var/lib/vitastor/mon_2379.json.gz` (can be overridden in antietcd_data_file
+or antietcd_data_dir options). All other antietcd parameters
+(see [here](https://git.yourcmc.ru/vitalif/antietcd/)) except node_id,
+cluster, cluster_key, persist_filter, stale_read can also be set in
+Vitastor configuration with `antietcd_` prefix.
+
+You can dump/load data to or from antietcd using Antietcd `anticli` tool:
+
+```
+npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
+npm exec anticli -e http://antietcd:2379/v3 load < dump.json
+```
+
+## enable_prometheus
+
+- Type: boolean
+- Default: true
+
+Enable built-in Prometheus metrics exporter at mon_http_port (8060 by default).
+
+Note that only the active (master) monitor exposes metrics, others return
+HTTP 503. So you should add all monitor URLs to your Prometheus job configuration.
+
+Grafana dashboard suitable for this exporter is here: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
+
+## mon_http_port
+
+- Type: integer
+- Default: 8060
+
+HTTP port for monitors to listen on (including metrics exporter)
+
+## mon_http_ip
+
+- Type: string
+
+IP address for monitors to listen on (all addresses by default)
+
+## mon_https_cert
+
+- Type: string
+
+Path to PEM SSL certificate file for monitor to listen using HTTPS
+
+## mon_https_key
+
+- Type: string
+
+Path to PEM SSL private key file for monitor to listen using HTTPS
+
+## mon_https_client_auth
+
+- Type: boolean
+- Default: false
+
+Enable HTTPS client certificate-based authorization for monitor connections
+
+## mon_https_ca
+
+- Type: string
+
+Path to CA certificate for client HTTPS authorization
+
 ## etcd_mon_ttl

 - Type: seconds
--- a/docs/config/monitor.ru.md
+++ b/docs/config/monitor.ru.md
@@ -8,6 +8,14 @@

 Данные параметры используются только мониторами Vitastor.

+- [use_antietcd](#use_antietcd)
+- [enable_prometheus](#enable_prometheus)
+- [mon_http_port](#mon_http_port)
+- [mon_http_ip](#mon_http_ip)
+- [mon_https_cert](#mon_https_cert)
+- [mon_https_key](#mon_https_key)
+- [mon_https_client_auth](#mon_https_client_auth)
+- [mon_https_ca](#mon_https_ca)
 - [etcd_mon_ttl](#etcd_mon_ttl)
 - [etcd_mon_timeout](#etcd_mon_timeout)
 - [etcd_mon_retries](#etcd_mon_retries)
@@ -17,6 +25,89 @@
 - [placement_levels](#placement_levels)
 - [use_old_pg_combinator](#use_old_pg_combinator)

+## use_antietcd
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+
+Включить экспериментальный встроенный заменитель etcd (кластерную БД ключ-значение):
+[antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
+
+Если параметр установлен в true, монитор запускает antietcd автоматически,
+если обнаруживает сетевой интерфейс с одним из адресов, указанных в опции
+конфигурации `etcd_address` (в `/etc/vitastor/vitastor.conf` или в опциях
+командной строки монитора). Если таких адресов несколько, также проверяется
+опция `antietcd_port` и antietcd запускается для адреса с соответствующим
+портом. По умолчанию antietcd принимает подключения по выбранному совпадающему
+IP, но его также можно определить вручную опцией `antietcd_ip`.
+
+При запуске antietcd монитор сам хранит центральные метаданные кластера и
+выставляет etcd-совместимое REST API. На диске эти метаданные хранятся в файле
+`/var/lib/vitastor/mon_2379.json.gz` (можно переопределить параметрами
+antietcd_data_file или antietcd_data_dir). Все остальные параметры antietcd
+(смотрите [по ссылке](https://git.yourcmc.ru/vitalif/antietcd/)), за исключением
+node_id, cluster, cluster_key, persist_filter, stale_read также можно задавать
+в конфигурации Vitastor с префиксом `antietcd_`.
+
+Вы можете выгружать/загружать данные в или из antietcd с помощью его инструмента
+`anticli`:
+
+```
+npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
+npm exec anticli -e http://antietcd:2379/v3 load < dump.json
+```
+
+## enable_prometheus
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: true
+
+Включить встроенный Prometheus-экспортер метрик на порту mon_http_port (по умолчанию 8060).
+
+Обратите внимание, что метрики выставляет только активный (главный) монитор, остальные
+возвращают статус HTTP 503, поэтому вам следует добавлять адреса всех мониторов
+в задание по сбору метрик Prometheus.
+
+Дашборд для Grafana, подходящий для этого экспортера: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
+
+## mon_http_port
+
+- Тип: целое число
+- Значение по умолчанию: 8060
+
+Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
+
+## mon_http_ip
+
+- Тип: строка
+
+IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
+
+## mon_https_cert
+
+- Тип: строка
+
+Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
+
+## mon_https_key
+
+- Тип: строка
+
+Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
+
+## mon_https_client_auth
+
+- Тип: булево (да/нет)
+- Значение по умолчанию: false
+
+Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
+
+## mon_https_ca
+
+- Тип: строка
+
+Путь к удостоверяющему сертификату для авторизации клиентских HTTPS соединений
+
 ## etcd_mon_ttl

 - Тип: секунды
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@@ -248,7 +248,7 @@ etcd_report_interval to guarantee that keepalive actually works.
 ## etcd_ws_keepalive_interval

 - Type: seconds
- Default: 30
+- Default: 5
 - Can be changed online: yes

 etcd websocket ping interval required to keep the connection alive and
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@@ -259,7 +259,7 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
 ## etcd_ws_keepalive_interval

 - Тип: секунды
- Значение по умолчанию: 30
+- Значение по умолчанию: 5
 - Можно менять на лету: да

 Интервал проверки живости вебсокет-подключений к etcd.
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@@ -10,6 +10,7 @@ These parameters only apply to OSDs, are not fixed at the moment of OSD drive
 initialization and can be changed - either with an OSD restart or, for some of
 them, even without restarting by updating configuration in etcd.

+- [osd_iothread_count](#osd_iothread_count)
 - [etcd_report_interval](#etcd_report_interval)
 - [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
@@ -61,6 +62,18 @@ them, even without restarting by updating configuration in etcd.
 - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
 - [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

+## osd_iothread_count
+
+- Type: integer
+- Default: 0
+
+TCP network I/O thread count for OSD. When non-zero, a single OSD process
+may handle more TCP I/O, but at a cost of increased latency because thread
+switching overhead occurs. RDMA isn't affected by this option.
+
+Because of latency, instead of enabling OSD I/O threads it's recommended to
+just create multiple OSDs per disk, or use RDMA.
+
 ## etcd_report_interval

 - Type: seconds
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@@ -11,6 +11,7 @@
 момент с помощью перезапуска OSD, а некоторые и без перезапуска, с помощью
 изменения конфигурации в etcd.

+- [osd_iothread_count](#osd_iothread_count)
 - [etcd_report_interval](#etcd_report_interval)
 - [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
@@ -62,6 +63,19 @@
 - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
 - [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

+## osd_iothread_count
+
+- Тип: целое число
+- Значение по умолчанию: 0
+
+Число отдельных потоков для обработки ввода-вывода через TCP-сеть на
+стороне OSD. Включение опции позволяет каждому отдельному OSD передавать
+по сети больше данных, но ухудшает задержку из-за накладных расходов
+переключения потоков. На работу RDMA опция не влияет.
+
+Из-за задержек вместо включения потоков ввода-вывода OSD рекомендуется
+просто создавать по несколько OSD на каждом диске, или использовать RDMA.
+
 ## etcd_report_interval

 - Тип: секунды
--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@@ -55,7 +55,7 @@ Examples:
 OSD placement tree is set in a separate etcd key `/vitastor/config/node_placement`
 in the following JSON format:

-`
+```
 {
  "<node name or OSD number>": {
    "level": "<level>",
@@ -63,7 +63,7 @@ in the following JSON format:
  },
  ...
 }
-`
+```

 Here, if a node name is a number then it is assumed to refer to an OSD.
 Level of the OSD is always "osd" and cannot be overriden. You may only
--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@@ -54,7 +54,7 @@
 Дерево размещения OSD задаётся в отдельном ключе etcd `/vitastor/config/node_placement`
 в следующем JSON-формате:

-`
+```
 {
  "<имя узла или номер OSD>": {
    "level": "<уровень>",
@@ -62,7 +62,7 @@
  },
  ...
 }
-`
+```

 Здесь, если название узла - число, считается, что это OSD. Уровень OSD
 всегда равен "osd" и не может быть переопределён. Для OSD вы можете только
--- a/docs/config/src/client.yml
+++ b/docs/config/src/client.yml
@@ -1,3 +1,32 @@
+- name: client_iothread_count
+  type: int
+  default: 0
+  online: false
+  info: |
+    Number of separate threads for handling TCP network I/O at client library
+    side. Enabling 4 threads usually increases the peak performance of each
+    client from approx. 2-3 to 7-8 GByte/s linear read/write and from approx.
+    100-150 to 400 thousand iops, but at the same time it increases latency.
+    Latency increase depends on CPU: with CPU power saving disabled latency
+    only increases by ~10 us (equivalent to Q=1 iops decrease from 10500 to 9500),
+    with CPU power saving enabled it may be as high as 500 us (equivalent to Q=1
+    iops decrease from 2000 to 1000). RDMA isn't affected by this option.
+
+    It's recommended to enable client I/O threads if you don't use RDMA and want
+    to increase peak client performance.
+  info_ru: |
+    Число отдельных потоков для обработки ввода-вывода через TCP сеть на стороне
+    клиентской библиотеки. Включение 4 потоков обычно позволяет поднять пиковую
+    производительность каждого клиента примерно с 2-3 до 7-8 Гбайт/с линейного
+    чтения/записи и примерно с 100-150 до 400 тысяч операций ввода-вывода в
+    секунду, но ухудшает задержку. Увеличение задержки зависит от процессора:
+    при отключённом энергосбережении CPU это всего ~10 микросекунд (равносильно
+    падению iops с Q=1 с 10500 до 9500), а при включённом это может быть
+    и 500 микросекунд (равносильно падению iops с Q=1 с 2000 до 1000). На работу
+    RDMA данная опция не влияет.
+
+    Рекомендуется включать клиентские потоки ввода-вывода, если вы не используете
+    RDMA и хотите повысить пиковую производительность клиентов.
 - name: client_retry_interval
  type: ms
  min: 10
--- a/docs/config/src/layout-cluster.yml
+++ b/docs/config/src/layout-cluster.yml
@@ -47,14 +47,24 @@
    Не может быть меньше размера сектора дисков данных OSD.
 - name: immediate_commit
  type: string
-  default: false
+  default: all
  info: |
-    Another parameter which is really important for performance.
+    One of "none", "all" or "small". Global value, may be overridden [at pool level](pool.en.md#immediate_commit).
+
+    This parameter is also really important for performance.
+
+    TLDR: default "all" is optimal for server-grade SSDs with supercapacitor-based
+    power loss protection (nonvolatile write-through cache) and also for most HDDs.
+    "none" or "small" should only be selected if you use desktop SSDs without
+    capacitors or drives with slow write-back cache that can't be disabled. Check
+    immediate_commit of your OSDs in [ls-osd](../usage/cli.en.md#ls-osd).
+
+    Detailed explanation:

    Desktop SSDs are very fast (100000+ iops) for simple random writes
    without cache flush. However, they are really slow (only around 1000 iops)
-    if you try to fsync() each write, that is, when you want to guarantee that
-    each change gets immediately persisted to the physical media.
+    if you try to fsync() each write, that is, if you want to guarantee that
+    each change gets actually persisted to the physical media.

    Server-grade SSDs with "Advanced/Enhanced Power Loss Protection" or with
    "Supercapacitor-based Power Loss Protection", on the other hand, are equally
@@ -66,8 +76,8 @@
    efficiently utilize desktop SSDs by postponing fsync until the client calls
    it explicitly.

-    This is what this parameter regulates. When it's set to "all" the whole
-    Vitastor cluster commits each change to disks immediately and clients just
+    This is what this parameter regulates. When it's set to "all" Vitastor
+    cluster commits each change to disks immediately and clients just
    ignore fsyncs because they know for sure that they're unneeded. This reduces
    the amount of network roundtrips performed by clients and improves
    performance. So it's always better to use server grade SSDs with
@@ -87,17 +97,22 @@
    it (they have internal SSD cache even though it's not stated in datasheets).

    Setting this parameter to "all" or "small" in OSD parameters requires enabling
-    [disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
-    [disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
-    "all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
-
-    TLDR: For optimal performance, set immediate_commit to "all" if you only use
-    SSDs with supercapacitor-based power loss protection (nonvolatile
-    write-through cache) for both data and journals in the whole Vitastor
-    cluster. Set it to "small" if you only use such SSDs for journals. Leave
-    empty if your drives have write-back cache.
+    [disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
+    [disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
+    "all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
+    vitastor-disk tries to do that by default, first checking/disabling drive cache.
+    If it can't disable drive cache, OSDs get initialized with "none".
  info_ru: |
-    Ещё один важный для производительности параметр.
+    Одно из значений "none", "small" или "all". Глобальное значение, может быть
+    переопределено [на уровне пула](pool.ru.md#immediate_commit).
+
+    Данный параметр тоже важен для производительности.
+
+    Вкратце: значение по умолчанию "all" оптимально для всех серверных SSD с
+    суперконденсаторами и также для большинства HDD. "none" и "small" имеет смысл
+    устанавливать только при использовании SSD настольного класса без
+    суперконденсаторов или дисков с медленным неотключаемым кэшем записи.
+    Проверьте настройку immediate_commit своих OSD в выводе команды [ls-osd](../usage/cli.ru.md#ls-osd).

    Модели SSD для настольных компьютеров очень быстрые (100000+ операций в
    секунду) при простой случайной записи без сбросов кэша. Однако они очень
@@ -118,7 +133,7 @@
    эффективно утилизировать настольные SSD.

    Данный параметр влияет как раз на это. Когда он установлен в значение "all",
-    весь кластер Vitastor мгновенно фиксирует каждое изменение на физические
+    кластер Vitastor мгновенно фиксирует каждое изменение на физические
    носители и клиенты могут просто игнорировать запросы fsync, т.к. они точно
    знают, что fsync-и не нужны. Это уменьшает число необходимых обращений к OSD
    по сети и улучшает производительность. Поэтому даже с Vitastor лучше всегда
@@ -141,13 +156,6 @@
    указано в спецификациях).

    Указание "all" или "small" в настройках / командной строке OSD требует
-    включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
-    [disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
-    также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
-
-    Итого, вкратце: для оптимальной производительности установите
-    immediate_commit в значение "all", если вы используете в кластере только SSD
-    с суперконденсаторами и для данных, и для журналов. Если вы используете
-    такие SSD для всех журналов, но не для данных - можете установить параметр
-    в "small". Если и какие-то из дисков журналов имеют волатильный кэш записи -
-    оставьте параметр пустым.
+    включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
+    [disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
+    также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).
--- a/docs/config/src/layout-osd.yml
+++ b/docs/config/src/layout-osd.yml
@@ -110,20 +110,22 @@
  type: bool
  default: false
  info: |
-    Do not issue fsyncs to the data device, i.e. do not flush its cache.
-    Safe ONLY if your data device has write-through cache. If you disable
-    the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
-    that the cache disable command is run every time before starting Vitastor
-    OSD, for example, in the systemd unit. See also `immediate_commit` option
-    for the instructions to disable cache and how to benefit from it.
+    Do not issue fsyncs to the data device, i.e. do not force it to flush cache.
+    Safe ONLY if your data device has write-through cache or if write-back
+    cache is disabled. If you disable drive cache manually with `hdparm` or
+    writing to `/sys/.../scsi_disk/cache_type` then make sure that you do it
+    every time before starting Vitastor OSD (vitastor-disk does it automatically).
+    See also [immediate_commit](layout-cluster.en.md#immediate_commit)
+    for information about how to benefit from disabled cache.
  info_ru: |
-    Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
+    Не отправлять fsync-и устройству данных, т.е. не заставлять его сбрасывать кэш.
    Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
-    записью (write-through). Если вы отключаете кэш через `hdparm` или
-    `scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
-    выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
-    Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
-    и о том, как из этого извлечь выгоду.
+    записью (write-through) или если кэш с отложенной записью (write-back) отключён.
+    Если вы отключаете кэш вручную через `hdparm` или запись в `/sys/.../scsi_disk/cache_type`,
+    то удостоверьтесь, что вы делаете это каждый раз перед запуском Vitastor OSD
+    (vitastor-disk делает это автоматически). Смотрите также опцию
+    [immediate_commit](layout-cluster.ru.md#immediate_commit) для информации о том,
+    как извлечь выгоду из отключённого кэша.
 - name: disable_meta_fsync
  type: bool
  default: false
@@ -179,8 +181,7 @@

    Because of this it can actually be beneficial to use SSDs which work well
    with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
-    and meta_block_size. But the only SSD that may fit into this category is
-    Intel Optane (probably, not tested yet).
+    and meta_block_size. But at the moment, no such SSDs are known...

    Clients don't need to be aware of disk_alignment, so it's not required to
    put a modified value into etcd key /vitastor/config/global.
@@ -198,9 +199,8 @@

    Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
    меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
-    journal_block_size и meta_block_size. Однако единственные SSD, которые
-    теоретически могут попасть в эту категорию - это Intel Optane (но и это
-    пока не проверялось автором).
+    journal_block_size и meta_block_size. Однако на данный момент такие SSD
+    не известны...

    Клиентам не обязательно знать про disk_alignment, так что помещать значение
    этого параметра в etcd в /vitastor/config/global не нужно.
--- a/docs/config/src/monitor.yml
+++ b/docs/config/src/monitor.yml
@@ -1,3 +1,103 @@
+- name: use_antietcd
+  type: bool
+  default: false
+  info: |
+    Enable experimental built-in etcd replacement (clustered key-value database):
+    [antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
+
+    When set to true, monitor runs internal antietcd automatically if it finds
+    a network interface with an IP address matching one of addresses in the
+    `etcd_address` configuration option (in `/etc/vitastor/vitastor.conf` or in
+    the monitor command line). If there are multiple matching addresses, it also
+    checks `antietcd_port` and antietcd is started for address with matching port.
+    By default, antietcd accepts connection on the selected IP address, but it
+    can also be overridden manually in the `antietcd_ip` option.
+
+    When antietcd is started, monitor stores cluster metadata itself and exposes
+    an etcd-compatible REST API. On disk, these metadata are stored in
+    `/var/lib/vitastor/mon_2379.json.gz` (can be overridden in antietcd_data_file
+    or antietcd_data_dir options). All other antietcd parameters
+    (see [here](https://git.yourcmc.ru/vitalif/antietcd/)) except node_id,
+    cluster, cluster_key, persist_filter, stale_read can also be set in
+    Vitastor configuration with `antietcd_` prefix.
+
+    You can dump/load data to or from antietcd using Antietcd `anticli` tool:
+
+    ```
+    npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
+    npm exec anticli -e http://antietcd:2379/v3 load < dump.json
+    ```
+  info_ru: |
+    Включить экспериментальный встроенный заменитель etcd (кластерную БД ключ-значение):
+    [antietcd](https://git.yourcmc.ru/vitalif/antietcd/).
+
+    Если параметр установлен в true, монитор запускает antietcd автоматически,
+    если обнаруживает сетевой интерфейс с одним из адресов, указанных в опции
+    конфигурации `etcd_address` (в `/etc/vitastor/vitastor.conf` или в опциях
+    командной строки монитора). Если таких адресов несколько, также проверяется
+    опция `antietcd_port` и antietcd запускается для адреса с соответствующим
+    портом. По умолчанию antietcd принимает подключения по выбранному совпадающему
+    IP, но его также можно определить вручную опцией `antietcd_ip`.
+
+    При запуске antietcd монитор сам хранит центральные метаданные кластера и
+    выставляет etcd-совместимое REST API. На диске эти метаданные хранятся в файле
+    `/var/lib/vitastor/mon_2379.json.gz` (можно переопределить параметрами
+    antietcd_data_file или antietcd_data_dir). Все остальные параметры antietcd
+    (смотрите [по ссылке](https://git.yourcmc.ru/vitalif/antietcd/)), за исключением
+    node_id, cluster, cluster_key, persist_filter, stale_read также можно задавать
+    в конфигурации Vitastor с префиксом `antietcd_`.
+
+    Вы можете выгружать/загружать данные в или из antietcd с помощью его инструмента
+    `anticli`:
+
+    ```
+    npm exec anticli -e http://etcd:2379/v3 get --prefix '' --no-temp > dump.json
+    npm exec anticli -e http://antietcd:2379/v3 load < dump.json
+    ```
+- name: enable_prometheus
+  type: bool
+  default: true
+  info: |
+    Enable built-in Prometheus metrics exporter at mon_http_port (8060 by default).
+
+    Note that only the active (master) monitor exposes metrics, others return
+    HTTP 503. So you should add all monitor URLs to your Prometheus job configuration.
+
+    Grafana dashboard suitable for this exporter is here: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
+  info_ru: |
+    Включить встроенный Prometheus-экспортер метрик на порту mon_http_port (по умолчанию 8060).
+
+    Обратите внимание, что метрики выставляет только активный (главный) монитор, остальные
+    возвращают статус HTTP 503, поэтому вам следует добавлять адреса всех мониторов
+    в задание по сбору метрик Prometheus.
+
+    Дашборд для Grafana, подходящий для этого экспортера: [Vitastor-Grafana-6+.json](../../mon/scripts/Vitastor-Grafana-6+.json).
+- name: mon_http_port
+  type: int
+  default: 8060
+  info: HTTP port for monitors to listen on (including metrics exporter)
+  info_ru: Порт, на котором мониторы принимают HTTP-соединения (в том числе для отдачи метрик)
+- name: mon_http_ip
+  type: string
+  info: IP address for monitors to listen on (all addresses by default)
+  info_ru: IP-адрес, на котором мониторы принимают HTTP-соединения (по умолчанию все адреса)
+- name: mon_https_cert
+  type: string
+  info: Path to PEM SSL certificate file for monitor to listen using HTTPS
+  info_ru: Путь к PEM-файлу SSL-сертификата для монитора, чтобы принимать соединения через HTTPS
+- name: mon_https_key
+  type: string
+  info: Path to PEM SSL private key file for monitor to listen using HTTPS
+  info_ru: Путь к PEM-файлу секретного SSL-ключа для монитора, чтобы принимать соединения через HTTPS
+- name: mon_https_client_auth
+  type: bool
+  default: false
+  info: Enable HTTPS client certificate-based authorization for monitor connections
+  info_ru: Включить в HTTPS-сервере монитора авторизацию по клиентским сертификатам
+- name: mon_https_ca
+  type: string
+  info: Path to CA certificate for client HTTPS authorization
+  info_ru: Путь к удостоверяющему сертификату для авторизации клиентских HTTPS соединений
 - name: etcd_mon_ttl
  type: sec
  min: 5
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@@ -282,7 +282,7 @@
    etcd_report_interval, чтобы keepalive гарантированно работал.
 - name: etcd_ws_keepalive_interval
  type: sec
-  default: 30
+  default: 5
  online: true
  info: |
    etcd websocket ping interval required to keep the connection alive and
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@@ -1,3 +1,21 @@
+- name: osd_iothread_count
+  type: int
+  default: 0
+  info: |
+    TCP network I/O thread count for OSD. When non-zero, a single OSD process
+    may handle more TCP I/O, but at a cost of increased latency because thread
+    switching overhead occurs. RDMA isn't affected by this option.
+
+    Because of latency, instead of enabling OSD I/O threads it's recommended to
+    just create multiple OSDs per disk, or use RDMA.
+  info_ru: |
+    Число отдельных потоков для обработки ввода-вывода через TCP-сеть на
+    стороне OSD. Включение опции позволяет каждому отдельному OSD передавать
+    по сети больше данных, но ухудшает задержку из-за накладных расходов
+    переключения потоков. На работу RDMA опция не влияет.
+
+    Из-за задержек вместо включения потоков ввода-вывода OSD рекомендуется
+    просто создавать по несколько OSD на каждом диске, или использовать RDMA.
 - name: etcd_report_interval
  type: sec
  default: 5
--- a/docs/installation/opennebula.en.md
+++ b/docs/installation/opennebula.en.md
@@ -0,0 +1,186 @@
+[Documentation](../../README.md#documentation) → Installation → OpenNebula
+
+-----
+
+[Читать на русском](opennebula.ru.md)
+
+# OpenNebula
+
+## Automatic Installation
+
+OpenNebula plugin is packaged as `vitastor-opennebula` Debian and RPM package since Vitastor 1.9.0. So:
+
+- Run `apt-get install vitastor-opennebula` or `yum install vitastor-opennebula` after installing OpenNebula on all nodes
+- Check that it prints "OK, Vitastor OpenNebula patches successfully applied" or "OK, Vitastor OpenNebula patches are already applied"
+- If it does not, refer to [Manual Installation](#manual-installation) and apply configuration file changes manually
+- Make sure that Vitastor patched versions of QEMU and libvirt are installed
+  (`dpkg -l qemu-system-x86`, `dpkg -l | grep libvirt`, `rpm -qa | grep qemu`, `rpm -qa | grep libvirt-libs` should show "vitastor" in version names)
+- [Block VM access to Vitastor cluster](#block-vm-access-to-vitastor-cluster)
+
+## Manual Installation
+
+Install OpenNebula. Then, on each node:
+
+- Copy [opennebula/remotes](../../opennebula/remotes) into `/var/lib/one` recursively: `cp -r opennebula/remotes /var/lib/one/`
+- Copy [opennebula/sudoers.d](../../opennebula/sudoers.d) to `/etc`: `cp -r opennebula/sudoers.d /etc/`
+- Apply [downloader-vitastor.sh.diff](../../opennebula/remotes/datastore/vitastor/downloader-vitastor.sh.diff) to `/var/lib/one/remotes/datastore/downloader.sh`:
+  `patch /var/lib/one/remotes/datastore/downloader.sh < opennebula/remotes/datastore/vitastor/downloader-vitastor.sh.diff` - or read the patch and apply the same change manually
+- Add `kvm-vitastor` to `LIVE_DISK_SNAPSHOTS` in `/etc/one/vmm_exec/vmm_execrc`
+- If on Debian or Ubuntu (and AppArmor is used), add Vitastor config file path(s) to `/etc/apparmor.d/local/abstractions/libvirt-qemu`: for example,
+  `echo '  "/etc/vitastor/vitastor.conf" r,' >> /etc/apparmor.d/local/abstractions/libvirt-qemu`
+- Apply changes to `/etc/one/oned.conf`
+
+### oned.conf changes
+
+1. Add deploy script override in kvm VM_MAD: add `-l deploy=deploy.vitastor` to ARGUMENTS.
+
+```diff
+ VM_MAD = [
+     NAME           = "kvm",
+     SUNSTONE_NAME  = "KVM",
+     EXECUTABLE     = "one_vmm_exec",
+-    ARGUMENTS      = "-t 15 -r 0 kvm -p",
++    ARGUMENTS      = "-t 15 -r 0 kvm -p -l deploy=deploy.vitastor",
+     DEFAULT        = "vmm_exec/vmm_exec_kvm.conf",
+     TYPE           = "kvm",
+     KEEP_SNAPSHOTS = "yes",
+     LIVE_RESIZE    = "yes",
+     SUPPORT_SHAREABLE    = "yes",
+     IMPORTED_VMS_ACTIONS = "terminate, terminate-hard, hold, release, suspend,
+         resume, delete, reboot, reboot-hard, resched, unresched, disk-attach,
+         disk-detach, nic-attach, nic-detach, snapshot-create, snapshot-delete,
+         resize, updateconf, update"
+ ]
+```
+
+Optional: if you also want to save VM RAM checkpoints to Vitastor, use
+`-l deploy=deploy.vitastor,save=save.vitastor,restore=restore.vitastor`
+instead of just `-l deploy=deploy.vitastor`.
+
+2. Add `vitastor` to TM_MAD.ARGUMENTS and DATASTORE_MAD.ARGUMENTS:
+
+```diff
+ TM_MAD = [
+     EXECUTABLE = "one_tm",
+-    ARGUMENTS = "-t 15 -d dummy,lvm,shared,fs_lvm,fs_lvm_ssh,qcow2,ssh,ceph,dev,vcenter,iscsi_libvirt"
++    ARGUMENTS = "-t 15 -d dummy,lvm,shared,fs_lvm,fs_lvm_ssh,qcow2,ssh,ceph,vitastor,dev,vcenter,iscsi_libvirt"
+ ]
+
+ DATASTORE_MAD = [
+     EXECUTABLE = "one_datastore",
+-    ARGUMENTS  = "-t 15 -d dummy,fs,lvm,ceph,dev,iscsi_libvirt,vcenter,restic,rsync -s shared,ssh,ceph,fs_lvm,fs_lvm_ssh,qcow2,vcenter"
++    ARGUMENTS  = "-t 15 -d dummy,fs,lvm,ceph,vitastor,dev,iscsi_libvirt,vcenter,restic,rsync -s shared,ssh,ceph,vitastor,fs_lvm,fs_lvm_ssh,qcow2,vcenter"
+ ]
+```
+
+3. Add INHERIT_DATASTORE_ATTR for two Vitastor attributes:
+
+```
+INHERIT_DATASTORE_ATTR = "VITASTOR_CONF"
+INHERIT_DATASTORE_ATTR = "IMAGE_PREFIX"
+```
+
+4. Add TM_MAD_CONF and DS_MAD_CONF for Vitastor:
+
+```
+TM_MAD_CONF = [
+    NAME = "vitastor", LN_TARGET = "NONE", CLONE_TARGET = "SELF", SHARED = "YES",
+    DS_MIGRATE = "NO", DRIVER = "raw", ALLOW_ORPHANS="format",
+    TM_MAD_SYSTEM = "ssh,shared", LN_TARGET_SSH = "SYSTEM", CLONE_TARGET_SSH = "SYSTEM",
+    DISK_TYPE_SSH = "FILE", LN_TARGET_SHARED = "NONE",
+    CLONE_TARGET_SHARED = "SELF", DISK_TYPE_SHARED = "FILE"
+]
+
+DS_MAD_CONF = [
+    NAME = "vitastor",
+    REQUIRED_ATTRS = "DISK_TYPE,BRIDGE_LIST",
+    PERSISTENT_ONLY = "NO",
+    MARKETPLACE_ACTIONS = "export"
+]
+```
+
+## Create Datastores
+
+Example Image and System Datastore definitions:
+[opennebula/vitastor-imageds.conf](../../opennebula/vitastor-imageds.conf) and
+[opennebula/vitastor-systemds.conf](../../opennebula/vitastor-systemds.conf).
+
+Adjust the parameters as needed:
+
+- POOL_NAME is Vitastor pool name to store images.
+- IMAGE_PREFIX is a string prepended to all Vitastor image names.
+- BRIDGE_LIST is a list of hosts with access to Vitastor cluster, mostly used for image (not system) datastore operations.
+- VITASTOR_CONF is the path to cluster configuration. Note that it should be also added to `/etc/apparmor.d/local/abstractions/libvirt-qemu` if you use AppArmor.
+- STAGING_DIR is a temporary directory used when importing external images. Should have free space sufficient for downloading external images.
+
+Then create datastores using `onedatastore create vitastor-imageds.conf` and `onedatastore create vitastor-systemds.conf` (or use UI).
+
+## Block VM access to Vitastor cluster
+
+Vitastor doesn't support any authentication yet, so you MUST block VM guest access to the Vitastor cluster at the network level.
+
+If you use VLAN networking for VMs - make sure you use different VLANs for VMs and hypervisor/storage network and
+block access between them using your firewall/switch configuration.
+
+If you use something more stupid like bridged networking, you probably have to use manual firewall/iptables setup
+to only allow access to Vitastor from hypervisor IPs.
+
+Also you need to switch network to "Bridged & Security Groups" and enable IP spoofing filters in OpenNebula.
+Problem is that OpenNebula's IP spoofing filter doesn't affect local interfaces of the hypervisor i.e. when
+it's enabled a VM can't talk to other VMs or to the outer world using a spoofed IP, but it CAN talk to the
+hypervisor if it takes an IP from its subnet. To fix that you also need some more iptables.
+
+So the complete "stupid" bridged network filter setup could look like the following
+(here `10.0.3.0/24` is the VM subnet and `10.0.2.0/24` is the hypervisor subnet):
+
+```
+# Allow incoming traffic from physical device
+iptables -A INPUT -m physdev --physdev-in eth0 -j ACCEPT
+# Drop incoming traffic from the VM bridge if its source address is not in the VM subnet (spoofed IPs)
+iptables -A INPUT ! -s 10.0.3.0/24 -i onebr0 -j DROP
+# Drop traffic from VMs to hypervisor/storage subnet
+iptables -I FORWARD 1 -s 10.0.3.0/24 -d 10.0.2.0/24 -j DROP
+```
+
+## Testing
+
+The OpenNebula plugin includes quite a few bash scripts, so here is a description of each to give an idea of what they actually do.
+
+| Script                  | Action                                    | How to Test                                                                          |
+| ----------------------- | ----------------------------------------- | ------------------------------------------------------------------------------------ |
+| vmm/kvm/deploy.vitastor | Start a VM                                | Create and start a VM with Vitastor disk(s): persistent / non-persistent / volatile. |
+| vmm/kvm/save.vitastor   | Save VM memory checkpoint                 | Stop a VM using "Stop" command.                                                      |
+| vmm/kvm/restore.vitastor| Restore VM memory checkpoint              | Start a VM back after stopping it.                                                   |
+| datastore/clone         | Copy an image as persistent               | Create a VM template and instantiate it as persistent.                               |
+| datastore/cp            | Import an external image                  | Import a VM template with images from Marketplace.                                   |
+| datastore/export        | Export an image as URL                    | Probably: export a VM template with images to Marketplace.                           |
+| datastore/mkfs          | Create an image with FS                   | Storage → Images → Create → Type: Datablock, Location: Empty disk image, Filesystem: Not empty. |
+| datastore/monitor       | Monitor used space in image datastore     | Check reported used/free space in image datastore list.                              |
+| datastore/rm            | Remove a persistent image                 | Storage → Images → Select an image → Delete.                                         |
+| datastore/snap_delete   | Delete a snapshot of a persistent image   | Storage → Images → Select an image → Select a snapshot → Delete; <br> To create an image with snapshot: attach a persistent image to a VM; create a snapshot; detach the image. |
+| datastore/snap_flatten  | Revert an image to snapshot and delete other snapshots | Storage → Images → Select an image → Select a snapshot → Flatten.       |
+| datastore/snap_revert   | Revert an image to snapshot               | Storage → Images → Select an image → Select a snapshot → Revert.                     |
+| datastore/stat          | Get virtual size of an image in MB        | No idea. Seems to be unused both in Vitastor and Ceph datastores.                    |
+| tm/clone                | Clone a non-persistent image to a VM disk | Attach a non-persistent image to a VM.                                               |
+| tm/context              | Generate a contextualisation VM disk      | Create a VM with enabled contextualisation (default). Common host FS-based version is used in Vitastor and Ceph datastores. |
+| tm/cpds                 | Copy a VM disk / its snapshot to an image | Select a VM → Select a disk → Optionally select a snapshot → Save as.                |
+| tm/delete               | Delete a cloned or volatile VM disk       | Detach a volatile disk or a non-persistent image from a VM.                          |
+| tm/failmigrate          | Handle live migration failure             | No action. Script is empty in Vitastor and Ceph. In other datastores, should roll back actions done by tm/premigrate. |
+| tm/ln                   | Attach a persistent image to a VM         | No action. Script is empty in Vitastor and Ceph.                                     |
+| tm/mkimage              | Create a volatile disk, maybe with FS     | Attach a volatile disk to a VM, with or without file system.                         |
+| tm/mkswap               | Create a volatile swap disk               | Attach a volatile disk to a VM, formatted as swap.                                   |
+| tm/monitor              | Monitor used space in system datastore    | Check reported used/free space in system datastore list.                             |
+| tm/mv                   | Move a migrated VM disk between hosts     | Migrate a VM between hosts. In Vitastor and Ceph datastores, doesn't do any storage action. |
+| tm/mvds                 | Detach a persistent image from a VM       | No action. The opposite of tm/ln. Script is empty in Vitastor and Ceph. In other datastores, script may copy the image from VM host back to the datastore. |
+| tm/postbackup           | Executed after backup                     | Seems that the script just removes temporary files after backup. Perform a VM backup and check that temporary files are cleaned up. |
+| tm/postbackup_live      | Executed after backup of a running VM     | Same as tm/postbackup, but for a running VM.                                         |
+| tm/postmigrate          | Executed after VM live migration          | No action. Only executed for system datastore, so the script tries to call other TMs for other disks. Except that, the script does nothing in Vitastor and Ceph datastores. |
+| tm/prebackup            | Actual backup script: backup VM disks     | Set up "rsync" backup datastore → Backup a VM to it.                                 |
+| tm/prebackup_live       | Backup VM disks of a running VM           | Same as tm/prebackup, but also does fsfreeze/thaw. So perform a live backup, restore it and check that disks are consistent. |
+| tm/premigrate           | Executed before live migration            | No action. Only executed for system datastore, so the script tries to call other TMs for other disks. Except that, the script does nothing in Vitastor and Ceph datastores. |
+| tm/resize               | Resize a VM disk                          | Select a VM → Select a non-persistent disk → Resize.                                 |
+| tm/restore              | Restore VM disks from backup              | Set up "rsync" backup datastore → Backup a VM to it → Restore it back.               |
+| tm/snap_create          | Create a VM disk snapshot                 | Select a VM → Select a disk → Create snapshot.                                       |
+| tm/snap_create_live     | Create a VM disk snapshot for a live VM   | Select a running VM → Select a disk → Create snapshot.                               |
+| tm/snap_delete          | Delete a VM disk snapshot                 | Select a VM → Select a disk → Select a snapshot → Delete.                            |
+| tm/snap_revert          | Revert a VM disk to a snapshot            | Select a VM → Select a disk → Select a snapshot → Revert.                            |
--- a/docs/installation/opennebula.ru.md
+++ b/docs/installation/opennebula.ru.md
@@ -0,0 +1,189 @@
+[Документация](../../README-ru.md#документация) → Установка → OpenNebula
+
+-----
+
+[Read in English](opennebula.en.md)
+
+# OpenNebula
+
+## Автоматическая установка
+
+Плагин OpenNebula Vitastor распространяется как Debian и RPM пакет `vitastor-opennebula`, начиная с версии Vitastor 1.9.0. Так что:
+
+- Запустите `apt-get install vitastor-opennebula` или `yum install vitastor-opennebula` после установки OpenNebula на всех серверах
+- Проверьте, что он выводит "OK, Vitastor OpenNebula patches successfully applied" или "OK, Vitastor OpenNebula patches are already applied" в процессе установки
+- Если сообщение не выведено, пройдите по шагам инструкцию [Ручная установка](#ручная-установка) и примените правки файлов конфигурации вручную
+- Удостоверьтесь, что установлены версии QEMU и libvirt с изменениями Vitastor
+  (`dpkg -l qemu-system-x86`, `dpkg -l | grep libvirt`, `rpm -qa | grep qemu`, `rpm -qa | grep libvirt-libs` должны показывать "vitastor" в номере версии)
+- [Заблокируйте доступ виртуальных машин в Vitastor](#блокировка-доступа-вм-в-vitastor)
+
+## Ручная установка
+
+Сначала установите саму OpenNebula. После этого, на каждом сервере:
+
+- Скопируйте директорию [opennebula/remotes](../../opennebula/remotes) в `/var/lib/one`: `cp -r opennebula/remotes /var/lib/one/`
+- Скопируйте директорию [opennebula/sudoers.d](../../opennebula/sudoers.d) в `/etc`: `cp -r opennebula/sudoers.d /etc/`
+- Примените патч [downloader-vitastor.sh.diff](../../opennebula/remotes/datastore/vitastor/downloader-vitastor.sh.diff) к `/var/lib/one/remotes/datastore/downloader.sh`:
+  `patch /var/lib/one/remotes/datastore/downloader.sh < opennebula/remotes/datastore/vitastor/downloader-vitastor.sh.diff` - либо прочитайте патч и примените изменение вручную
+- Добавьте `kvm-vitastor` в список `LIVE_DISK_SNAPSHOTS` в файле `/etc/one/vmm_exec/vmm_execrc`
+- Если вы используете Debian или Ubuntu (и AppArmor), добавьте пути к файлу(ам) конфигурации Vitastor в файл `/etc/apparmor.d/local/abstractions/libvirt-qemu`: например,
+  `echo '  "/etc/vitastor/vitastor.conf" r,' >> /etc/apparmor.d/local/abstractions/libvirt-qemu`
+- Примените изменения `/etc/one/oned.conf`
+
+### Изменения oned.conf
+
+1. Добавьте переопределение скрипта deploy в VM_MAD kvm, добавив `-l deploy=deploy.vitastor` в `ARGUMENTS`:
+
+```diff
+ VM_MAD = [
+     NAME           = "kvm",
+     SUNSTONE_NAME  = "KVM",
+     EXECUTABLE     = "one_vmm_exec",
+-    ARGUMENTS      = "-t 15 -r 0 kvm -p",
++    ARGUMENTS      = "-t 15 -r 0 kvm -p -l deploy=deploy.vitastor",
+     DEFAULT        = "vmm_exec/vmm_exec_kvm.conf",
+     TYPE           = "kvm",
+     KEEP_SNAPSHOTS = "yes",
+     LIVE_RESIZE    = "yes",
+     SUPPORT_SHAREABLE    = "yes",
+     IMPORTED_VMS_ACTIONS = "terminate, terminate-hard, hold, release, suspend,
+         resume, delete, reboot, reboot-hard, resched, unresched, disk-attach,
+         disk-detach, nic-attach, nic-detach, snapshot-create, snapshot-delete,
+         resize, updateconf, update"
+ ]
+```
+
+Опционально: если вы хотите также сохранять снимки памяти ВМ в Vitastor, добавьте
+`-l deploy=deploy.vitastor,save=save.vitastor,restore=restore.vitastor`
+вместо просто `-l deploy=deploy.vitastor`.
+
+2. Добавьте `vitastor` в значения TM_MAD.ARGUMENTS и DATASTORE_MAD.ARGUMENTS:
+
+```diff
+ TM_MAD = [
+     EXECUTABLE = "one_tm",
+-    ARGUMENTS = "-t 15 -d dummy,lvm,shared,fs_lvm,fs_lvm_ssh,qcow2,ssh,ceph,dev,vcenter,iscsi_libvirt"
++    ARGUMENTS = "-t 15 -d dummy,lvm,shared,fs_lvm,fs_lvm_ssh,qcow2,ssh,ceph,vitastor,dev,vcenter,iscsi_libvirt"
+ ]
+
+ DATASTORE_MAD = [
+     EXECUTABLE = "one_datastore",
+-    ARGUMENTS  = "-t 15 -d dummy,fs,lvm,ceph,dev,iscsi_libvirt,vcenter,restic,rsync -s shared,ssh,ceph,fs_lvm,fs_lvm_ssh,qcow2,vcenter"
++    ARGUMENTS  = "-t 15 -d dummy,fs,lvm,ceph,vitastor,dev,iscsi_libvirt,vcenter,restic,rsync -s shared,ssh,ceph,vitastor,fs_lvm,fs_lvm_ssh,qcow2,vcenter"
+ ]
+```
+
+3. Добавьте строчки с INHERIT_DATASTORE_ATTR для двух атрибутов Vitastor-хранилищ:
+
+```
+INHERIT_DATASTORE_ATTR = "VITASTOR_CONF"
+INHERIT_DATASTORE_ATTR = "IMAGE_PREFIX"
+```
+
+4. Добавьте TM_MAD_CONF и DS_MAD_CONF для Vitastor:
+
+```
+TM_MAD_CONF = [
+    NAME = "vitastor", LN_TARGET = "NONE", CLONE_TARGET = "SELF", SHARED = "YES",
+    DS_MIGRATE = "NO", DRIVER = "raw", ALLOW_ORPHANS="format",
+    TM_MAD_SYSTEM = "ssh,shared", LN_TARGET_SSH = "SYSTEM", CLONE_TARGET_SSH = "SYSTEM",
+    DISK_TYPE_SSH = "FILE", LN_TARGET_SHARED = "NONE",
+    CLONE_TARGET_SHARED = "SELF", DISK_TYPE_SHARED = "FILE"
+]
+
+DS_MAD_CONF = [
+    NAME = "vitastor",
+    REQUIRED_ATTRS = "DISK_TYPE,BRIDGE_LIST",
+    PERSISTENT_ONLY = "NO",
+    MARKETPLACE_ACTIONS = "export"
+]
+```
+
+## Создайте хранилища
+
+Примеры настроек хранилищ образов (image) и дисков ВМ (system):
+[opennebula/vitastor-imageds.conf](../../opennebula/vitastor-imageds.conf) и
+[opennebula/vitastor-systemds.conf](../../opennebula/vitastor-systemds.conf).
+
+Скопируйте настройки и поменяйте следующие параметры так, как вам необходимо:
+
+- POOL_NAME - имя пула Vitastor для сохранения образов дисков.
+- IMAGE_PREFIX - строка, добавляемая в начало имён образов дисков.
+- BRIDGE_LIST - список серверов с доступом к кластеру Vitastor, используемых для операций с хранилищем образов (image, не system).
+- VITASTOR_CONF - путь к конфигурации Vitastor. Имейте в виду, что этот путь также надо добавить в `/etc/apparmor.d/local/abstractions/libvirt-qemu`, если вы используете AppArmor.
+- STAGING_DIR - путь к временному каталогу, используемому при импорте внешних образов. Должен иметь достаточно свободного места, чтобы вмещать скачанные образы.
+
+После этого создайте хранилища с помощью команд `onedatastore create vitastor-imageds.conf` и `onedatastore create vitastor-systemds.conf` (либо через UI).
+
+## Блокировка доступа ВМ в Vitastor
+
+Vitastor пока не поддерживает никакую аутентификацию, так что вы ДОЛЖНЫ заблокировать доступ гостевых ВМ
+в кластер Vitastor на сетевом уровне.
+
+Если вы используете VLAN-сети для ВМ - удостоверьтесь, что ВМ и гипервизор/сеть хранения помещены в разные
+изолированные друг от друга VLAN-ы.
+
+Если вы используете что-то более примитивное, например, мосты (bridge), вам, скорее всего, придётся вручную
+настроить iptables / межсетевой экран, чтобы разрешить доступ к Vitastor только с IP гипервизоров.
+
+Также в этом случае нужно будет переключить обычные мосты на "Bridged & Security Groups" и включить фильтр
+спуфинга IP в OpenNebula. Правда, реализация этого фильтра пока не полная, и она не блокирует доступ к
+локальным интерфейсам гипервизора. То есть, включённый фильтр спуфинга IP запрещает ВМ отправлять трафик
+с чужими IP к другим ВМ или во внешний мир, но не запрещает отправлять его напрямую гипервизору. Чтобы
+исправить это, тоже нужны дополнительные правила iptables.
+
+Таким образом, более-менее полная блокировка при использовании простой сети на сетевых мостах может
+выглядеть так (здесь `10.0.3.0/24` - подсеть ВМ, `10.0.2.0/24` - подсеть гипервизора):
+
+```
+# Разрешаем входящий трафик с физического устройства
+iptables -A INPUT -m physdev --physdev-in eth0 -j ACCEPT
+# Запрещаем трафик со всех ВМ, но с IP не из подсети ВМ
+iptables -A INPUT ! -s 10.0.3.0/24 -i onebr0 -j DROP
+# Запрещаем трафик от ВМ к сети гипервизора
+iptables -I FORWARD 1 -s 10.0.3.0/24 -d 10.0.2.0/24 -j DROP
+```
+
+## Тестирование
+
+Плагин OpenNebula по большей части состоит из bash-скриптов, и чтобы было понятнее, что они
+вообще делают - ниже приведены описания процедур, которыми можно протестировать каждый из них.
+
+| Скрипт                  | Описание                                      | Как протестировать                                                                   |
+| ----------------------- | --------------------------------------------- | ------------------------------------------------------------------------------------ |
+| vmm/kvm/deploy.vitastor | Запустить виртуальную машину                  | Создайте и запустите виртуальную машину с дисками Vitastor: постоянным / непостоянным / волатильным (временным). |
+| vmm/kvm/save.vitastor   | Сохранить снимок памяти ВМ                    | Остановите виртуальную машину командой "Остановить".                                 |
+| vmm/kvm/restore.vitastor| Восстановить снимок памяти ВМ                 | Запустите ВМ после остановки обратно.                                                |
+| datastore/clone         | Скопировать образ как "постоянный"            | Создайте шаблон ВМ и создайте из него постоянную ВМ.                                 |
+| datastore/cp            | Импортировать внешний образ                   | Импортируйте шаблон ВМ с образами дисков из Магазина OpenNebula.                     |
+| datastore/export        | Экспортировать образ как URL                  | Вероятно: экспортируйте шаблон ВМ с образами в Магазин.                              |
+| datastore/mkfs          | Создать образ с файловой системой             | Хранилище → Образы → Создать → Тип: базовый блок данных, Расположение: пустой образ диска, Файловая система: любая непустая. |
+| datastore/monitor       | Вывод статистики места в хранилище образов    | Проверьте статистику свободного/занятого места в списке хранилищ образов.            |
+| datastore/rm            | Удалить "постоянный" образ                    | Хранилище → Образы → Выберите образ → Удалить.                                       |
+| datastore/snap_delete   | Удалить снимок "постоянного" образа           | Хранилище → Образы → Выберите образ → Выберите снимок → Удалить; <br> Чтобы создать образ со снимком: подключите постоянный образ к ВМ, создайте снимок, отключите образ. |
+| datastore/snap_flatten  | Откатить образ к снимку, удалив другие снимки | Хранилище → Образы → Выберите образ → Выберите снимок → "Выровнять" (flatten).       |
+| datastore/snap_revert   | Откатить образ к снимку                       | Хранилище → Образы → Выберите образ → Выберите снимок → Откатить.                    |
+| datastore/stat          | Показать виртуальный размер образа в МБ       | Неизвестно. По-видимому, в плагинах Vitastor и Ceph не используется.                 |
+| tm/clone                | Клонировать "непостоянный" образ в диск ВМ    | Подключите "непостоянный" образ к ВМ.                                                |
+| tm/context              | Создать диск контекстуализации ВМ             | Создайте ВМ с контекстуализацией, как обычно. Но тестировать особенно нечего: в плагинах Vitastor и Ceph образ контекста хранится в локальной ФС гипервизора. |
+| tm/cpds                 | Копировать диск ВМ/его снимок в новый образ   | Выберите ВМ → Выберите диск → Опционально выберите снимок → "Сохранить как".         |
+| tm/delete               | Удалить диск-клон или волатильный диск ВМ     | Отключите волатильный или не-постоянный диск от ВМ.                                  |
+| tm/failmigrate          | Обработать неудачную миграцию                 | Тестировать нечего. Скрипт пуст в плагинах Vitastor и Ceph. В других плагинах скрипт должен откатывать действия tm/premigrate. |
+| tm/ln                   | Подключить "постоянный" образ к ВМ            | Тестировать нечего. Скрипт пуст в плагинах Vitastor и Ceph.                          |
+| tm/mkimage              | Создать волатильный диск, без или с ФС        | Подключите волатильный диск к ВМ, с или без файловой системы.                        |
+| tm/mkswap               | Создать волатильный диск подкачки             | Подключите волатильный диск к ВМ, форматированный как диск подкачки (swap).          |
+| tm/monitor              | Вывод статистики места в хранилище дисков ВМ  | Проверьте статистику свободного/занятого места в списке хранилищ дисков ВМ.          |
+| tm/mv                   | Мигрировать диск ВМ между хостами             | Мигрируйте ВМ между серверами. Правда, с точки зрения хранилища в плагинах Vitastor и Ceph этот скрипт ничего не делает. |
+| tm/mvds                 | Отключить "постоянный" образ от ВМ            | Тестировать нечего. Скрипт пуст в плагинах Vitastor и Ceph. В целом же скрипт обратный к tm/ln и в других хранилищах он может, например, копировать образ ВМ с диска гипервизора обратно в хранилище. |
+| tm/postbackup           | Выполняется после бэкапа                      | По-видимому, скрипт просто удаляет временные файлы после резервного копирования. Так что можно провести его и проверить, что на серверах не осталось временных файлов. |
+| tm/postbackup_live      | Выполняется после бэкапа запущенной ВМ        | То же, что tm/postbackup, но для запущенной ВМ.                                      |
+| tm/postmigrate          | Выполняется после миграции ВМ                 | Тестировать нечего. Однако, OpenNebula запускает скрипт только для системного хранилища, поэтому он вызывает аналогичные скрипты для хранилищ других дисков той же ВМ. Помимо этого в плагинах Vitastor и Ceph скрипт ничего не делает. |
+| tm/prebackup            | Выполнить резервное копирование дисков ВМ     | Создайте хранилище резервных копий типа "rsync" → Забэкапьте в него ВМ.              |
+| tm/prebackup_live       | То же самое для запущенной ВМ                 | То же, что tm/prebackup, но запускает fsfreeze/thaw (остановку доступа к дискам). Так что смысл теста - проведите резервное копирование и проверьте, что данные скопировались консистентно. |
+| tm/premigrate           | Выполняется перед миграцией ВМ                | Тестировать нечего. Аналогично tm/postmigrate запускается только для системного хранилища. |
+| tm/resize               | Изменить размер диска ВМ                      | Выберите ВМ → Выберите непостоянный диск → Измените его размер.                      |
+| tm/restore              | Восстановить диски ВМ из бэкапа               | Создайте хранилище резервных копий → Забэкапьте в него ВМ → Восстановите её обратно. |
+| tm/snap_create          | Создать снимок диска ВМ                       | Выберите ВМ → Выберите диск → Создайте снимок.                                       |
+| tm/snap_create_live     | Создать снимок диска запущенной ВМ            | Выберите запущенную ВМ → Выберите диск → Создайте снимок.                            |
+| tm/snap_delete          | Удалить снимок диска ВМ                       | Выберите ВМ → Выберите диск → Выберите снимок → Удалить.                             |
+| tm/snap_revert          | Откатить диск ВМ к снимку                     | Выберите ВМ → Выберите диск → Выберите снимок → Откатить.                            |
--- a/docs/installation/packages.en.md
+++ b/docs/installation/packages.en.md
@@ -16,8 +16,6 @@
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
  - Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
    stable version from 0.9.x branch instead of 1.x
- For Debian 10 (Buster) also enable backports repository:
-  `deb http://deb.debian.org/debian buster-backports main`
 - Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`

 ## CentOS
--- a/docs/installation/packages.ru.md
+++ b/docs/installation/packages.ru.md
@@ -16,8 +16,6 @@
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
  - Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
    установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
- Для Debian 10 (Buster) также включите репозиторий backports:
-  `deb http://deb.debian.org/debian buster-backports main`
 - Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`

 ## CentOS
--- a/docs/installation/proxmox.en.md
+++ b/docs/installation/proxmox.en.md
@@ -17,10 +17,10 @@ To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported
 - Restart pvedaemon: `systemctl restart pvedaemon`

 `/etc/pve/storage.cfg` example (the only required option is vitastor_pool, all others
-are listed below with their default values):
+are listed below with their default values; `vitastor_ssd` is Proxmox storage pool id):

 ```
-vitastor: vitastor
+vitastor: vitastor_ssd
    # pool to put new images into
    vitastor_pool testpool
    # path to the configuration file
--- a/docs/installation/proxmox.ru.md
+++ b/docs/installation/proxmox.ru.md
@@ -16,10 +16,10 @@
 - Перезапустите демон Proxmox: `systemctl restart pvedaemon`

 Пример `/etc/pve/storage.cfg` (единственная обязательная опция - vitastor_pool, все остальные
-перечислены внизу для понимания значений по умолчанию):
+перечислены внизу для понимания значений по умолчанию; `vitastor_ssd` - имя хранилища в Proxmox):

 ```
-vitastor: vitastor
+vitastor: vitastor_ssd
    # Пул, в который будут помещаться образы дисков
    vitastor_pool testpool
    # Путь к файлу конфигурации
--- a/docs/installation/source.en.md
+++ b/docs/installation/source.en.md
@@ -41,7 +41,7 @@ It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
 QEMU build process. To do that:
 - Install vitastor client library headers (from source or from vitastor-client-dev package)
 - Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source
- Copy `src/qemu_driver.c` to QEMU source directory as `block/vitastor.c`
+- Copy `src/client/qemu_driver.c` to QEMU source directory as `block/vitastor.c`
 - Build QEMU as usual

 But it is also possible to build it out-of-tree. To do that:
--- a/docs/installation/source.ru.md
+++ b/docs/installation/source.ru.md
@@ -41,7 +41,7 @@ cmake .. && make -j8 install
 Драйвер QEMU (qemu_driver.c) рекомендуется собирать вместе с самим QEMU. Для этого:
 - Установите заголовки клиентской библиотеки Vitastor (из исходников или из пакета vitastor-client-dev)
 - Возьмите соответствующий патч из `patches/qemu-*-vitastor.patch` и примените его к исходникам QEMU
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/vitastor.c`
+- Скопируйте [src/client/qemu_driver.c](../../src/client/qemu_driver.c) в директорию исходников QEMU как `block/vitastor.c`
 - Соберите QEMU как обычно

 Однако в целях отладки драйвер также можно собирать отдельно от QEMU. Для этого:
--- a/docs/intro/architecture.en.md
+++ b/docs/intro/architecture.en.md
@@ -11,14 +11,140 @@
 - [Differences from Ceph](#differences-from-ceph)
 - [Implementation Principles](#implementation-principles)

+## Server-side components
+
+- **OSD** (Object Storage Daemon) is a process that directly works with the disk, stores data
+  and serves read/write requests. One OSD serves one disk (or one partition). OSDs talk to etcd
+  and to each other — they receive cluster state from etcd, and send read/write requests for
+  secondary copies of data to other OSDs.
+- **etcd** — clustered key/value database, used as a reliable storage for configuration
+  and high-level cluster state. Etcd is the component that prevents splitbrain in the cluster.
+  Data blocks are not stored in etcd, etcd doesn't participate in data write or read path.
+- **Monitor** — a separate node.js based daemon which monitors the cluster, calculates
+  required configuration changes and saves them to etcd, thus commanding OSDs to apply these
+  changes. Monitor also aggregates cluster statistics. OSDs don't talk to the monitor; the monitor
+  only sends data to and receives data from etcd.
+
 ## Basic concepts

- OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
- PG (Placement Group) is a "shard" of the cluster, group of data stored on one set of replicas.
- Pool is a container for data that has equal redundancy scheme and placement rules.
- Monitor is a separate daemon that watches cluster state and handles failures.
- Failure Domain is a group of OSDs that you allow to fail. It's "host" by default.
- Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
+- **Pool** is a container for data that has equal redundancy scheme and disk placement rules.
+- **PG (Placement Group)** is a "shard" of the cluster, subdivision unit that has its own
+  set of OSDs for data storage.
+- **Failure Domain** is a group of OSDs, from the simultaneous failure of which you are
+  protected by Vitastor. Default failure domain is "host" (server), but you can choose a
+  larger (for example, a rack of servers) or smaller (a single drive) failure domain
+  for every pool.
+- **Placement Tree** (similar to Ceph CRUSH Tree) groups OSDs in a hierarchy to later
+  split them into Failure Domains.
+
+## Client-side components
+
+- **Client library** encapsulates client I/O logic. Client library connects to etcd and to all OSDs,
+  receives cluster state from etcd, sends read and write requests directly to all OSDs. Due
+  to the symmetric distributed architecture, all data blocks (each 128 KB by default) are placed
+  to different OSDs, but clients always know where each data block is stored and connect directly
+  to the right OSD.
+
+All other client-side components are based on the client library:
+
+- **[vitastor-cli](../usage/cli.en.md)** — command-line utility for cluster management.
+  Allows to view cluster state, manage pools and images, i.e. create, modify and remove
+  virtual disks, their snapshots and clones.
+- **[QEMU driver](../usage/qemu.en.md)** — pluggable QEMU module allowing QEMU/KVM virtual
+  machines to work with virtual Vitastor disks directly from userspace through the client library,
+  without the need to attach disks as kernel block devices. However, if you want to attach
+  disks, you can also do that with the same driver and [VDUSE](../usage/qemu.en.md#vduse).
+- **[vitastor-nbd](../usage/nbd.en.md)** — utility that allows to attach Vitastor disks as
+  kernel block devices using NBD (Network Block Device), which works more like "BUSE"
+  (Block Device In Userspace). Vitastor doesn't have Linux kernel modules for the same task
+  (at least by now). NBD is an older, non-recommended way to attach disks — you should use
+  VDUSE whenever you can.
+- **[CSI driver](../installation/kubernetes.en.md)** — driver for attaching Vitastor images
+  as Kubernetes persistent volumes. Works through VDUSE (when available) or NBD — images are
+  attached as kernel block devices and mounted into containers.
+- **Drivers for Proxmox, OpenStack and so on** — pluggable modules for corresponding systems,
+  allowing to use Vitastor as storage in them.
+- **[vitastor-nfs](../usage/nfs.en.md)** — NFS 3.0 server allowing export of two file system variants:
+  the first is a simplified pseudo-FS for file-based access to Vitastor block images (for non-QEMU
+  hypervisors with NFS support), the second is **VitastorFS**, full-featured clustered POSIX FS.
+  Both variants support parallel access from multiple vitastor-nfs servers. In fact, you are
+  not required to set up separate NFS servers at all and can instead use the `vitastor-nfs mount`
+  command on every client node — it starts the NFS server and mounts the FS locally.
+- **[fio driver](../usage/fio.en.md)** — pluggable module for fio disk benchmarking tool for
+  running performance tests on your Vitastor cluster.
+- **vitastor-kv** — client for a key-value DB working over shared block volumes (usual
+  vitastor images). VitastorFS metadata is stored in vitastor-kv.
+
+## Additional utilities
+
+- **vitastor-disk** — a disk management tool for Vitastor OSDs. You can use it to
+  create, remove, resize or move OSD partitions.
+
+## Overall read/write process
+
+- Vitastor stores virtual disks, also named "images" or "inodes".
+- Each image is stored in some pool. Pool specifies storage parameters such as redundancy
+  scheme (replication or EC — erasure codes, i.e. error correction codes), failure domain
+  and restrictions on OSD selection for image data placement. See [Pool configuration](../config/pool.en.md) for details.
+- Each image is split into objects/blocks of fixed size, equal to [block_size](../config/layout-cluster.en.md#block_size)
+  (128 KB by default), multiplied by data part count for EC or 1 for replicas. That is,
+  if a pool uses EC 4+2 coding scheme (4 data parts + 2 parity parts), then, with the
+  default block_size, images are split into 512 KB objects.
+- Client read/write requests are split into parts at object boundaries.
+- Each object is mapped to a PG number it belongs to, by simply taking a remainder of
+  division of its offset by PG count of the image's pool.
+- Client reads primary OSD for all PGs from etcd. Primary OSD for each PG is assigned
+  by the monitor during cluster operation, along with the full PG OSD set.
+- If not already connected, client connects to primary OSDs of all PGs involved in a
+  read/write request and sends parts of the request to them.
+- If a primary OSD is unavailable, client retries connection attempts indefinitely
+  either until it becomes available or until the monitor assigns another OSD as primary
+  for that PG.
+- Client also retries requests if the primary OSD replies with error code EPIPE, meaning
+  that the PG is inactive at this OSD at the moment - for example, when the primary OSD
+  is switched, or if the primary OSD itself loses connection to replicas during request
+  handling.
+- Primary OSD determines where the parts of the object are stored. By default, all objects
+  are assumed to be stored at the target OSD set of a PG, but some of them may be present
+  at a different OSD set if they are degraded or moved, or if the data rebalancing process
+  is active. The OSD doesn't do any network requests to check this: it calculates locations
+  of all objects during PG activation and stores them in memory.
+- Primary OSD handles the request locally when it can - for example, when it's a read
+  from a replicated pool or when it's a read from an EC pool involving only one data part
+  stored on the OSD's local disk.
+- When a request requires reads or writes to additional OSDs, primary OSD uses already
+  established connections to secondary OSDs of the PG to execute these requests. This happens
+  in parallel to local disk operations. All such connections are guaranteed to be already
+  established when the PG is active, and if any of them is dropped, PG is restarted and
+  all current read/write operations to it fail with EPIPE error and are retried by clients.
+- After completing all secondary read/write requests, primary OSD sends the response to
+  the client.
+
+### Nuances of request handling
+
+- If a pool uses erasure codes and some of the OSDs are unavailable, primary OSDs recover
+  data from the remaining parts during read.
+- Each object has a version number. During write, primary OSD first determines the current
+  version of the object. As primary OSD usually stores the object or its part itself, most
+  of the time version is read from the memory of the OSD itself. However, if primary OSD
+  doesn't contain parts of the object, it requests the version number from a secondary OSD
+  which has that part. Such request still doesn't involve reading from the disk though,
+  because object metadata, including version number, is always stored in OSD memory.
+- If a pool uses erasure codes, partial writes of an object require reading other parts of
+  it from secondary OSDs or from the local disk of the primary OSD itself. This is called
+  "read-modify-write" process.
+- If a pool uses erasure codes, two-phase write process is used to get rid of the Write Hole
+  problem: first a new version of object parts is written to all secondary OSDs without
+  removing the previous version, and then, after receiving successful write confirmations
+  from all OSDs, new version is committed and the old one is allowed to be removed.
+- If a pool doesn't use immediate_commit mode, then write requests sent by clients aren't
+  treated as committed to physical media instantly. Clients have to send a separate type of
+  request (SYNC) to commit changes, and until it is sent, new versions of data are
+  allowed to be lost if some OSDs die. Thus, when immediate_commit is disabled, clients
+  store copies of all write requests in memory and repeat them from there when the
+  connection to primary OSD is lost. This in-memory copy is removed after a successful
+  SYNC, and to prevent excessive memory usage, clients also do an automatic SYNC
+  every [client_dirty_limit](../config/network.en.md#client_dirty_limit) written bytes.

 ## Similarities to Ceph

--- a/docs/intro/architecture.ru.md
+++ b/docs/intro/architecture.ru.md
@@ -11,6 +11,7 @@
 - [Серверные компоненты](#серверные-компоненты)
 - [Базовые понятия](#базовые-понятия)
 - [Клиентские компоненты](#клиентские-компоненты)
+- [Дополнительные утилиты](#дополнительные-утилиты)
 - [Общий процесс записи и чтения](#общий-процесс-записи-и-чтения)
  - [Особенности обработки запросов](#особенности-обработки-запросов)
 - [Схожесть с Ceph](#схожесть-с-ceph)
@@ -34,8 +35,9 @@
 - **Пул (Pool)** — контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD.
 - **PG (Placement Group)** — "шард", единица деления пулов в кластере, которой назначается свой набор
  OSD для хранения данных (копий или частей объектов).
- **Домен отказа (Failure Domain)** — группа OSD, одновременное падение которых рассматривается
-  как вероятное. По умолчанию это "host" (сервер).
+- **Домен отказа (Failure Domain)** — группа OSD, от одновременного падения которых должен защищать
+  Vitastor. По умолчанию домен отказа — "host" (сервер), но вы можете установить для пула как больший
+  домен отказа (например, стойку серверов), так и меньший (например, отдельный диск).
 - **Дерево распределения** (Placement Tree, в Ceph CRUSH Tree) — иерархическая группировка OSD
  в узлы, которые далее можно использовать как домены отказа.

@@ -49,25 +51,39 @@

 На базе клиентской библиотеки реализованы все остальные клиенты:

- **vitastor-cli** — утилита командной строки для управления кластером. В данный момент позволяет
-  просматривать общее состояние кластера и управлять образами — т.е. создавать, менять и удалять
-  виртуальные диски, их снимки и клоны.
- **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
-  с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
-  библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
-  позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
- **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
-  с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
-  (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
-  (по крайней мере, пока).
- **CSI драйвер** — драйвер для подключения Vitastor-образов в виде персистентных томов (PV) Kubernetes.
-  Работает через vitastor-nbd — образы отражаются в виде блочных устройств и монтируются
-  в контейнеры.
+- **[vitastor-cli](../usage/cli.ru.md)** — утилита командной строки для управления кластером.
+  Позволяет просматривать общее состояние кластера, управлять пулами и образами — то есть
+  создавать, менять и удалять виртуальные диски, их снимки и клоны.
+- **[Драйвер QEMU](../usage/qemu.ru.md)** — подключаемый модуль QEMU, позволяющий QEMU/KVM
+  виртуальным машинам работать с виртуальными дисками Vitastor напрямую из пространства пользователя
+  с помощью клиентской библиотеки, без необходимости подключения дисков в виде блочных устройств
+  Linux. Если, однако, вы хотите подключать диски в виде блочных устройств, то вы тоже можете
+  сделать это с помощью того же самого драйвера и [VDUSE](../usage/qemu.ru.md#vduse).
+- **[vitastor-nbd](../usage/nbd.ru.md)** — утилита, позволяющая монтировать образы Vitastor
+  в виде блочных устройств с помощью NBD (Network Block Device), на самом деле скорее работающего
+  как "BUSE" (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в
+  Vitastor нет (по крайней мере, пока). NBD — более старый и нерекомендуемый способ подключения
+  дисков — вам следует использовать VDUSE всегда, когда это возможно.
+- **[CSI драйвер](../installation/kubernetes.ru.md)** — драйвер для подключения Vitastor-образов
+  в виде персистентных томов (PV) Kubernetes. Работает через VDUSE (если доступно) или через
+  NBD — образы отражаются в виде блочных устройств и монтируются в контейнеры.
 - **Драйвера Proxmox, OpenStack и т.п.** — подключаемые модули для соответствующих систем,
  позволяющие использовать Vitastor как хранилище в оных.
- **vitastor-nfs** — утилита, предоставляющая файловый доступ к образам в кластере Vitastor
-  по протоколу NFS 3.0. Предназначена для гипервизоров, не основанных на QEMU и Linux, но при
-  этом поддерживающих NFS.
+- **[vitastor-nfs](../usage/nfs.ru.md)** — NFS 3.0 сервер, предоставляющий два варианта файловой системы:
+  первая — упрощённая для файлового доступа к блочным образам (для не-QEMU гипервизоров, поддерживающих NFS),
+  вторая — VitastorFS, полноценная кластерная POSIX ФС. Оба варианта поддерживают параллельный
+  доступ с нескольких vitastor-nfs серверов. На самом деле можно вообще не выделять
+  отдельные NFS-серверы, а вместо этого использовать команду vitastor-nfs mount, запускающую
+  NFS-сервер прямо на клиентской машине и монтирующую ФС локально.
+- **[Драйвер fio](../usage/fio.ru.md)** — подключаемый модуль для утилиты тестирования
+  производительности дисков fio, позволяющий тестировать Vitastor-кластеры.
+- **vitastor-kv** — клиент для key-value базы данных, работающей поверх разделяемого блочного
+  образа (обычного блочного образа vitastor). Метаданные VitastorFS хранятся именно в vitastor-kv.
+
+## Дополнительные утилиты
+
+- **vitastor-disk** — утилита для разметки дисков под Vitastor OSD. С её помощью можно
+  создавать, удалять, менять размеры или перемещать разделы OSD.

 ## Общий процесс записи и чтения

@@ -98,16 +114,22 @@
  находиться на других OSD, если эти объекты деградированы или перемещены, или идёт процесс
  ребаланса. Запросы для проверки по сети не отправляются, информация о местоположении всех
  объектов рассчитывается первичным OSD при активации PG и хранится в памяти.
- Первичный OSD соединяется (если ещё не соединён) с вторичными OSD, на которых располагаются
-  части объекта, и отправляет им запросы чтения/записи, а также читает/пишет из/в своё локальное
-  хранилище, если сам входит в набор.
+- Когда это возможно, первичный OSD обрабатывает запрос локально. Например, так происходит
+  при чтениях объектов из пулов с репликацией или при чтении из EC пула, затрагивающего
+  только часть, хранимую на диске самого первичного OSD.
+- Когда запрос требует записи или чтения с вторичных OSD, первичный OSD использует заранее
+  установленные соединения с ними для выполнения этих запросов. Это происходит параллельно
+  локальным операциям чтения/записи с диска самого OSD. Так как соединения к вторичным OSD PG
+  устанавливаются при её запуске, то они уже гарантированно установлены, когда PG активна,
+  и если любое из этих соединений отключается, PG перезапускается, а все текущие запросы чтения
+  и записи в неё завершаются с ошибкой EPIPE, после чего повторяются клиентами.
 - После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту.

 ### Особенности обработки запросов

 - Если в пуле используются коды коррекции ошибок и при этом часть OSD недоступна, первичный
  OSD при чтении восстанавливает данные из оставшихся частей.
- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала читает из номер
+- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала получает номер
  версии объекта. Так как первичный OSD обычно сам хранит копию или часть объекта, номер
  версии обычно читается из памяти самого OSD. Однако, если ни одна часть обновляемого объекта
  не находится на первичном OSD, для получения номера версии он обращается к одному из вторичных
@@ -115,20 +137,20 @@
  так как метаданные объектов, включая номер версии, все OSD хранят в памяти.
 - Если в пуле используются коды коррекции ошибок, перед частичной записью объекта для вычисления
  чётности зачастую требуется чтение частей объекта с вторичных OSD или с локального диска
-  самого первичного OSD.
- Также, если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
+  самого первичного OSD. Это называется процессом "чтение-модификация-запись" (read-modify-write).
+- Если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
  двухфазный алгоритм записи: сначала на все вторичные OSD записывается новая версия частей
  объекта, но при этом старая версия не удаляется, а потом, после получения подтверждения
  успешной записи от всех вторичных OSD, новая версия фиксируется и разрешается удаление старой.
- Если в кластере не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
+- Если в пуле не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
  не считаются зафиксированными на физических накопителях сразу. Для фиксации данных клиенты
  должны отдельно отправлять запросы SYNC (отдельный от чтения и записи вид запроса),
  а пока такой запрос не отправлен, считается, что записанные данные могут исчезнуть,
  если соответствующий OSD упадёт. Поэтому, когда режим immediate_commit отключён, все
  запросы записи клиенты копируют в памяти и при потере соединения и повторном соединении
-  с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном fsync,
+  с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном SYNC,
  а чтобы хранение этих данных не приводило к чрезмерному потреблению памяти, клиенты
-  автоматически выполняют fsync каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
+  автоматически выполняют SYNC каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
  записанных байт.

 ## Схожесть с Ceph
--- a/docs/intro/features.en.md
+++ b/docs/intro/features.en.md
@@ -34,9 +34,15 @@
 - [Client write-back cache](../config/client.en.md#client_enable_writeback)
 - [Intelligent recovery auto-tuning](../config/osd.en.md#recovery_tune_interval)
 - [Clustered file system](../usage/nfs.en.md#vitastorfs)
+- [Experimental internal etcd replacement - antietcd](../config/monitor.en.md#use_antietcd)
+- [Built-in Prometheus metric exporter](../config/monitor.en.md#enable_prometheus)

 ## Plugins and tools

+- [Proxmox storage plugin and packages](../installation/proxmox.en.md)
+- [OpenNebula storage plugin](../installation/opennebula.en.md)
+- [CSI plugin for Kubernetes](../installation/kubernetes.en.md)
+- [OpenStack support: Cinder driver, Nova and libvirt patches](../installation/openstack.en.md)
 - [Debian and CentOS packages](../installation/packages.en.md)
 - [Image management CLI (vitastor-cli)](../usage/cli.en.md)
 - [Disk management CLI (vitastor-disk)](../usage/disk.en.md)
@@ -44,9 +50,6 @@
 - [Native QEMU driver](../usage/qemu.en.md)
 - [Loadable fio engine for benchmarks](../usage/fio.en.md)
 - [NBD proxy for kernel mounts](../usage/nbd.en.md)
- [CSI plugin for Kubernetes](../installation/kubernetes.en.md)
- [OpenStack support: Cinder driver, Nova and libvirt patches](../installation/openstack.en.md)
- [Proxmox storage plugin and packages](../installation/proxmox.en.md)
 - [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md#pseudo-fs)

 ## Roadmap
@@ -56,7 +59,6 @@ The following features are planned for the future:
 - Control plane optimisation
 - Other administrative tools
 - Web GUI
- OpenNebula plugin
 - iSCSI and NVMeoF gateways
 - Multi-threaded client
 - Faster failover
--- a/docs/intro/features.ru.md
+++ b/docs/intro/features.ru.md
@@ -36,9 +36,15 @@
 - [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
 - [Интеллектуальная автоподстройка скорости восстановления](../config/osd.ru.md#recovery_tune_interval)
 - [Кластерная файловая система](../usage/nfs.ru.md#vitastorfs)
+- [Экспериментальная встроенная замена etcd - antietcd](../config/monitor.ru.md#use_antietcd)
+- [Встроенный Prometheus-экспортер метрик](../config/monitor.ru.md#enable_prometheus)

 ## Драйверы и инструменты

+- [Плагин для Proxmox](../installation/proxmox.ru.md)
+- [Плагин для OpenNebula](../installation/opennebula.ru.md)
+- [CSI-плагин для Kubernetes](../installation/kubernetes.ru.md)
+- [Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt](../installation/openstack.ru.md)
 - [Пакеты для Debian и CentOS](../installation/packages.ru.md)
 - [Консольный интерфейс управления образами (vitastor-cli)](../usage/cli.ru.md)
 - [Инструмент управления дисками (vitastor-disk)](../usage/disk.ru.md)
@@ -46,9 +52,6 @@
 - [Драйвер диска для QEMU](../usage/qemu.ru.md)
 - [Драйвер диска для утилиты тестирования производительности fio](../usage/fio.ru.md)
 - [NBD-прокси для монтирования образов ядром](../usage/nbd.ru.md) ("блочное устройство в режиме пользователя")
- [CSI-плагин для Kubernetes](../installation/kubernetes.ru.md)
- [Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt](../installation/openstack.ru.md)
- [Плагин для Proxmox](../installation/proxmox.ru.md)
 - [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md#псевдо-фс)

 ## Планы развития
@@ -56,7 +59,6 @@
 - Оптимизация слоя управления
 - Другие инструменты администрирования
 - Web-интерфейс
- Плагин для OpenNebula
 - iSCSI и NVMeoF прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
--- a/docs/intro/quickstart.en.md
+++ b/docs/intro/quickstart.en.md
@@ -22,7 +22,7 @@
  with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
  [here](../config/layout-cluster.en.md#immediate_commit).
 - If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
-  Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
+  Toshiba MG, Seagate EXOS or something similar. If your drives don't have such cache then
  you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
 - Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
 - Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
@@ -32,8 +32,8 @@

 - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
 - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
-  Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
+  Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
+- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS

 ## Configure monitors

@@ -68,10 +68,6 @@ On the monitor hosts:
    but some free unpartitioned space must be available because the script creates new partitions for journals.
 - You can change OSD configuration in units or in `vitastor.conf`.
  Check [Configuration Reference](../config.en.md) for parameter descriptions.
- If all your drives have capacitors, and even if not, but if you ran `vitastor-disk`
-  without `--disable_data_fsync off` at the first step, then put the following
-  setting into etcd: \
-  `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`
 - Start all OSDs: `systemctl start vitastor.target`

 ## Create a pool
@@ -88,6 +84,10 @@ For EC pools the configuration should look like the following:
 vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
 ```

+Add `--immediate_commit none` if you added `--disable_data_fsync off` at the OSD
+initialization step, or if `vitastor-disk` complained that it was unable to
+disable the drive cache.
+
 After you do this, one of the monitors will configure PGs and OSDs will start them.

 If you use HDDs you should also add `"block_size": 1048576` to pool configuration.
--- a/docs/intro/quickstart.ru.md
+++ b/docs/intro/quickstart.ru.md
@@ -22,7 +22,7 @@
  использовать и десктопные SSD, включив режим отложенного fsync, но производительность будет хуже.
  О конденсаторах читайте [здесь](../config/layout-cluster.ru.md#immediate_commit).
 - Если хотите использовать HDD, берите современные модели с Media или SSD кэшем - HGST Ultrastar,
-  Toshiba MG08, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
+  Toshiba MG, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
  обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места).
 - Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2.
 - Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
@@ -32,8 +32,8 @@

 - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
 - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
-  Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
+  Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
+- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS

 ## Настройте мониторы

@@ -69,11 +69,6 @@
    для журналов, на SSD должно быть доступно свободное нераспределённое место.
 - Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Описания параметров
  смотрите в [справке по конфигурации](../config.ru.md).
- Если все ваши диски - серверные с конденсаторами, и даже если нет, но при этом
-  вы не добавляли опцию `--disable_data_fsync off` на первом шаге, а `vitastor-disk`
-  не ругался на невозможность отключения кэша дисков, пропишите следующую настройку
-  в глобальную конфигурацию в etcd: \
-  `etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'`.
 - Запустите все OSD: `systemctl start vitastor.target`

 ## Создайте пул
@@ -90,6 +85,10 @@ vitastor-cli create-pool testpool --pg_size 2 --pg_count 256
 vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
 ```

+Добавьте также опцию `--immediate_commit none`, если вы добавляли `--disable_data_fsync off`
+на этапе инициализации OSD, либо если `vitastor-disk` ругался на невозможность отключения
+кэша дисков.
+
 После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.

 Если вы используете HDD-диски, то добавьте в конфигурацию пулов опцию `"block_size": 1048576`.
@@ -123,4 +122,4 @@ vitastor-cli create -s 10G testimg
 Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
 а также кластерную файловую систему, то:

- [Следуйте инструкциям](../usage/nfs.en.md#vitastorfs)
+- [Следуйте инструкциям](../usage/nfs.ru.md#vitastorfs)
--- a/docs/usage/admin.en.md
+++ b/docs/usage/admin.en.md
@@ -42,7 +42,7 @@ PG state always includes exactly 1 of the following base states:
 - **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
  this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
  or the primary OSD refuses to start this PG (for example, because of wrong block_size),
-  or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/config/pgs` in etcd.
+  or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/pg/config` in etcd.
 - **starting** — primary OSD has acquired PG lock in etcd, PG is starting.
 - **peering** — primary OSD requests PG object listings from secondary OSDs and calculates
  the PG state.
@@ -107,16 +107,17 @@ If a PG is active it can also have any number of the following additional states

 ## Removing a healthy disk

-Befor removing a healthy disk from the cluster set its OSD weight(s) to 0 to
-move data away. To do that, add `"reweight":0` to etcd key `/vitastor/config/osd/<OSD_NUMBER>`.
-For example:
+Before removing a healthy disk from the cluster set its OSD weight(s) to 0 to
+move data away. To do that, run `vitastor-cli modify-osd --reweight 0 <OSD_NUMBER>`.
+
+Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
+
+Zero weight can also be put manually into etcd key `/vitastor/config/osd/<OSD_NUMBER>`, for example:

 ```
 etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
 ```

-Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
-
 ## Removing a failed disk

 If a disk is already dead, its OSD(s) are likely already stopped.
@@ -149,7 +150,7 @@ POOL_ID=1
 ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
    perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
 for i in $(seq 1 $PG_COUNT); do
-    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'; done
+    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
 done
 ```

@@ -168,21 +169,63 @@ Upgrading is performed without stopping clients (VMs/containers), you just need
 upgrade and restart servers one by one. However, ideally you should restart VMs too
 to make them use the new version of the client library.

-Exceptions (specific upgrade instructions):
- Upgrading <= 1.1.x to 1.2.0 or later, if you use EC n+k with k>=2, is recommended
-  to be performed with full downtime: first you should stop all clients, then all OSDs,
-  then upgrade and start everything back — because versions before 1.2.0 have several
-  bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
- Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
-  upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
-  without this intermediate step, client I/O will hang until the end of upgrade process.
- Upgrading from <= 0.5.x to >= 0.6.x is not supported.
+### 1.7.x to 1.8.0

-Rollback:
- Version 1.0.0 has a new disk format, so OSDs initiaziled on 1.0.0 can't be rolled
-  back to 0.9.x or previous versions.
- Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
-  start with 0.7.x or 0.6.x. :-)
+After upgrading version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
+(VMs and so on), otherwise they will hang when monitor clears old PG configuration key,
+which happens 24 hours after upgrade.
+
+This is fixed in 1.9.1. So, after upgrading version <= 1.7.x directly to version >= 1.9.1,
+you DO NOT have to restart all old clients immediately - they will work like before until
+you decide to upgrade them too. The downside is that you'll have to remove the old PG
+configuration key (`/vitastor/config/pgs`) from etcd by hand when you make sure that all
+your clients are restarted.
+
+### 1.1.x to 1.2.0
+
+Upgrading version <= 1.1.x to version >= 1.2.0, if you use EC n+k with k>=2, is recommended
+to be performed with full downtime: first you should stop all clients, then all OSDs,
+then upgrade and start everything back — because versions before 1.2.0 have several
+bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
+
+### 0.8.7 to 0.9.0
+
+Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
+upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
+without this intermediate step, client I/O will hang until the end of upgrade process.
+
+### 0.5.x to 0.6.x
+
+Upgrading from <= 0.5.x to >= 0.6.x is not supported.
+
+## Downgrade
+
+Downgrades are also allowed freely, except for the specific cases described below:
+
+### 1.8.0 to 1.7.1
+
+Before downgrading from version >= 1.8.0 to version <= 1.7.1
+you have to copy /vitastor/pg/config etcd key to /vitastor/config/pgs:
+
+```
+etcdctl --endpoints=http://... get --print-value-only /vitastor/pg/config | \
+  etcdctl --endpoints=http://... put /vitastor/config/pgs
+```
+
+Then you can just install older packages and restart all services.
+
+If you performed downgrade without first copying that key, run "add all OSDs into the
+history records of all PGs" from [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration).
+
+### 1.0.0 to 0.9.x
+
+Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 or later can't
+be rolled back to 0.9.x or previous versions.
+
+### 0.8.0 to 0.7.x
+
+Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
+start with older versions (0.4.x - 0.7.x). :-)

 ## OSD memory usage

--- a/docs/usage/admin.ru.md
+++ b/docs/usage/admin.ru.md
@@ -42,7 +42,7 @@
 - **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
  (если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
  назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
-  либо PG остановлена монитором через флаг `pause: true` в `/vitastor/config/pgs` в etcd.
+  либо PG остановлена монитором через флаг `pause: true` в `/vitastor/pg/config` в etcd.
 - **starting** — первичный OSD захватил блокировку PG в etcd, PG запускается.
 - **peering** — первичный OSD опрашивает вторичные OSD на предмет списков объектов данной PG и рассчитывает её состояние.
 - **repeering** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **peering**.
@@ -105,14 +105,16 @@ PG должны очень быстро переходить из них в др
 ## Удаление исправного диска

 Перед удалением исправного диска из кластера установите его OSD вес в 0, чтобы убрать с него данные.
-Для этого добавьте в ключ `/vitastor/config/osd/<НОМЕР_OSD>` в etcd значение `"reweight":0`, например:
+Для этого выполните команду `vitastor-cli modify-osd --reweight 0 <НОМЕР_OSD>`.
+
+Дождитесь завершения перебалансировки данных, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
+
+Также вес 0 можно прописать вручную прямо в etcd в ключ `/vitastor/config/osd/<НОМЕР_OSD>`, например:

 ```
 etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
 ```

-Дождитесь завершения ребаланса, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
-
 ## Удаление неисправного диска

 Если диск уже умер, его OSD, скорее всего, уже будет/будут остановлен(ы).
@@ -145,7 +147,7 @@ POOL_ID=1
 ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
    perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
 for i in $(seq 1 $PG_COUNT); do
-    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'; done
+    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
 done
 ```

@@ -164,21 +166,63 @@ done
 достаточно обновлять серверы по одному. Однако, конечно, чтобы запущенные виртуальные машины
 начали использовать новую версию клиентской библиотеки, их тоже нужно перезапустить.

-Исключения (особые указания при обновлении):
- Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
-  рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
-  потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
-  могли приводить к некорректному чтению данных в деградированных EC-пулах.
- Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
-  нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
-  Иначе клиентский ввод-вывод зависнет до завершения обновления.
- Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
+### 1.7.x -> 1.8.0

-Откат:
- В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
-  нельзя откатить до версии 0.9.x и более ранних.
- В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD нельзя откатить
-  до 0.7.x или 0.6.x. :-)
+После обновления с версий <= 1.7.x до версий >= 1.8.0, НО <= 1.9.0: перезапустите всех
+клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер),
+иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через
+24 часа после обновления.
+
+Однако, это исправлено в 1.9.1. Так что, если вы обновляетесь с <= 1.7.x сразу до >= 1.9.1,
+вам НЕ нужно сразу перезапускать всех клиентов - они будут работать, как раньше. Минус,
+правда, в том, что старый ключ конфигурации PG (`/vitastor/config/pgs`) вам нужно будет
+удалить из etcd вручную - после того, как вы убедитесь, что все клиенты перезапущены.
+
+### 1.1.x -> 1.2.0
+
+Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
+рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
+потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
+могли приводить к некорректному чтению данных в деградированных EC-пулах.
+
+### 0.8.7 -> 0.9.0
+
+Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
+нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
+Иначе клиентский ввод-вывод зависнет до завершения обновления.
+
+### 0.5.x -> 0.6.x
+
+Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
+
+## Откат версии
+
+Откат (понижение версии) тоже свободно разрешён, кроме указанных ниже случаев:
+
+### 1.8.0 -> 1.7.1
+
+Перед понижением версии с >= 1.8.0 до <= 1.7.1 вы должны скопировать ключ
+etcd `/vitastor/pg/config` в `/vitastor/config/pgs`:
+
+```
+etcdctl --endpoints=http://... get --print-value-only /vitastor/pg/config | \
+  etcdctl --endpoints=http://... put /vitastor/config/pgs
+```
+
+После этого можно просто установить более старые пакеты и перезапустить все сервисы.
+
+Если вы откатили версию, не скопировав предварительно этот ключ - выполните "добавление всех
+OSD в исторические записи всех PG" из раздела [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов).
+
+### 1.0.0 -> 0.9.x
+
+В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
+нельзя откатить до версии 0.9.x и более ранних.
+
+### 0.8.0 -> 0.7.x
+
+В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD не запустятся на
+более ранних версиях (0.4.x - 0.7.x). :-)

 ## Потребление памяти OSD

--- a/docs/usage/cli.en.md
+++ b/docs/usage/cli.en.md
@@ -16,6 +16,7 @@ It supports the following commands:
 - [create](#create)
 - [snap-create](#create)
 - [modify](#modify)
+- [dd](#dd)
 - [rm](#rm)
 - [flatten](#flatten)
 - [rm-data](#rm-data)
@@ -24,6 +25,10 @@ It supports the following commands:
 - [fix](#fix)
 - [alloc-osd](#alloc-osd)
 - [rm-osd](#rm-osd)
+- [osd-tree](#osd-tree)
+- [ls-osd](#ls-osd)
+- [modify-osd](#modify-osd)
+- [pg-list](#pg-list)
 - [create-pool](#create-pool)
 - [modify-pool](#modify-pool)
 - [ls-pools](#ls-pools)
@@ -144,19 +149,60 @@ You should resize file system in the image, if present, before shrinking it.
 * `-f|--force` - Proceed with shrinking or setting readwrite flag even if the image has children.
 * `--down-ok` - Proceed with shrinking even if some data will be left on unavailable OSDs.

+## dd
+
+```
+vitastor-cli dd [iimg=<image> | if=<file>] [oimg=<image> | of=<file>] [bs=1M] \
+    [count=N] [seek/oseek=N] [skip/iseek=M] [iodepth=N] [status=progress] \
+    [conv=nocreat,noerror,nofsync,trunc,nosparse] [iflag=direct] [oflag=direct,append]
+```
+
+Copy data between Vitastor images, files and pipes.
+
+Options can be specified in classic dd style (`key=value`) or like usual (`--key value`).
+
+| <!-- -->        | <!-- -->                                                                |
+|-----------------|-------------------------------------------------------------------------|
+| `iimg=<image>`  | Copy from Vitastor image `<image>`                                      |
+| `if=<file>`     | Copy from file `<file>`                                                 |
+| `oimg=<image>`  | Copy to Vitastor image `<image>`                                        |
+| `of=<file>`     | Copy to file `<file>`                                                   |
+| `bs=1M`         | Set copy block size                                                     |
+| `count=N`       | Copy only N input blocks. If N ends in B it counts bytes, not blocks    |
+| `seek/oseek=N`  | Skip N output blocks. If N ends in B it counts bytes, not blocks        |
+| `skip/iseek=N`  | Skip N input blocks. If N ends in B it counts bytes, not blocks         |
+| `iodepth=N`     | Send N reads or writes in parallel (default 4)                          |
+| `status=LEVEL`  | The LEVEL of information to print to stderr: none/noxfer/progress       |
+| `size=N`        | Specify size for the created output file/image (defaults to input size) |
+| `iflag=direct`  | For input files only: use direct I/O                                    |
+| `oflag=direct`  | For output files only: use direct I/O                                   |
+| `oflag=append`  | For files only: append to output file                                   |
+| `conv=nocreat`  | Do not create output file/image                                         |
+| `conv=trunc`    | Truncate output file/image                                              |
+| `conv=noerror`  | Continue copying after errors                                           |
+| `conv=nofsync`  | Do not call fsync before finishing (default behaviour is fsync)         |
+| `conv=nosparse` | Write all output blocks including all-zero blocks                       |
+
 ## rm

 `vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`

-Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
-rebasing all their children accordingly. --writers-stopped allows merging to be a bit
-more effective in case of a single 'slim' read-write child and 'fat' removed parent:
-the child is merged into parent and parent is renamed to child in that case.
-In other cases parent layers are always merged into children.
+`vitastor-cli rm (--exact|--matching) <glob> ...`

-Other options:
+Remove layer(s) and rebase all their children accordingly.

-* `--down-ok` - Continue deletion/merging even if some data will be left on unavailable OSDs.
+In the first form, remove `<from>` or layers between `<from>` and its child `<to>`.
+
+In the second form, remove all images with exact or pattern-matched names.
+
+Options:
+
+* `--writers-stopped` allows optimised removal in case of a single 'slim' read-write
+  child and 'fat' removed parent: the child is merged into parent and parent is renamed
+  to child in that case. In other cases parent layers are always merged into children.
+* `--exact` - remove multiple images with given names.
+* `--matching` - remove multiple images with names matching given glob patterns.
+* `--down-ok` - continue deletion/merging even if some data will be left on unavailable OSDs.

 ## flatten

@@ -174,6 +220,7 @@ Remove inode data without changing metadata.
 --wait-list   Retrieve full objects listings before starting to remove objects.
              Requires more memory, but allows to show correct removal progress.
 --min-offset  Purge only data starting with specified offset.
+--max-offset  Purge only data before specified offset.
 ```

 ## merge-data
@@ -246,6 +293,82 @@ Refuses to remove OSDs with data without `--force` and `--allow-data-loss`.
 With `--dry-run` only checks if deletion is possible without data loss and
 redundancy degradation.

+## osd-tree
+
+`vitastor-cli osd-tree [-l|--long]`
+
+Show current OSD tree, optionally with I/O statistics if -l is specified.
+
+Example output:
+
+```
+TYPE     NAME       UP    SIZE  USED%    TAGS          WEIGHT  BLOCK  BITMAP  IMM   NOOUT
+host     kaveri
+  disk   nvme0n1p1
+    osd  3          down  100G  0 %      abc,kaveri    1       128k   4k      none  -
+    osd  4          down  100G  0 %                    1       128k   4k      none  -
+  disk   nvme1n1p1
+    osd  5          down  100G  0 %      abc,kaveri    1       128k   4k      none  -
+    osd  6          down  100G  0 %                    1       128k   4k      none  -
+host     stump
+  osd    1          up    100G  37.29 %  osdone        1       128k   4k      all   -
+  osd    2          up    100G  26.8 %   abc           1       128k   4k      all   -
+  osd    7          up    100G  21.84 %                1       128k   4k      all   -
+  osd    8          up    100G  21.63 %                1       128k   4k      all   -
+  osd    9          up    100G  20.69 %                1       128k   4k      all   -
+  osd    10         up    100G  21.61 %                1       128k   4k      all   -
+  osd    11         up    100G  21.53 %                1       128k   4k      all   -
+  osd    12         up    100G  22.4 %                 1       128k   4k      all   -
+```
+
+## ls-osd
+
+`vitastor-cli osds|ls-osd|osd-ls [-l|--long]`
+
+Show current OSDs as list, optionally with I/O statistics if -l is specified.
+
+Example output:
+
+```
+OSD  PARENT            UP    SIZE  USED%    TAGS          WEIGHT  BLOCK  BITMAP  IMM   NOOUT
+3    kaveri/nvme0n1p1  down  100G  0 %      globl,kaveri  1       128k   4k      none  -
+4    kaveri/nvme0n1p1  down  100G  0 %                    1       128k   4k      none  -
+5    kaveri/nvme1n1p1  down  100G  0 %      globl,kaveri  1       128k   4k      none  -
+6    kaveri/nvme1n1p1  down  100G  0 %                    1       128k   4k      none  -
+1    stump             up    100G  37.29 %  osdone        1       128k   4k      all   -
+2    stump             up    100G  26.8 %   globl         1       128k   4k      all   -
+7    stump             up    100G  21.84 %                1       128k   4k      all   -
+8    stump             up    100G  21.63 %                1       128k   4k      all   -
+9    stump             up    100G  20.69 %                1       128k   4k      all   -
+10   stump             up    100G  21.61 %                1       128k   4k      all   -
+11   stump             up    100G  21.53 %                1       128k   4k      all   -
+12   stump             up    100G  22.4 %                 1       128k   4k      all   -
+```
+
+## modify-osd
+
+`vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>`
+
+Set OSD reweight, tags or noout flag. See detailed description in [OSD config documentation](../config/pool.en.md#osd-settings).
+
+## pg-list
+
+`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
+
+List PGs matching any of the listed state filters (a leading ^ or ! negates a filter). Options:
+
+```
+--pool <pool name or number>  Only list PGs of the given pool.
+--min <min pg number>         Only list PGs with number >= min.
+--max <max pg number>         Only list PGs with number <= max.
+```
+
+Examples:
+
+`vitastor-cli pg-list active+degraded`
+
+`vitastor-cli pg-list ^active`
+
 ## create-pool

 `vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@@ -17,12 +17,17 @@ vitastor-cli - интерфейс командной строки для адм
 - [create](#create)
 - [snap-create](#create)
 - [modify](#modify)
+- [dd](#dd)
 - [rm](#rm)
 - [flatten](#flatten)
 - [rm-data](#rm-data)
 - [merge-data](#merge-data)
 - [alloc-osd](#alloc-osd)
 - [rm-osd](#rm-osd)
+- [osd-tree](#osd-tree)
+- [ls-osd](#ls-osd)
+- [modify-osd](#modify-osd)
+- [pg-list](#pg-list)
 - [create-pool](#create-pool)
 - [modify-pool](#modify-pool)
 - [ls-pools](#ls-pools)
@@ -147,23 +152,61 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 * `-f|--force` - Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны.
 * `--down-ok` - Разрешить уменьшение, даже если часть данных останется неудалённой на недоступных OSD.

+## dd
+
+```
+vitastor-cli dd [iimg=<image> | if=<file>] [oimg=<image> | of=<file>] [bs=1M] \
+    [count=N] [seek/oseek=N] [skip/iseek=M] [iodepth=N] [status=progress] \
+    [conv=nocreat,noerror,nofsync,trunc,nosparse] [iflag=direct] [oflag=direct,append]
+```
+
+Копировать данные между образами Vitastor, файлами и каналами.
+
+Опции можно передавать в классическом стиле dd (`key=value`) или как обычно (`--key value`).
+
+| <!-- -->        | <!-- -->                                                                |
+|-----------------|-------------------------------------------------------------------------|
+| `iimg=<image>`  | Копировать из образа Vitastor `<image>`                                 |
+| `if=<file>`     | Копировать из файла `<file>`                                            |
+| `oimg=<image>`  | Копировать в образ Vitastor `<image>`                                   |
+| `of=<file>`     | Копировать в файл `<file>`                                              |
+| `bs=1M`         | Задать размер блока копирования                                         |
+| `count=N`       | Копировать не более N блоков. Если N заканчивается на B - то N байт.    |
+| `seek/oseek=N`  | Пропустить N выходных блоков. Если N заканчивается на B - то N байт.    |
+| `skip/iseek=N`  | Пропустить N входных блоков. Если N заканчивается на B - то N байт.     |
+| `iodepth=N`     | Отправлять N чтений/записей параллельно (по умолчанию 4).               |
+| `status=LEVEL`  | Уровень вывода в консоль: none/noxfer/progress                          |
+| `size=N`        | Задать размер выходного файла/образа (по умолчанию равен размеру входа).|
+| `iflag=direct`  | Только для входного файла: использовать прямой ввод-вывод               |
+| `oflag=direct`  | Только для выходного файла: использовать прямой ввод-вывод              |
+| `oflag=append`  | Только для файлов: дописывать в конец выходного файла                   |
+| `conv=nocreat`  | Не создавать выходной файл/образ                                        |
+| `conv=trunc`    | Обрезать выходной файл/образ до размера входа                           |
+| `conv=noerror`  | Продолжать копирование после ошибок                                     |
+| `conv=nofsync`  | Не вызывать fsync перед завершением                                     |
+| `conv=nosparse` | Записывать все выходные блоки, включая пустые                           |
+
 ## rm

 `vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`

-Удалить образ `<from>` или все слои от `<from>` до `<to>` (`<to>` должен быть дочерним
-образом `<from>`), одновременно меняя родительские образы их клонов (если таковые есть).
+`vitastor-cli rm (--exact|--matching) <glob> ...`

-`--writers-stopped` позволяет чуть более эффективно удалять образы в частом случае, когда
-у удаляемой цепочки есть только один дочерний образ, содержащий небольшой объём данных.
-В этом случае дочерний образ вливается в родительский и удаляется, а родительский
-переименовывается в дочерний.
+Удалить образ(ы), корректно перебазируя их дочерние образы.

-В других случаях родительские слои вливаются в дочерние.
+В первой форме удаляет один образ `<from>` или все слои между `<from>` и его дочерним `<to>`.

-Другие опции:
+Во второй форме, удаляет все образы с точными именами или именами, подходящими под шаблон(ы).

-* `--down-ok` - Продолжать удаление/слияние, даже если часть данных останется неудалённой на недоступных OSD.
+Опции:
+
+* `--writers-stopped` позволяет чуть более эффективно удалять образы в частом случае, когда
+  у удаляемой цепочки есть только один дочерний образ, содержащий небольшой объём данных.
+  В этом случае дочерний образ вливается в родительский и удаляется, а родительский
+  переименовывается в дочерний.
+* `--exact` - удалить все образы с точно заданными именами.
+* `--matching` - удалить все образы с именами, подходящими под переданные glob-шаблоны.
+* `--down-ok` - продолжать удаление/слияние, даже если часть данных останется неудалённой на недоступных OSD.

 ## flatten

@@ -182,6 +225,7 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 --wait-list   Сначала запросить полный листинг объектов, а потом начать удалять.
              Требует больше памяти, но позволяет правильно печатать прогресс удаления.
 --min-offset  Удалять только данные, начиная с заданного смещения.
+--max-offset  Удалять только данные до (исключительно) заданного смещения.
 ```

 ## merge-data
@@ -263,6 +307,83 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 С опцией `--dry-run` только проверяет, возможно ли удаление без потери данных и деградации
 избыточности.

+## osd-tree
+
+`vitastor-cli osd-tree [-l|--long]`
+
+Показать дерево OSD, со статистикой ввода-вывода, если установлено -l.
+
+Пример вывода:
+
+```
+TYPE     NAME       UP    SIZE  USED%    TAGS          WEIGHT  BLOCK  BITMAP  IMM   NOOUT
+host     kaveri
+  disk   nvme0n1p1
+    osd  3          down  100G  0 %      globl,kaveri  1       128k   4k      none  -
+    osd  4          down  100G  0 %                    1       128k   4k      none  -
+  disk   nvme1n1p1
+    osd  5          down  100G  0 %      globl,kaveri  1       128k   4k      none  -
+    osd  6          down  100G  0 %                    1       128k   4k      none  -
+host     stump
+  osd    1          up    100G  37.29 %  osdone        1       128k   4k      all   -
+  osd    2          up    100G  26.8 %   globl         1       128k   4k      all   -
+  osd    7          up    100G  21.84 %                1       128k   4k      all   -
+  osd    8          up    100G  21.63 %                1       128k   4k      all   -
+  osd    9          up    100G  20.69 %                1       128k   4k      all   -
+  osd    10         up    100G  21.61 %                1       128k   4k      all   -
+  osd    11         up    100G  21.53 %                1       128k   4k      all   -
+  osd    12         up    100G  22.4 %                 1       128k   4k      all   -
+```
+
+## ls-osd
+
+`vitastor-cli osds|ls-osd|osd-ls [-l|--long]`
+
+Показать список OSD, со статистикой ввода-вывода, если установлено -l.
+
+Пример вывода:
+
+```
+OSD  PARENT            UP    SIZE  USED%    TAGS          WEIGHT  BLOCK  BITMAP  IMM   NOOUT
+3    kaveri/nvme0n1p1  down  100G  0 %      globl,kaveri  1       128k   4k      none  -
+4    kaveri/nvme0n1p1  down  100G  0 %                    1       128k   4k      none  -
+5    kaveri/nvme1n1p1  down  100G  0 %      globl,kaveri  1       128k   4k      none  -
+6    kaveri/nvme1n1p1  down  100G  0 %                    1       128k   4k      none  -
+1    stump             up    100G  37.29 %  osdone        1       128k   4k      all   -
+2    stump             up    100G  26.8 %   globl         1       128k   4k      all   -
+7    stump             up    100G  21.84 %                1       128k   4k      all   -
+8    stump             up    100G  21.63 %                1       128k   4k      all   -
+9    stump             up    100G  20.69 %                1       128k   4k      all   -
+10   stump             up    100G  21.61 %                1       128k   4k      all   -
+11   stump             up    100G  21.53 %                1       128k   4k      all   -
+12   stump             up    100G  22.4 %                 1       128k   4k      all   -
+```
+
+## modify-osd
+
+`vitastor-cli modify-osd [--tags tag1,tag2,...] [--reweight <number>] [--noout true/false] <osd_number>`
+
+Установить вес OSD, теги или флаг noout. Смотрите подробное описание в [документации настроек OSD](../config/pool.ru.md#настройки-osd).
+
+## pg-list
+
+`vitastor-cli pg-list|pg-ls|list-pg|ls-pg|ls-pgs [OPTIONS] [state1+state2] [^state3] [...]`
+
+Вывести список PG с состояниями, удовлетворяющими любому из переданных фильтров (^ или !
+в начале фильтра означает отрицание). Опции:
+
+```
+--pool <pool name or number>  Выводить только PG заданного пула.
+--min <min pg number>         Выводить только PG с номером >= min.
+--max <max pg number>         Выводить только PG с номером <= max.
+```
+
+Примеры:
+
+`vitastor-cli pg-list active+degraded`
+
+`vitastor-cli pg-list ^active`
+
 ## create-pool

 `vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
--- a/docs/usage/disk.en.md
+++ b/docs/usage/disk.en.md
@@ -13,6 +13,7 @@ It supports the following commands:
 - [prepare](#prepare)
 - [upgrade-simple](#upgrade-simple)
 - [resize](#resize)
+- [raw-resize](#raw-resize)
 - [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
 - [purge](#purge)
 - [read-sb](#read-sb)
@@ -50,12 +51,16 @@ Options (automatic mode):
 --osd_per_disk <N>
  Create <N> OSDs on each disk (default 1)
 --hybrid
-  Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for
-  journals and metadata, HDDs will be used for data. Partitions for journals and
-  metadata will be created automatically. Whether disks are SSD or HDD is decided
-  by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object
-  size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,
-  and throttle_small_writes is enabled by default.
+  Prepare hybrid (HDD+SSD, NVMe+SATA, etc.) OSDs using provided devices. By default,
+  any passed SSDs will be used for journals and metadata, HDDs will be used for data,
+  but you can override this behaviour with --fast-devices option. Journal and metadata
+  partitions will be created automatically. In the default mode, SSD and HDD disks
+  are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used
+  for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal
+  size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.
+--fast-devices /dev/nvmeX,/dev/nvmeY
+  In --hybrid mode, use these devices for journal and metadata instead of auto-detecting
+  and extracting them from the main [devices...] list.
 --disable_data_fsync auto
  Disable data device cache and fsync (1/yes/true = on, default auto)
 --disable_meta_fsync auto
@@ -127,25 +132,49 @@ Requires the `sfdisk` utility.

 ## resize

-`vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
+`vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]`

-Resize data area and/or rewrite/move journal and metadata.
+Resize data area and/or move journal and metadata:
+
+| <!-- -->                  | <!-- -->                               |
+|---------------------------|----------------------------------------|
+| `--move-journal TARGET`   | move journal to `TARGET`               |
+| `--move-meta TARGET`      | move metadata to `TARGET`              |
+| `--journal-size NEW_SIZE` | resize journal to `NEW_SIZE`           |
+| `--data-size NEW_SIZE`    | resize data device to `NEW_SIZE`       |
+| `--dry-run`               | only show new layout, do not apply it  |
+
+`NEW_SIZE` may include k/m/g/t suffixes.
+
+`TARGET` may be one of:
+
+| <!-- -->       | <!-- -->                                                                 |
+|----------------|--------------------------------------------------------------------------|
+| `<partition>`  | move journal/metadata to an existing GPT partition                       |
+| `<raw_device>` | create a GPT partition on `<raw_device>` and move journal/metadata to it |
+| `""`           | (empty string) move journal/metadata back to the data device             |
+
+## raw-resize
+
+`vitastor-disk raw-resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
+
+Resize data area and/or rewrite/move journal and metadata (manual format).

 `ALL_OSD_PARAMETERS` must include all (at least all disk-related)
 parameters from OSD command line (i.e. from systemd unit or superblock).

 `NEW_LAYOUT` may include new disk layout parameters:

-```
--new_data_offset SIZE     resize data area so it starts at SIZE
--new_data_len SIZE        resize data area to SIZE bytes
--new_meta_device PATH     use PATH for new metadata
--new_meta_offset SIZE     make new metadata area start at SIZE
--new_meta_len SIZE        make new metadata area SIZE bytes long
--new_journal_device PATH  use PATH for new journal
--new_journal_offset SIZE  make new journal area start at SIZE
--new_journal_len SIZE     make new journal area SIZE bytes long
-```
+| <!-- -->                    | <!-- -->                                  |
+|-----------------------------|-------------------------------------------|
+| `--new_data_offset SIZE`    | resize data area so it starts at `SIZE`   |
+| `--new_data_len SIZE`       | resize data area to `SIZE` bytes          |
+| `--new_meta_device PATH`    | use `PATH` for new metadata               |
+| `--new_meta_offset SIZE`    | make new metadata area start at `SIZE`    |
+| `--new_meta_len SIZE`       | make new metadata area `SIZE` bytes long  |
+| `--new_journal_device PATH` | use `PATH` for new journal                |
+| `--new_journal_offset SIZE` | make new journal area start at `SIZE`     |
+| `--new_journal_len SIZE`    | make new journal area `SIZE` bytes long   |

 SIZE may include k/m/g/t suffixes. If any of the new layout parameter
 options are not specified, old values will be used.
@@ -217,10 +246,14 @@ Intended for use from startup scripts (i.e. from systemd units).

 ## dump-journal

+`vitastor-disk dump-journal [OPTIONS] <osd_device>`
+
 `vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>`

 Dump journal in human-readable or JSON (if `--json` is specified) format.

+You can specify any OSD device (data, metadata or journal), or the layout manually.
+
 Options:

 ```
@@ -233,23 +266,35 @@ Options:

 ## write-journal

+`vitastor-disk write-journal <osd_device>`
+
 `vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>`

 Write journal from JSON taken from standard input in the same format as produced by
 `dump-journal --json --format data`.

+You can specify any OSD device (data, metadata or journal), or the layout manually.
+
 ## dump-meta

+`vitastor-disk dump-meta <osd_device>`
+
 `vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>`

 Dump metadata in JSON format.

+You can specify any OSD device (data, metadata or journal), or the layout manually.
+
 ## write-meta

+`vitastor-disk write-meta <osd_device>`
+
 `vitastor-disk write-meta <meta_file> <offset> <size>`

 Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`.

+You can specify any OSD device (data, metadata or journal), or the layout manually.
+
 ## simple-offsets

 `vitastor-disk simple-offsets <device>`
--- a/docs/usage/disk.ru.md
+++ b/docs/usage/disk.ru.md
@@ -13,6 +13,7 @@ vitastor-disk - инструмент командной строки для уп
 - [prepare](#prepare)
 - [upgrade-simple](#upgrade-simple)
 - [resize](#resize)
+- [raw-resize](#raw-resize)
 - [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
 - [purge](#purge)
 - [read-sb](#read-sb)
@@ -50,12 +51,17 @@ vitastor-disk - инструмент командной строки для уп
 --osd_per_disk <N>
  Создавать по несколько (<N>) OSD на каждом диске (по умолчанию 1)
 --hybrid
-  Инициализировать гибридные (HDD+SSD) OSD на указанных дисках. SSD будут
-  использованы для журналов и метаданных, а HDD - для данных. Разделы для журналов
-  и метаданных будут созданы автоматически. Является ли диск SSD или HDD, определяется
-  по флагу `/sys/block/.../queue/rotational`. В гибридном режиме по умолчанию
-  используется размер объекта 1 МБ вместо 128 КБ, размер журнала 1 ГБ вместо 32 МБ
-  и включённый throttle_small_writes.
+  Инициализировать гибридные (HDD+SSD, NVMe+SATA и т.п.) OSD на указанных дисках.
+  По умолчанию, SSD будут использованы для журналов и метаданных, а HDD - для данных,
+  но вы можете поменять это поведение опцией --fast-devices. Разделы для журналов
+  и метаданных будут созданы автоматически. В режиме по умолчанию SSD и HDD-диски
+  различаются по флагу `/sys/block/.../queue/rotational`. Когда в гибридном режиме
+  для данных используются HDD, по умолчанию размер блока устанавливается 1 МБ вместо
+  128 КБ, размер журнала 1 ГБ вместо 32 МБ, и throttle_small_writes включается по
+  умолчанию.
+--fast-devices /dev/nvmeX,/dev/nvmeY
+  Использовать данные диски для журналов и метаданных в гибридном режиме вместо их
+  автоопределения и извлечения из основного списка [devices...].
 --disable_data_fsync auto
  Отключать кэш и fsync-и для устройств данных. (1/yes/true = да, по умолчанию автоопределение)
 --disable_meta_fsync auto
@@ -129,27 +135,51 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.

 ## resize

-`vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
+`vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]`

-Изменить размер области данных и/или переместить журнал и метаданные.
+Изменить размер области данных и/или переместить журнал и метаданные:

-В `ALL_OSD_PARAMETERS` нужно указать все относящиеся к диску параметры OSD
+| <!-- -->                      | <!-- -->                                       |
+|-------------------------------|------------------------------------------------|
+| `--move-journal ЦЕЛЬ`         | переместить журнал на `ЦЕЛЬ`                   |
+| `--move-meta ЦЕЛЬ`            | переместить метаданные на `ЦЕЛЬ`               |
+| `--journal-size НОВЫЙ_РАЗМЕР` | изменить размер журнала на `НОВЫЙ_РАЗМЕР`      |
+| `--data-size НОВЫЙ_РАЗМЕР`    | изменить размер диска данных на `НОВЫЙ_РАЗМЕР` |
+| `--dry-run`                   | показать новые параметры, но не применять их   |
+
+`НОВЫЙ_РАЗМЕР` может быть указан с суффиксами k/m/g/t (кило/мега/гига/терабайт).
+
+`ЦЕЛЬ` может быть одним из:
+
+| <!-- -->        | <!-- -->                                                                            |
+|-----------------|-------------------------------------------------------------------------------------|
+| `<раздел>`      | переместить журнал/метаданные на существующий GPT-раздел                            |
+| `<полный_диск>` | создать GPT-раздел на диске `<полный_диск>` и переместить журнал/метаданные на него |
+| `""`            | (пустая строка) переместить журнал/метаданные обратно на диск данных                |
+
+## raw-resize
+
+`vitastor-disk raw-resize <ВСЕ_ПАРАМЕТРЫ_OSD> <НОВЫЕ_РАЗМЕРЫ> [--iodepth 32]`
+
+Изменить размер области данных и/или переместить журнал и метаданные (ручной формат).
+
+В `ВСЕ_ПАРАМЕТРЫ_OSD` нужно указать все относящиеся к диску параметры OSD
 из суперблока OSD или из файла сервиса systemd (в старых версиях).

-В `NEW_LAYOUT` нужно указать новые параметры расположения данных:
+В `НОВЫЕ_РАЗМЕРЫ` нужно указать новые параметры расположения данных:

-```
--new_data_offset РАЗМЕР     сдвинуть начало области данных на РАЗМЕР байт
--new_data_len РАЗМЕР        изменить размер области данных до РАЗМЕР байт
--new_meta_device ПУТЬ       использовать ПУТЬ как новое устройство метаданных
--new_meta_offset РАЗМЕР     разместить новые метаданные по смещению РАЗМЕР байт
--new_meta_len РАЗМЕР        сделать новые метаданные размером РАЗМЕР байт
--new_journal_device ПУТЬ    использовать ПУТЬ как новое устройство журнала
--new_journal_offset РАЗМЕР  разместить новый журнал по смещению РАЗМЕР байт
--new_journal_len РАЗМЕР     сделать новый журнал размером РАЗМЕР байт
-```
+| <!-- -->                      | <!-- -->                                              |
+|-------------------------------|-------------------------------------------------------|
+| `--new_data_offset РАЗМЕР`    | сдвинуть начало области данных на `РАЗМЕР` байт       |
+| `--new_data_len РАЗМЕР`       | изменить размер области данных до `РАЗМЕР` байт       |
+| `--new_meta_device ПУТЬ`      | использовать `ПУТЬ` как новое устройство метаданных   |
+| `--new_meta_offset РАЗМЕР`    | разместить новые метаданные по смещению `РАЗМЕР` байт |
+| `--new_meta_len РАЗМЕР`       | сделать новые метаданные размером `РАЗМЕР` байт       |
+| `--new_journal_device ПУТЬ`   | использовать `ПУТЬ` как новое устройство журнала      |
+| `--new_journal_offset РАЗМЕР` | разместить новый журнал по смещению `РАЗМЕР` байт     |
+| `--new_journal_len РАЗМЕР`    | сделать новый журнал размером `РАЗМЕР` байт           |

-РАЗМЕР может быть указан с суффиксами k/m/g/t. Если любой из новых параметров
+`РАЗМЕР` может быть указан с суффиксами k/m/g/t. Если любой из новых параметров
 расположения не указан, он принимается равным старому значению.

 ## start/stop/restart/enable/disable
@@ -224,10 +254,15 @@ OSD отключены fsync-и.

 ## dump-journal

+`vitastor-disk dump-journal [OPTIONS] <osd_device>`
+
 `vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>`

 Вывести журнал в человекочитаемом или в JSON (с опцией `--json`) виде.

+Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
+параметры расположения вручную.
+
 Опции:

 ```
@@ -240,22 +275,37 @@ OSD отключены fsync-и.

 ## write-journal

+`vitastor-disk write-journal <osd_device>`
+
 `vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>`

 Записать журнал из JSON со стандартного ввода в формате, аналогичном `dump-journal --json --format data`.

+Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
+параметры расположения вручную.
+
 ## dump-meta

+`vitastor-disk dump-meta <osd_device>`
+
 `vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>`

 Вывести метаданные в формате JSON.

+Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
+параметры расположения вручную.
+
 ## write-meta

+`vitastor-disk write-meta <osd_device>`
+
 `vitastor-disk write-meta <meta_file> <offset> <size>`

 Записать метаданные из JSON со стандартного ввода в формате, аналогичном `dump-meta`.

+Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
+параметры расположения вручную.
+
 ## simple-offsets

 `vitastor-disk simple-offsets <device>`
--- a/docs/usage/nfs.en.md
+++ b/docs/usage/nfs.en.md
@@ -11,6 +11,8 @@ Vitastor has two file system implementations. Both can be used via `vitastor-nfs
 Commands:
 - [mount](#mount)
 - [start](#start)
+- [upgrade](#upgrade)
+- [defrag](#defrag)

 ## Pseudo-FS

@@ -86,10 +88,6 @@ POSIX features currently not implemented in VitastorFS:
 - Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)

 Other notable missing features which should be addressed in the future:
- Defragmentation of "shared" inodes. Files smaller than pool object size (block_size
-  multiplied by data part count if pool is EC) are internally stored in large block
-  volumes sequentially, one after another, and leave garbage after deleting or resizing.
-  Defragmentator will be implemented to collect this garbage.
 - Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
  in theory you may hit it if you create and delete a very large number of files
 - Compaction of the key-value B-Tree. Current implementation never merges or deletes
@@ -139,6 +137,37 @@ Start network NFS server. Options:
 | `--port <PORT>` | use port \<PORT> for NFS services (default is 2049)        |
 | `--portmap 0`   | do not listen on port 111 (portmap/rpcbind, requires root) |

+### upgrade
+
+`vitastor-nfs --fs <NAME> upgrade`
+
+Upgrade FS metadata. Can be run online, but server(s) should be restarted after upgrade.
+
+### defrag
+
+`vitastor-nfs --fs <NAME> defrag [OPTIONS] [--dry-run]`
+
+Defragment volumes used for small file storage having more than \<defrag_percent> %
+of data removed. Can be run online.
+
+In VitastorFS, small files are stored in large "volumes" / "shared inodes" one
+after another. When you delete or extend such files, they are moved and garbage is left
+behind. Defragmentation removes garbage and moves data still in use to new volumes.
+
+Options:
+
+| <!-- -->                   | <!-- -->                                                                |
+|----------------------------|------------------------------------------------------------------------ |
+| `--volume_untouched 86400` | Defragment volumes last appended to at least this number of seconds ago |
+| `--defrag_percent 50`      | Defragment volumes with at least this % of removed data                 |
+| `--defrag_block_count 16`  | Read this number of pool blocks at once during defrag                   |
+| `--defrag_iodepth 16`      | Move up to this number of files in parallel during defrag               |
+| `--trace`                  | Print verbose defragmentation status                                    |
+| `--dry-run`                | Skip modifications, only print status                                   |
+| `--recalc-stats`           | Recalculate all volume statistics                                       |
+| `--include-empty`          | Include old and empty volumes; make sure to restart NFS servers before using it |
+| `--no-rm`                  | Move, but do not delete data                                            |
+
 ## Common options

 | <!-- -->           | <!-- -->                                                 |
--- a/docs/usage/nfs.ru.md
+++ b/docs/usage/nfs.ru.md
@@ -11,6 +11,8 @@
 Команды:
 - [mount](#mount)
 - [start](#start)
+- [upgrade](#upgrade)
+- [defrag](#defrag)

 ## Псевдо-ФС

@@ -88,11 +90,6 @@ JSON-формате :-). Для инспекции содержимого БД
 - Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)

 Другие недостающие функции, которые нужно добавить в будущем:
- Дефрагментация "общих инодов". На уровне реализации ФС файлы, меньшие, чем размер
-  объекта пула (block_size умножить на число частей данных, если пул EC),
-  упаковываются друг за другом в большие "общие" иноды/тома. Если такие файлы удалять
-  или увеличивать, они перемещаются и оставляют за собой "мусор", вот тут-то и нужен
-  дефрагментатор.
 - Переиспользование номеров инодов. В текущей реализации номера инодов всё время
  увеличиваются, так что в теории вы можете упереться в лимит, если насоздаёте
  и наудаляете больше, чем 2^48 файлов.
@@ -145,6 +142,40 @@ JSON-формате :-). Для инспекции содержимого БД
 | `--port <PORT>` | использовать порт \<PORT> для NFS-сервисов (по умолчанию 2049)        |
 | `--portmap 0`   | отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий) |

+### upgrade
+
+`vitastor-nfs --fs <NAME> upgrade`
+
+Обновить метаданные ФС. Можно запускать онлайн (при запущенных серверах NFS), но после выполнения их всё
+же желательно перезапустить.
+
+### defrag
+
+`vitastor-nfs --fs <NAME> defrag [OPTIONS] [--dry-run]`
+
+Дефрагментировать тома, используемые для хранения мелких файлов, в которых более, чем
+\<defrag_percent> процентов данных удалено. Можно запускать онлайн.
+
+На уровне реализации ФС файлы, меньшие, чем размер объекта пула (block_size умножить на число
+частей данных, если пул EC), упаковываются друг за другом в большие "тома" / "общие иноды".
+Когда такие файлы удаляются или увеличиваются, они перемещаются и оставляют за собой "мусор".
+
+При дефрагментации мусор удаляется, а всё ещё используемые данные перемещаются в новые тома.
+
+Опции:
+
+| <!-- -->                   | <!-- -->                                                                |
+|----------------------------|------------------------------------------------------------------------ |
+| `--volume_untouched 86400` | Дефрагментировать только тома, в которые уже не писали это число секунд |
+| `--defrag_percent 50`      | Дефрагментировать только тома, в которых этот % данных удалён           |
+| `--defrag_block_count 16`  | Читать это количество блоков пула за один раз                           |
+| `--defrag_iodepth 16`      | Перемещать одновременно до этого числа файлов                           |
+| `--trace`                  | Печатать детальную статистику дефрагментации                            |
+| `--dry-run`                | Не производить никаких изменений, только описать выполняемые действия   |
+| `--recalc-stats`           | Пересчитать и сохранить статистику всех томов                           |
+| `--include-empty`          | Дефрагментировать старые и пустые тома; обязательно перезапустите NFS-сервера после использования этой опции |
+| `--no-rm`                  | Перемещать, но не удалять данные                                        |
+
 ## Общие опции

 | <!-- -->           | <!-- -->                                                |
--- a/docs/usage/qemu.en.md
+++ b/docs/usage/qemu.en.md
@@ -151,9 +151,9 @@ Example performance comparison:
 To try VDUSE you need at least Linux 5.15, built with VDUSE support
 (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).

-Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
-use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
-or build modules for Debian kernel manually:
+Debian Linux kernels had these options disabled until 6.6, so make sure you install a newer kernel
+(from bookworm-backports, trixie or newer Debian version) if you want to try VDUSE. You can also
+build modules for an existing kernel manually:

 ```
 mkdir build
--- a/docs/usage/qemu.ru.md
+++ b/docs/usage/qemu.ru.md
@@ -154,9 +154,9 @@ VDUSE - на данный момент лучший интерфейс для п
 Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
 VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).

-В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
-на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
-из Proxmox или соберите модули для ядра Debian вручную:
+В ядрах в Debian Linux эти опции включены, только начиная с 6.6, так что установите свежее ядро
+из bookworm-backports, trixie или из более новой версии Debian, если хотите попробовать VDUSE.
+Либо же вы можете самостоятельно собрать модули для установленного ядра:

 ```
 mkdir build
--- a/mon/.eslintrc.js
+++ b/mon/.eslintrc.js
@@ -11,6 +11,7 @@ module.exports = {
        "ecmaVersion": 2020
    },
    "plugins": [
+        "import"
    ],
    "rules": {
        "indent": [
@@ -44,6 +45,10 @@ module.exports = {
        ],
        "node/shebang": [
            "off"
+        ],
+        "import/no-unresolved": [
+            2,
+            { "commonjs": true }
        ]
    }
 };
--- a/mon/antietcd_adapter.js
+++ b/mon/antietcd_adapter.js
@@ -0,0 +1,188 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+const AntiEtcd = require('antietcd');
+
+const vitastor_persist_filter = require('./vitastor_persist_filter.js');
+const { b64, local_ips } = require('./utils.js');
+
+// Glue between the monitor and an embedded Antietcd instance: optionally starts the
+// node, and exposes the watch / master election / API call operations on top of it.
+class AntiEtcdAdapter
+{
+    // Starts a local Antietcd node when config.use_antietcd is set and exactly one of
+    // the etcd_address entries (optionally narrowed by antietcd_port) is a local IP.
+    // Resolves to the started AntiEtcd instance, or to undefined when Antietcd is
+    // disabled or no address is local. Exits the process when several addresses match.
+    static async start_antietcd(config)
+    {
+        if (!config.use_antietcd)
+        {
+            return undefined;
+        }
+        let addresses = config.etcd_address;
+        if (!(addresses instanceof Array))
+        {
+            addresses = addresses ? (''+addresses).split(/,+/) : [];
+        }
+        // Reduce every URL to lowercase host[:port] and drop duplicates
+        const unique = {};
+        addresses.forEach((url) =>
+        {
+            const host_port = url.toLowerCase().replace(/^(https?:\/\/)/, '').replace(/\/.*$/, '');
+            unique[host_port] = true;
+        });
+        const cluster = Object.keys(unique);
+        const cfg_port = config.antietcd_port;
+        const is_local = {};
+        local_ips(true).forEach((addr) =>
+        {
+            is_local[addr] = true;
+        });
+        // Candidates are [ ip, port ] pairs that point to this host
+        const selected = [];
+        for (const node of cluster)
+        {
+            const parts = node.split(':', 2);
+            if (is_local[parts[0]] && (!cfg_port || parts[1] == cfg_port))
+            {
+                selected.push(parts);
+            }
+        }
+        if (selected.length > 1)
+        {
+            console.error('More than 1 etcd_address matches local IPs, please specify port');
+            process.exit(1);
+            return undefined;
+        }
+        if (selected.length != 1)
+        {
+            console.log('Antietcd is enabled, but etcd_address does not contain local IPs, proceeding without it');
+            return undefined;
+        }
+        const [ ip, port ] = selected[0];
+        const prefix = config.etcd_prefix || '/vitastor';
+        // A single-address setup runs without a peer list
+        let peers = null;
+        if (cluster.length != 1)
+        {
+            peers = {};
+            for (const node of cluster)
+            {
+                peers[node] = "http://"+node;
+            }
+        }
+        const antietcd_config = {
+            ip,
+            port,
+            data: config.antietcd_data_file || ((config.antietcd_data_dir || '/var/lib/vitastor') + '/mon_'+port+'.json.gz'),
+            persist_filter: vitastor_persist_filter({ vitastor_prefix: prefix }),
+            node_id: ip+':'+port, // node_id = ip:port
+            cluster: peers,
+            cluster_key: prefix,
+            stale_read: 1,
+            log_level: 1,
+        };
+        // Pass through antietcd_* options: they may add new settings, but of the
+        // settings computed above only `ip` and `cluster_key` may be overridden
+        for (const key in config)
+        {
+            if (!key.startsWith('antietcd_'))
+            {
+                continue;
+            }
+            const name = key.slice(9);
+            const overridable = name == 'ip' || name == 'cluster_key' || !(name in antietcd_config);
+            if (overridable)
+            {
+                antietcd_config[name] = config[key];
+            }
+        }
+        console.log('Starting Antietcd node '+antietcd_config.node_id);
+        const antietcd = new AntiEtcd(antietcd_config);
+        await antietcd.start();
+        return antietcd;
+    }
+
+    // mon: owning monitor object, antietcd: a started AntiEtcd instance.
+    // Subscribes to 'raftchange' so that become_master() can wait for leadership.
+    constructor(mon, antietcd)
+    {
+        this.mon = mon;
+        this.antietcd = antietcd;
+        // One-shot callbacks fired (and then dropped) when the node becomes leader
+        this.on_leader = [];
+        this.on_change = (st) =>
+        {
+            if (st.state !== 'leader')
+            {
+                return;
+            }
+            for (const notify of this.on_leader)
+            {
+                notify();
+            }
+            this.on_leader = [];
+        };
+        this.antietcd.on('raftchange', this.on_change);
+    }
+
+    // Nothing to parse here: the configuration is consumed in start_antietcd()
+    parse_config(/*config*/)
+    {
+    }
+
+    // Unsubscribes from raft state changes and cancels the active watch, if any.
+    // Cancellation errors are only logged.
+    stop_watcher()
+    {
+        this.antietcd.off('raftchange', this.on_change);
+        const active_watch = this.watch_id;
+        if (!active_watch)
+        {
+            return;
+        }
+        this.watch_id = null;
+        this.antietcd.cancel_watch(active_watch).catch(console.error);
+    }
+
+    // (Re)creates the watch over the whole etcd_prefix starting from
+    // mon.etcd_watch_revision; events are passed to mon.on_message() asynchronously.
+    async start_watcher()
+    {
+        if (this.watch_id)
+        {
+            await this.antietcd.cancel_watch(this.watch_id);
+            this.watch_id = null;
+        }
+        const deliver = (message) =>
+        {
+            setImmediate(() => this.mon.on_message(message.result));
+        };
+        const request = {
+            key: b64(this.mon.config.etcd_prefix+'/'),
+            range_end: b64(this.mon.config.etcd_prefix+'0'),
+            start_revision: ''+this.mon.etcd_watch_revision,
+            watch_id: 1,
+            progress_notify: true,
+        };
+        const new_watch = await this.antietcd.create_watch(request, deliver);
+        console.log('Successfully subscribed to antietcd revision '+this.antietcd.etctree.mod_revision);
+        this.watch_id = new_watch;
+    }
+
+    // Waits until the local node is the raft leader (in clustered mode) and then
+    // writes the monitor state to <etcd_prefix>/mon/master bound to the monitor lease.
+    async become_master()
+    {
+        if (this.antietcd.cluster)
+        {
+            console.log('Waiting to become master');
+            if (this.antietcd.cluster.raft.state !== 'leader')
+            {
+                await new Promise(ok => this.on_leader.push(ok));
+            }
+        }
+        else
+        {
+            console.log('Running in non-clustered mode');
+        }
+        const lease = ''+this.mon.etcd_lease_id;
+        const state = { ...this.mon.get_mon_state(), id: lease };
+        const master_key = b64(this.mon.config.etcd_prefix+'/mon/master');
+        const put_master = { requestPut: { key: master_key, value: b64(JSON.stringify(state)), lease: ''+this.mon.etcd_lease_id } };
+        await this.etcd_call('/kv/txn', { success: [ put_master ] }, this.mon.config.etcd_start_timeout, 0);
+        if (this.antietcd.cluster)
+        {
+            console.log('Became master');
+        }
+    }
+
+    // Calls the Antietcd API method derived from an etcd-style path ('/kv/txn' => 'kv_txn').
+    // timeout: minimum interval between consecutive attempts, in ms.
+    // retries: 0..1 means one attempt, a negative value means retry forever.
+    // Returns the API result; throws when the monitor is stopped or all attempts fail.
+    async etcd_call(path, body, timeout, retries)
+    {
+        if (retries >= 0 && retries < 1)
+        {
+            retries = 1;
+        }
+        let last_attempt = 0;
+        const since_last = () => Date.now()-last_attempt;
+        for (let attempt = 1; retries < 0 || attempt <= retries; attempt++)
+        {
+            if (this.mon.stopped)
+            {
+                throw new Error('Monitor instance is stopped');
+            }
+            try
+            {
+                if (since_last() < timeout)
+                {
+                    await new Promise(ok => setTimeout(ok, timeout-since_last()));
+                }
+                last_attempt = Date.now();
+                const method = path.replace(/^\/+/, '').replace(/\/+$/, '').replace(/\/+/g, '_');
+                const res = await this.antietcd.api(method, body);
+                if (!res.error)
+                {
+                    return res;
+                }
+                console.error('Failed to query antietcd '+path+' (retry '+attempt+'/'+retries+'): '+res.error);
+            }
+            catch (e)
+            {
+                console.error('Failed to query antietcd '+path+' (retry '+attempt+'/'+retries+'): '+e.stack);
+            }
+        }
+        throw new Error('Failed to query antietcd ('+retries+' retries)');
+    }
+}
+
+module.exports = AntiEtcdAdapter;
--- a/mon/etcd_adapter.js
+++ b/mon/etcd_adapter.js
@@ -0,0 +1,352 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+const http = require('http');
+const WebSocket = require('ws');
+const { b64, local_ips } = require('./utils.js');
+
+const MON_STOPPED = 'Monitor instance is stopped';
+
+// Talks to an external etcd cluster: API calls go over HTTP POST (see etcd_call()),
+// change notifications arrive over a websocket opened at <etcd url>/watch.
+// `mon` is the owning monitor object; this class reads mon.config, mon.stopped,
+// mon.etcd_watch_revision and mon.etcd_lease_id, and calls mon.on_message(),
+// mon.get_mon_state() and mon.die().
+class EtcdAdapter
+{
+    constructor(mon)
+    {
+        this.mon = mon;
+        // Current watch websocket (null when disconnected)
+        this.ws = null;
+        // Set on every received message, reset by the keepalive timer
+        this.ws_alive = false;
+        this.ws_keepalive_timer = null;
+    }
+
+    // Reads the etcd address list from config.etcd_address (or config.etcd_url)
+    parse_config(config)
+    {
+        this.parse_etcd_addresses(config.etcd_address||config.etcd_url);
+    }
+
+    // Normalizes the address list (array or comma-separated string) into full URLs
+    // and splits it into this.etcd_local (hosts matching a local IP) and this.etcd_urls.
+    // The scheme defaults to http, '/v3' is appended when the address has no path.
+    // Exits the process when no address is given.
+    parse_etcd_addresses(addrs)
+    {
+        const is_local_ip = local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
+        this.etcd_local = [];
+        this.etcd_urls = [];
+        this.selected_etcd_url = null;
+        this.etcd_urls_to_try = [];
+        if (!(addrs instanceof Array))
+            addrs = addrs ? (''+(addrs||'')).split(/,/) : [];
+        if (!addrs.length)
+        {
+            console.error('Vitastor etcd address(es) not specified. Please set on the command line or in the config file');
+            process.exit(1);
+        }
+        for (let url of addrs)
+        {
+            let scheme = 'http';
+            // Cut the scheme off, remembering it
+            url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
+            const slash = url.indexOf('/');
+            const colon = url.indexOf(':');
+            // Host part is everything before the first ':' or, failing that, the first '/'
+            const is_local = is_local_ip[colon >= 0 ? url.slice(0, colon) : (slash >= 0 ? url.slice(0, slash) : url)];
+            url = scheme+'://'+(slash >= 0 ? url : url+'/v3');
+            if (is_local)
+                this.etcd_local.push(url);
+            else
+                this.etcd_urls.push(url);
+        }
+    }
+
+    // Returns the currently selected etcd URL, selecting the next one when none is
+    // selected. Each round tries local addresses first, then the others in random order.
+    pick_next_etcd()
+    {
+        if (this.selected_etcd_url)
+            return this.selected_etcd_url;
+        if (!this.etcd_urls_to_try || !this.etcd_urls_to_try.length)
+        {
+            this.etcd_urls_to_try = [ ...this.etcd_local ];
+            const others = [ ...this.etcd_urls ];
+            while (others.length)
+            {
+                const url = others.splice(0|(others.length*Math.random()), 1);
+                this.etcd_urls_to_try.push(url[0]);
+            }
+        }
+        this.selected_etcd_url = this.etcd_urls_to_try.shift();
+        return this.selected_etcd_url;
+    }
+
+    // Closes the watch websocket, stops the keepalive timer and deselects the
+    // etcd URL if it is still <cur_addr> (default: the currently selected one).
+    stop_watcher(cur_addr)
+    {
+        cur_addr = cur_addr || this.selected_etcd_url;
+        if (this.ws)
+        {
+            console.log('Disconnected from etcd at '+this.ws_used_url);
+            this.ws.close();
+            this.ws = null;
+        }
+        if (this.ws_keepalive_timer)
+        {
+            clearInterval(this.ws_keepalive_timer);
+            this.ws_keepalive_timer = null;
+        }
+        if (this.selected_etcd_url == cur_addr)
+        {
+            this.selected_etcd_url = null;
+        }
+    }
+
+    // Drops the current watch connection and opens a new one; a failure to
+    // reconnect is passed to mon.die
+    restart_watcher(cur_addr)
+    {
+        this.stop_watcher(cur_addr);
+        this.start_watcher(this.mon.config.etcd_mon_retries).catch(this.mon.die);
+    }
+
+    // Opens the watch websocket, trying etcd addresses one after another (at most
+    // <retries> attempts, minimum 1), then subscribes to all keys under etcd_prefix
+    // starting from mon.etcd_watch_revision and starts the keepalive timer.
+    // Calls mon.die() when no connection could be established.
+    async start_watcher(retries)
+    {
+        let retry = 0;
+        // NOTE(review): negative values are coerced to 1 here, so unlike etcd_call()
+        // the `retries < 0` "retry forever" case below is never reached - confirm intent
+        if (!retries || retries < 1)
+        {
+            retries = 1;
+        }
+        // websocket base URL => time of the last attempt
+        const tried = {};
+        while (retries < 0 || retry < retries)
+        {
+            const cur_addr = this.pick_next_etcd();
+            // http://... => ws://..., https://... => wss://...
+            const base = 'ws'+cur_addr.slice(4);
+            let now = Date.now();
+            if (tried[base] && now-tried[base] < this.mon.config.etcd_start_timeout)
+            {
+                // Do not hammer the same address more often than once per etcd_start_timeout
+                await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout-(now-tried[base])));
+                now = Date.now();
+            }
+            tried[base] = now;
+            if (this.mon.stopped)
+            {
+                return;
+            }
+            const ok = await new Promise(ok =>
+            {
+                // Keep a local reference: the timer and handlers below must only ever
+                // touch the socket of THIS attempt, never a socket opened by a later one
+                const ws = new WebSocket(base+'/watch');
+                this.ws = ws;
+                this.ws_used_url = cur_addr;
+                let timer_id = setTimeout(() =>
+                {
+                    timer_id = null;
+                    if (this.ws === ws)
+                    {
+                        console.log('Disconnected from etcd at '+this.ws_used_url);
+                        ws.close();
+                        this.ws = null;
+                    }
+                    ok(false);
+                }, this.mon.config.etcd_mon_timeout);
+                const stop_timer = () =>
+                {
+                    if (timer_id)
+                    {
+                        clearTimeout(timer_id);
+                        timer_id = null;
+                    }
+                };
+                const fail = () =>
+                {
+                    // The attempt is over - the connect timeout must not fire anymore
+                    stop_timer();
+                    ok(false);
+                };
+                ws.on('error', fail);
+                ws.on('open', () =>
+                {
+                    ws.removeListener('error', fail);
+                    stop_timer();
+                    ok(true);
+                });
+            });
+            if (ok)
+                break;
+            if (this.selected_etcd_url == cur_addr)
+                this.selected_etcd_url = null;
+            this.ws = null;
+            retry++;
+        }
+        if (!this.ws)
+        {
+            this.mon.die('Failed to open etcd watch websocket');
+            return;
+        }
+        if (this.mon.stopped)
+        {
+            this.stop_watcher();
+            return;
+        }
+        const cur_addr = this.selected_etcd_url;
+        this.ws_alive = true;
+        // Every interval: if something was received since the previous tick, request
+        // a progress notification; otherwise consider the connection dead and restart it
+        this.ws_keepalive_timer = setInterval(() =>
+        {
+            if (this.ws_alive && this.ws)
+            {
+                this.ws_alive = false;
+                this.ws.send(JSON.stringify({ progress_request: {} }));
+            }
+            else
+            {
+                console.log('etcd websocket timed out, restarting it');
+                this.restart_watcher(cur_addr);
+            }
+        }, (Number(this.mon.config.etcd_ws_keepalive_interval) || 5)*1000);
+        this.ws.on('error', () => this.restart_watcher(cur_addr));
+        // [etcd_prefix+'/', etcd_prefix+'0') covers all keys under the prefix ('0' follows '/')
+        this.ws.send(JSON.stringify({
+            create_request: {
+                key: b64(this.mon.config.etcd_prefix+'/'),
+                range_end: b64(this.mon.config.etcd_prefix+'0'),
+                start_revision: ''+this.mon.etcd_watch_revision,
+                watch_id: 1,
+                progress_notify: true,
+            },
+        }));
+        this.ws.on('message', (msg) =>
+        {
+            if (this.mon.stopped)
+            {
+                this.stop_watcher();
+                return;
+            }
+            this.ws_alive = true;
+            let data;
+            try
+            {
+                data = JSON.parse(msg);
+            }
+            catch (e)
+            {
+                // Unparseable messages are reported as unknown below
+            }
+            if (!data || !data.result)
+            {
+                console.error('Unknown message received from watch websocket: '+msg);
+            }
+            else if (data.result.canceled)
+            {
+                // etcd watch canceled
+                if (data.result.compact_revision)
+                {
+                    // we may miss events if we proceed
+                    this.mon.die('Revisions before '+data.result.compact_revision+' were compacted by etcd, exiting');
+                }
+                this.mon.die('Watch canceled by etcd, reason: '+data.result.cancel_reason+', exiting');
+            }
+            else if (data.result.created)
+            {
+                // etcd watch created
+                console.log('Successfully subscribed to etcd at '+this.selected_etcd_url+', revision '+data.result.header.revision);
+            }
+            else
+            {
+                this.mon.on_message(data.result);
+            }
+        });
+    }
+
+    // Repeatedly tries to create <etcd_prefix>/mon/master (bound to the monitor
+    // lease) until the transaction succeeds, i.e. until the key does not exist yet.
+    async become_master()
+    {
+        const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id };
+        // eslint-disable-next-line no-constant-condition
+        while (1)
+        {
+            const res = await this.etcd_call('/kv/txn', {
+                // create_revision == 0 means "the key does not exist"
+                compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.mon.config.etcd_prefix+'/mon/master') } ],
+                success: [ { requestPut: { key: b64(this.mon.config.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.mon.etcd_lease_id } } ],
+            }, this.mon.config.etcd_start_timeout, 0);
+            if (res.succeeded)
+            {
+                break;
+            }
+            console.log('Waiting to become master');
+            await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout));
+        }
+        console.log('Became master');
+    }
+
+    // POSTs <body> as JSON to <selected etcd url><path>.
+    // timeout: per-request timeout and minimum interval between attempts to the
+    //   same address, in ms.
+    // retries: 0..1 means one attempt, a negative value means retry forever.
+    // Returns the parsed JSON reply. Throws when the monitor is stopped, when
+    // etcd replies with an error in the JSON body, or when all attempts fail.
+    async etcd_call(path, body, timeout, retries)
+    {
+        let retry = 0;
+        if (retries >= 0 && retries < 1)
+        {
+            retries = 1;
+        }
+        // etcd URL => time of the last attempt
+        const tried = {};
+        while (retries < 0 || retry < retries)
+        {
+            retry++;
+            const base = this.pick_next_etcd();
+            let now = Date.now();
+            if (tried[base] && now-tried[base] < timeout)
+            {
+                await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
+                now = Date.now();
+            }
+            tried[base] = now;
+            if (this.mon.stopped)
+            {
+                throw new Error(MON_STOPPED);
+            }
+            const res = await POST(base+path, body, timeout);
+            if (this.mon.stopped)
+            {
+                throw new Error(MON_STOPPED);
+            }
+            if (res.error)
+            {
+                // Transport-level failure: switch to another etcd on the next attempt
+                if (this.selected_etcd_url == base)
+                    this.selected_etcd_url = null;
+                console.error('Failed to query etcd '+path+' (retry '+retry+'/'+retries+'): '+res.error);
+                continue;
+            }
+            if (res.json)
+            {
+                if (res.json.error)
+                {
+                    // etcd itself rejected the request - retrying is pointless
+                    console.error(path+': etcd returned error: '+res.json.error);
+                    break;
+                }
+                return res.json;
+            }
+        }
+        throw new Error('Failed to query etcd ('+retries+' retries)');
+    }
+}
+
+// Sends <body> as a JSON POST request to <url>.
+// timeout: maximum time to wait for the response headers, in ms (<= 0 disables it).
+// Never rejects on request failures; resolves exactly once to one of:
+//   { response, json }        - HTTP 200 with a valid JSON body
+//   { error, code }           - non-200 status, error is the response body text
+//   { error, response, body } - HTTP 200 with a body that is not valid JSON
+//   { error }                 - transport error, premature close or 'timeout'
+function POST(url, body, timeout)
+{
+    return new Promise(ok =>
+    {
+        const body_text = Buffer.from(JSON.stringify(body));
+        let timer_id = null;
+        let settled = false;
+        const stop_timer = () =>
+        {
+            if (timer_id)
+            {
+                clearTimeout(timer_id);
+                timer_id = null;
+            }
+        };
+        // Settle once and always release the timer, so that a failed request
+        // does not leave a pending timeout behind
+        const finish = (result) =>
+        {
+            stop_timer();
+            if (!settled)
+            {
+                settled = true;
+                ok(result);
+            }
+        };
+        let req = http.request(url, { method: 'POST', headers: {
+            'Content-Type': 'application/json',
+            'Content-Length': body_text.length,
+        } }, (res) =>
+        {
+            if (!req)
+            {
+                // Already timed out
+                return;
+            }
+            // The timeout only covers the wait for the response headers
+            stop_timer();
+            let res_body = '';
+            res.setEncoding('utf8');
+            res.on('error', (error) => finish({ error }));
+            res.on('data', chunk => { res_body += chunk; });
+            res.on('end', () =>
+            {
+                if (res.statusCode != 200)
+                {
+                    finish({ error: res_body, code: res.statusCode });
+                    return;
+                }
+                try
+                {
+                    res_body = JSON.parse(res_body);
+                    finish({ response: res, json: res_body });
+                }
+                catch (e)
+                {
+                    finish({ error: e, response: res, body: res_body });
+                }
+            });
+        });
+        req.on('error', (error) => finish({ error }));
+        // 'close' also follows a completed response; finish() ignores it then
+        req.on('close', () => finish({ error: new Error('Connection closed prematurely') }));
+        if (timeout > 0)
+        {
+            timer_id = setTimeout(() =>
+            {
+                timer_id = null;
+                const timed_out = req;
+                req = null;
+                finish({ error: 'timeout' });
+                if (timed_out)
+                    timed_out.destroy();
+            }, timeout);
+        }
+        req.write(body_text);
+        req.end();
+    });
+}
+
+module.exports = EtcdAdapter;
--- a/mon/etcd_schema.js
+++ b/mon/etcd_schema.js
@@ -0,0 +1,396 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+// FIXME document all etcd keys and config variables in the form of JSON schema or similar
+// Keys (relative to the etcd prefix) flagged as "non-empty".
+// NOTE(review): presumably these must always hold a non-empty value - confirm
+// against the code that consumes this table.
+const etcd_nonempty_keys = Object.fromEntries([
+    'config/global',
+    'config/node_placement',
+    'config/pools',
+    'pg/config',
+    'history/last_clean_pgs',
+    'stats',
+].map(key => [ key, 1 ]));
+
+// Patterns of all allowed keys (relative to the etcd prefix).
+// Numeric IDs are written as [1-9]\d*, i.e. positive and without leading zeroes.
+const etcd_allowed_patterns = [
+    'config/global',
+    'config/node_placement',
+    'config/pools',
+    'config/osd/[1-9]\\d*',
+    'config/pgs', // old name
+    'pg/config',
+    'config/inode/[1-9]\\d*/[1-9]\\d*',
+    'osd/state/[1-9]\\d*',
+    'osd/stats/[1-9]\\d*',
+    'osd/inodestats/[1-9]\\d*',
+    'osd/space/[1-9]\\d*',
+    'mon/master',
+    'mon/member/[a-f0-9]+',
+    'pg/state/[1-9]\\d*/[1-9]\\d*',
+    'pg/stats/[1-9]\\d*/[1-9]\\d*', // old name
+    'pgstats/[1-9]\\d*/[1-9]\\d*',
+    'pg/history/[1-9]\\d*/[1-9]\\d*',
+    'history/last_clean_pgs',
+    'inode/stats/[1-9]\\d*/\\d+',
+    'pool/stats/[1-9]\\d*',
+    'stats',
+    'index/image/.*',
+    'index/maxid/[1-9]\\d*',
+];
+// Each pattern is anchored on both sides, so a key has to match one of them as a whole
+const etcd_allow = new RegExp(etcd_allowed_patterns.map(pattern => '^'+pattern+'$').join('|'));
+
+// Skeleton of the monitor's key tree. The only real code here is the nesting of
+// empty objects; the block comments sketch the expected shape of the values and
+// double as informal documentation of the keys matched by etcd_allow.
+const etcd_tree = {
+    config: {
+        /* global: {
+            // WARNING: NOT ALL OF THESE ARE ACTUALLY CONFIGURABLE HERE
+            // THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
+            // etcd connection
+            config_path: "/etc/vitastor/vitastor.conf",
+            etcd_prefix: "/vitastor",
+            // etcd connection - configurable online
+            etcd_address: "10.0.115.10:2379/v3",
+            // mon
+            etcd_mon_ttl: 5, // min: 1
+            etcd_mon_timeout: 1000, // ms. min: 0
+            etcd_mon_retries: 5, // min: 0
+            mon_change_timeout: 1000, // ms. min: 100
+            mon_retry_change_timeout: 50, // ms. min: 10
+            mon_stats_timeout: 1000, // ms. min: 100
+            osd_out_time: 600, // seconds. min: 0
+            placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
+            use_old_pg_combinator: false,
+            // client and osd
+            tcp_header_buffer_size: 65536,
+            use_sync_send_recv: false,
+            use_rdma: true,
+            rdma_device: null, // for example, "rocep5s0f0"
+            rdma_port_num: 1,
+            rdma_gid_index: 0,
+            rdma_mtu: 4096,
+            rdma_max_sge: 128,
+            rdma_max_send: 8,
+            rdma_max_recv: 16,
+            rdma_max_msg: 132096,
+            block_size: 131072,
+            disk_alignment: 4096,
+            bitmap_granularity: 4096,
+            immediate_commit: 'all', // 'none', 'all' or 'small'
+            // client - configurable online
+            client_max_dirty_bytes: 33554432,
+            client_max_dirty_ops: 1024,
+            client_enable_writeback: false,
+            client_max_buffered_bytes: 33554432,
+            client_max_buffered_ops: 1024,
+            client_max_writeback_iodepth: 256,
+            client_retry_interval: 50, // ms. min: 10
+            client_eio_retry_interval: 1000, // ms
+            client_retry_enospc: true,
+            osd_nearfull_ratio: 0.95,
+            // client and osd - configurable online
+            log_level: 0,
+            peer_connect_interval: 5, // seconds. min: 1
+            peer_connect_timeout: 5, // seconds. min: 1
+            osd_idle_timeout: 5, // seconds. min: 1
+            osd_ping_timeout: 5, // seconds. min: 1
+            max_etcd_attempts: 5,
+            etcd_quick_timeout: 1000, // ms
+            etcd_slow_timeout: 5000, // ms
+            etcd_keepalive_timeout: 30, // seconds, default is max(30, etcd_report_interval*2)
+            etcd_ws_keepalive_interval: 5, // seconds
+            // osd
+            etcd_report_interval: 5, // seconds
+            etcd_stats_interval: 30, // seconds
+            run_primary: true,
+            osd_network: null, // "192.168.7.0/24" or an array of masks
+            bind_address: "0.0.0.0",
+            bind_port: 0,
+            readonly: false,
+            osd_memlock: false,
+            // osd - configurable online
+            autosync_interval: 5,
+            autosync_writes: 128,
+            client_queue_depth: 128, // unused
+            recovery_queue_depth: 1,
+            recovery_sleep_us: 0,
+            recovery_tune_util_low: 0.1,
+            recovery_tune_client_util_low: 0,
+            recovery_tune_util_high: 1.0,
+            recovery_tune_client_util_high: 0.5,
+            recovery_tune_interval: 1,
+            recovery_tune_agg_interval: 10, // 10 times recovery_tune_interval
+            recovery_tune_sleep_min_us: 10, // 10 microseconds
+            recovery_pg_switch: 128,
+            recovery_sync_batch: 16,
+            no_recovery: false,
+            no_rebalance: false,
+            print_stats_interval: 3,
+            slow_log_interval: 10,
+            inode_vanish_time: 60,
+            auto_scrub: false,
+            no_scrub: false,
+            scrub_interval: '30d', // 1s/1m/1h/1d
+            scrub_queue_depth: 1,
+            scrub_sleep: 0, // milliseconds
+            scrub_list_limit: 1000, // objects to list on one scrub iteration
+            scrub_find_best: true,
+            scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterations
+            // blockstore - fixed in superblock
+            block_size,
+            disk_alignment,
+            journal_block_size,
+            meta_block_size,
+            bitmap_granularity,
+            journal_device,
+            journal_offset,
+            journal_size,
+            disable_journal_fsync,
+            data_device,
+            data_offset,
+            data_size,
+            disable_data_fsync,
+            meta_device,
+            meta_offset,
+            disable_meta_fsync,
+            disable_device_lock,
+            // blockstore - configurable offline
+            inmemory_metadata,
+            inmemory_journal,
+            journal_sector_buffer_count,
+            journal_no_same_sector_overwrites,
+            // blockstore - configurable online
+            max_write_iodepth,
+            min_flusher_count: 1,
+            max_flusher_count: 256,
+            throttle_small_writes: false,
+            throttle_target_iops: 100,
+            throttle_target_mbs: 100,
+            throttle_target_parallelism: 1,
+            throttle_threshold_us: 50,
+        }, */
+        global: {},
+        /* node_placement: {
+            host1: { level: 'host', parent: 'rack1' },
+            ...
+        }, */
+        node_placement: {},
+        /* pools: {
+            <id>: {
+                name: 'testpool',
+                // 'ec' uses Reed-Solomon-Vandermonde codes, 'jerasure' is an alias for 'ec'
+                scheme: 'replicated' | 'xor' | 'ec' | 'jerasure',
+                pg_size: 3,
+                pg_minsize: 2,
+                // number of parity chunks, required for EC
+                parity_chunks?: 1,
+                pg_count: 100,
+                // default is failure_domain=host
+                failure_domain?: 'host',
+                // additional failure domain rules; failure_domain=x is equivalent to x=123..N
+                level_placement?: 'dc=112233 host=123456',
+                raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
+                old_combinator: false,
+                max_osd_combinations: 10000,
+                // block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
+                block_size: 131072,
+                bitmap_granularity: 4096,
+                // 'all'/'small'/'none', same as in OSD options
+                immediate_commit: 'all',
+                pg_stripe_size: 0,
+                root_node?: 'rack1',
+                // restrict pool to OSDs having all of these tags
+                osd_tags?: 'nvme' | [ 'nvme', ... ],
+                // prefer to put primary on OSD with these tags
+                primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
+                // scrub interval
+                scrub_interval?: '30d',
+            },
+            ...
+        }, */
+        pools: {},
+        osd: {
+            /* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
+        },
+        /* inode: {
+            <pool_id>: {
+                <inode_t>: {
+                    name: string,
+                    size?: uint64_t, // bytes
+                    parent_pool?: <pool_id>,
+                    parent_id?: <inode_t>,
+                    readonly?: boolean,
+                }
+            }
+        }, */
+        inode: {},
+    },
+    osd: {
+        state: {
+            /* <osd_num_t>: {
+                state: "up",
+                addresses: string[],
+                host: string,
+                port: uint16_t,
+                primary_enabled: boolean,
+                blockstore_enabled: boolean,
+            }, */
+        },
+        stats: {
+            /* <osd_num_t>: {
+                time: number, // unix time
+                data_block_size: uint64_t, // bytes
+                bitmap_granularity: uint64_t, // bytes
+                immediate_commit: "all"|"small"|"none",
+                blockstore_ready: boolean,
+                size: uint64_t, // bytes
+                free: uint64_t, // bytes
+                host: string,
+                op_stats: {
+                    <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+                },
+                subop_stats: {
+                    <string>: { count: uint64_t, usec: uint64_t },
+                },
+                recovery_stats: {
+                    degraded: { count: uint64_t, bytes: uint64_t },
+                    misplaced: { count: uint64_t, bytes: uint64_t },
+                },
+            }, */
+        },
+        inodestats: {
+            /* <pool_id>: {
+                <inode_t>: {
+                    read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+                    write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+                    delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
+                },
+            }, */
+        },
+        space: {
+            /* <osd_num_t>: {
+                <pool_id>: {
+                    <inode_t>: uint64_t, // bytes
+                },
+            }, */
+        },
+    },
+    mon: {
+        master: {
+            /* ip: [ string ], id: uint64_t */
+        },
+        member: {
+            /* <uint64_t>: { ip: [ string ] }, */
+        },
+    },
+    pg: {
+        /* config: {
+            hash: string,
+            items: {
+                <pool_id>: {
+                    <pg_id>: {
+                        osd_set: [ 1, 2, 3 ],
+                        primary: 1,
+                        pause: false,
+                    }
+                }
+            }
+        }, */
+        config: {},
+        state: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    primary: osd_num_t,
+                    state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
+                        "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
+                        "has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
+                }
+            }, */
+        },
+        history: {
+            /* <pool_id>: {
+                <pg_id>: {
+                    osd_sets: osd_num_t[][],
+                    all_peers: osd_num_t[],
+                    epoch: uint64_t,
+                    next_scrub: uint64_t,
+                },
+            }, */
+        },
+    },
+    pgstats: {
+        /* <pool_id>: {
+            <pg_id>: {
+                object_count: uint64_t,
+                clean_count: uint64_t,
+                misplaced_count: uint64_t,
+                degraded_count: uint64_t,
+                incomplete_count: uint64_t,
+                write_osd_set: osd_num_t[],
+            },
+        }, */
+    },
+    inode: {
+        stats: {
+            /* <pool_id>: {
+                <inode_t>: {
+                    raw_used: uint64_t, // raw used bytes on OSDs
+                    read: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
+                    write: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
+                    delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
+                },
+            }, */
+        },
+    },
+    pool: {
+        stats: {
+            /* <pool_id>: {
+                used_raw_tb: float, // used raw space in the pool
+                total_raw_tb: float, // maximum amount of space in the pool
+                raw_to_usable: float, // raw to usable ratio
+                space_efficiency: float, // 0..1
+            } */
+        },
+    },
+    stats: {
+        /* op_stats: {
+            <string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
+        },
+        subop_stats: {
+            <string>: { count: uint64_t, usec: uint64_t, iops: uint64_t, lat: uint64_t },
+        },
+        recovery_stats: {
+            degraded: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
+            misplaced: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
+        },
+        object_counts: {
+            object: uint64_t,
+            clean: uint64_t,
+            misplaced: uint64_t,
+            degraded: uint64_t,
+            incomplete: uint64_t,
+        },
+        object_bytes: {
+            total: uint64_t,
+            clean: uint64_t,
+            misplaced: uint64_t,
+            degraded: uint64_t,
+            incomplete: uint64_t,
+        }, */
+    },
+    history: {
+        last_clean_pgs: {},
+    },
+    index: {
+        image: {
+            /* <name>: {
+                id: uint64_t,
+                pool_id: uint64_t,
+            }, */
+        },
+        maxid: {
+            /* <pool_id>: uint64_t, */
+        },
+    },
+};
+
+module.exports = {
+    etcd_nonempty_keys,
+    etcd_allow,
+    etcd_tree,
+};
--- a/mon/http_server.js
+++ b/mon/http_server.js
@@ -0,0 +1,50 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+const fsp = require('fs').promises;
+const http = require('http');
+const https = require('https');
+
+// Create the monitor HTTP(S) server and start listening.
+//
+// cfg fields read here:
+//   mon_https_cert, mon_https_key - paths to the certificate and key; HTTPS is used when mon_https_cert is set
+//   mon_https_ca                  - optional path to a CA bundle
+//   mon_https_client_auth         - when truthy, the server requests a client certificate
+//   mon_http_port, mon_http_ip    - listen address (port defaults to 8060)
+// handler is the request listener passed to createServer().
+//
+// Resolves to the listening server, or to null when listen() fails (the failure is logged).
+// Errors while reading certificate files are not caught here and reject the returned promise.
+async function create_http_server(cfg, handler)
+{
+    let server;
+    if (cfg.mon_https_cert)
+    {
+        const tls = {
+            key: await fsp.readFile(cfg.mon_https_key),
+            cert: await fsp.readFile(cfg.mon_https_cert),
+        };
+        if (cfg.mon_https_ca)
+        {
+            // The TLS option is called "ca"; unknown option names are silently ignored
+            tls.ca = await fsp.readFile(cfg.mon_https_ca);
+        }
+        if (cfg.mon_https_client_auth)
+        {
+            tls.requestCert = true;
+        }
+        server = https.createServer(tls, handler);
+    }
+    else
+    {
+        server = http.createServer(handler);
+    }
+    const listen_port = cfg.mon_http_port || 8060;
+    const listen_ip = cfg.mon_http_ip || undefined;
+    try
+    {
+        // listen() reports failures (like EADDRINUSE) asynchronously through the 'error' event,
+        // so wait until the server emits either 'listening' or 'error'
+        await new Promise((resolve, reject) =>
+        {
+            const on_error = (e) =>
+            {
+                server.removeListener('listening', on_listening);
+                reject(e);
+            };
+            const on_listening = () =>
+            {
+                server.removeListener('error', on_error);
+                resolve();
+            };
+            server.once('error', on_error);
+            server.once('listening', on_listening);
+            server.listen(listen_port, listen_ip);
+        });
+    }
+    catch (e)
+    {
+        console.error(
+            'HTTP server disabled because listen at address: '+
+            (cfg.mon_http_ip || '')+':'+listen_port+' failed with error: '+e
+        );
+        return null;
+    }
+    return server;
+}
+
+module.exports = { create_http_server };
--- a/mon/lp_optimizer/dsl_pgs.js
+++ b/mon/lp_optimizer/dsl_pgs.js
--- a/mon/lp_optimizer/lp_optimizer.js
+++ b/mon/lp_optimizer/lp_optimizer.js
--- a/mon/lp_optimizer/murmur3.js
+++ b/mon/lp_optimizer/murmur3.js
--- a/mon/lp_optimizer/simple_pgs.js
+++ b/mon/lp_optimizer/simple_pgs.js
--- a/mon/lp_optimizer/test-nonuniform.js
+++ b/mon/lp_optimizer/test-nonuniform.js
@@ -8,7 +8,7 @@
 // But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().

 const { SimpleCombinator } = require('./simple_pgs.js');
-const LPOptimizer = require('./lp-optimizer.js');
+const LPOptimizer = require('./lp_optimizer.js');

 const osd_tree = {
    ripper5: {
--- a/mon/lp_optimizer/test-optimize-simple.js
+++ b/mon/lp_optimizer/test-optimize-simple.js
@@ -2,7 +2,7 @@
 // License: VNPL-1.1 (see README.md for details)

 const { compat } = require('./simple_pgs.js');
-const LPOptimizer = require('./lp-optimizer.js');
+const LPOptimizer = require('./lp_optimizer.js');

 async function run()
 {
--- a/mon/lp_optimizer/test-optimize-undersized.js
+++ b/mon/lp_optimizer/test-optimize-undersized.js
@@ -2,7 +2,7 @@
 // License: VNPL-1.1 (see README.md for details)

 const { compat, flatten_tree } = require('./simple_pgs.js');
-const LPOptimizer = require('./lp-optimizer.js');
+const LPOptimizer = require('./lp_optimizer.js');

 const crush_tree = [
    { level: 1, children: [
--- a/mon/lp_optimizer/test-optimize-unfeasible.js
+++ b/mon/lp_optimizer/test-optimize-unfeasible.js
@@ -2,7 +2,7 @@
 // License: VNPL-1.1 (see README.md for details)

 const { compat } = require('./simple_pgs.js');
-const LPOptimizer = require('./lp-optimizer.js');
+const LPOptimizer = require('./lp_optimizer.js');

 const osd_tree = {
    100: {
--- a/mon/lp_optimizer/test-optimize.js
+++ b/mon/lp_optimizer/test-optimize.js
@@ -2,7 +2,7 @@
 // License: VNPL-1.1 (see README.md for details)

 const { compat, flatten_tree } = require('./simple_pgs.js');
-const LPOptimizer = require('./lp-optimizer.js');
+const LPOptimizer = require('./lp_optimizer.js');

 const osd_tree = {
    100: {
--- a/mon/lp_optimizer/test-parse-dsl.js
+++ b/mon/lp_optimizer/test-parse-dsl.js
--- a/mon/mon-main.js
+++ b/mon/mon-main.js
@@ -23,4 +23,4 @@ for (let i = 2; i < process.argv.length; i++)
    }
 }

-new Mon(options).start().catch(e => { console.error(e); process.exit(1); });
+Mon.run_forever(options).catch(console.error);
--- a/mon/mon.js
+++ b/mon/mon.js
--- a/mon/osd_tree.js
+++ b/mon/osd_tree.js
@@ -0,0 +1,215 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+// Build the flat placement tree (node id => node) from OSD statistics and node_placement.
+//
+// Returns { up_osds, levels, osd_tree }:
+//   up_osds  - { [osd_num]: true } for OSDs which have a state entry and a positive reweight
+//   levels   - global_config.placement_levels (or a new object) with 'host' and 'osd' defaults filled in
+//   osd_tree - nodes with id/level/parent; OSDs also get size (in terabytes) and optional tags,
+//              non-OSD nodes get an empty children array
+// Side effects: fills defaults into global_config.placement_levels when it is set and
+// forces level = 'osd' on node_placement entries with numeric IDs.
+function get_osd_tree(global_config, state)
+{
+    const levels = global_config.placement_levels||{};
+    levels.host = levels.host || 100;
+    levels.osd = levels.osd || 101;
+    const tree = {};
+    const up_osds = {};
+    // This requires monitor system time to be in sync with OSD system times (at least to some extent)
+    const down_time = Date.now()/1000 - global_config.osd_out_time;
+    const osd_nums = Object.keys(state.osd.stats).sort((a, b) => a - b);
+    for (const osd_num of osd_nums)
+    {
+        const stat = state.osd.stats[osd_num];
+        const osd_cfg = state.config.osd[osd_num];
+        let reweight = 1;
+        if (osd_cfg != null)
+        {
+            reweight = Number(osd_cfg.reweight);
+            if (reweight < 0 || isNaN(reweight))
+                reweight = 1;
+        }
+        // Skip OSDs without statistics or size and OSDs with zero weight
+        if (!stat || !stat.size || !reweight)
+        {
+            continue;
+        }
+        // Skip OSDs which are down for too long, unless they're marked as "noout"
+        if (!state.osd.state[osd_num] && !(Number(stat.time) >= down_time) && !(osd_cfg && osd_cfg.noout))
+        {
+            continue;
+        }
+        // Numeric IDs are reserved for OSDs
+        if (state.osd.state[osd_num] && reweight > 0)
+        {
+            // React to down OSDs immediately
+            up_osds[osd_num] = true;
+        }
+        const osd_node = tree[osd_num] = tree[osd_num] || {};
+        osd_node.id = osd_num;
+        osd_node.parent = osd_node.parent || stat.host;
+        osd_node.level = 'osd';
+        osd_node.size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
+        if (osd_cfg && osd_cfg.tags)
+        {
+            // Convert a single tag or a list of tags into a { [tag]: true } set
+            const tag_list = osd_cfg.tags instanceof Array ? [ ...osd_cfg.tags ] : [ osd_cfg.tags ];
+            const tag_set = {};
+            for (const tag of tag_list)
+            {
+                tag_set[tag] = true;
+            }
+            osd_node.tags = tag_set;
+        }
+        delete osd_node.children;
+        if (!tree[stat.host])
+        {
+            tree[stat.host] = {
+                id: stat.host,
+                level: 'host',
+                parent: null,
+                children: [],
+            };
+        }
+    }
+    const node_placement = state.config.node_placement||{};
+    for (const node_id in node_placement)
+    {
+        const node_cfg = node_placement[node_id];
+        if (/^\d+$/.exec(node_id))
+        {
+            node_cfg.level = 'osd';
+        }
+        const node_level = node_cfg.level;
+        // All nodes must have non-empty IDs and valid levels
+        // OSDs have to actually exist
+        const usable = node_id && node_level && levels[node_level] &&
+            (node_level !== 'osd' || tree[node_id]);
+        if (!usable)
+        {
+            continue;
+        }
+        const node = tree[node_id] = tree[node_id] || {};
+        node.id = node_id;
+        node.level = node_level;
+        node.parent = node_cfg.parent;
+        if (node_level !== 'osd')
+        {
+            node.children = [];
+        }
+    }
+    return { up_osds, levels, osd_tree: tree };
+}
+
+// Convert a flat placement tree (node id => node) into a hierarchical one.
+//
+// Returns a shallow copy of <tree> where every node is also copied and gets a
+// children array filled with references to its child nodes. Nodes without a usable
+// parent are attached to the virtual root node stored under the '' key.
+// OSDs with zero/negative/missing size are not attached anywhere, and non-OSD nodes
+// left without children are removed from the result.
+// Side effect: fills 'host' and 'osd' defaults into global_config.placement_levels when it is set.
+function make_hier_tree(global_config, tree)
+{
+    const levels = global_config.placement_levels||{};
+    levels.host = levels.host || 100;
+    levels.osd = levels.osd || 101;
+    tree = { ...tree };
+    for (const node_id in tree)
+    {
+        tree[node_id] = { ...tree[node_id], children: [] };
+    }
+    tree[''] = { children: [] };
+    for (const node_id in tree)
+    {
+        if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
+        {
+            continue;
+        }
+        const node_cfg = tree[node_id];
+        const node_level = levels[node_cfg.level] || node_cfg.level;
+        let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
+            && tree[node_cfg.parent].level;
+        parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
+        // Parent's level must be less than child's; OSDs must be leaves
+        const parent = parent_level && parent_level < node_level ? node_cfg.parent : '';
+        tree[parent].children.push(tree[node_id]);
+    }
+    // Delete empty nodes
+    let deleted = 0;
+    do
+    {
+        deleted = 0;
+        for (const node_id in tree)
+        {
+            if (tree[node_id].level !== 'osd' && (!tree[node_id].children || !tree[node_id].children.length))
+            {
+                const parent = tree[node_id].parent;
+                // The configured parent may be absent from the tree (never defined or already
+                // deleted as empty), in that case there is nothing to detach the node from
+                if (parent && tree[parent] && tree[parent].children)
+                {
+                    tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
+                }
+                deleted++;
+                delete tree[node_id];
+            }
+        }
+    } while (deleted > 0);
+    return tree;
+}
+
+// Restrict <pool_tree> (modified in place) to <root_node>, its ancestors and its descendants.
+// Does nothing when <root_node> is empty.
+function filter_osds_by_root_node(global_config, pool_tree, root_node)
+{
+    if (!root_node)
+    {
+        return;
+    }
+    const hier_tree = make_hier_tree(global_config, pool_tree);
+    const root = hier_tree[root_node] || {};
+    // Collect all descendants of the root node by expanding children in place
+    const descendants = [ ...(root.children || []) ];
+    for (let pos = 0; pos < descendants.length; pos++)
+    {
+        const children = descendants[pos].children;
+        if (children)
+        {
+            descendants.splice(pos+1, 0, ...children);
+        }
+    }
+    // Collect the root node itself and all its ancestors
+    const ancestors = [];
+    for (let cur = pool_tree[root_node] || {}; cur && cur.id; cur = pool_tree[cur.parent||''])
+    {
+        ancestors.unshift(cur);
+    }
+    const keep = {};
+    for (const node of [ ...ancestors, ...descendants ])
+    {
+        keep[node.id||''] = true;
+    }
+    for (const item in pool_tree)
+    {
+        if (!keep[item])
+        {
+            delete pool_tree[item];
+        }
+    }
+}
+
+// Remove OSD nodes which lack at least one of the requested tags from <orig_tree> (in place).
+// <tags> may be a single tag or an array of tags; nothing is done when it's empty.
+// Nodes of other levels are left untouched.
+function filter_osds_by_tags(orig_tree, tags)
+{
+    if (!tags)
+    {
+        return;
+    }
+    const required = tags instanceof Array ? tags : [ tags ];
+    for (const osd in orig_tree)
+    {
+        const node = orig_tree[osd];
+        if (node.level !== 'osd')
+        {
+            continue;
+        }
+        const osd_tags = node.tags;
+        if (required.some(tag => !osd_tags || !osd_tags[tag]))
+        {
+            delete orig_tree[osd];
+        }
+    }
+}
+
+// Remove OSD nodes whose reported block layout doesn't match the pool from <orig_tree> (in place).
+// An OSD is removed when its statistics report:
+// - a different bs_block_size or bitmap_granularity, or
+// - immediate_commit 'small' while the pool requires 'all', or
+// - immediate_commit 'none' while the pool requires anything else.
+// OSDs without statistics and OSDs which don't report a parameter are kept.
+function filter_osds_by_block_layout(orig_tree, osd_stats, block_size, bitmap_granularity, immediate_commit)
+{
+    const is_mismatched = (osd_stat) =>
+    {
+        if (!osd_stat)
+            return false;
+        if (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size)
+            return true;
+        if (osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity)
+            return true;
+        if (osd_stat.immediate_commit == 'small' && immediate_commit == 'all')
+            return true;
+        return osd_stat.immediate_commit == 'none' && immediate_commit != 'none';
+    };
+    for (const osd in orig_tree)
+    {
+        if (orig_tree[osd].level === 'osd' && is_mismatched(osd_stats[osd]))
+        {
+            delete orig_tree[osd];
+        }
+    }
+}
+
+// Get the set of OSDs preferred as primary for a pool.
+// Without pool_cfg.primary_affinity_tags it's <up_osds> itself (the same object),
+// otherwise it's a new { [osd_num]: true } object with up OSDs having all of these tags.
+function get_affinity_osds(pool_cfg, up_osds, osd_tree)
+{
+    if (!pool_cfg.primary_affinity_tags)
+    {
+        return up_osds;
+    }
+    // Take tree nodes of up OSDs and filter them by tags
+    const aff_osds = {};
+    for (const osd_num of Object.keys(up_osds))
+    {
+        aff_osds[osd_num] = osd_tree[osd_num];
+    }
+    filter_osds_by_tags(aff_osds, pool_cfg.primary_affinity_tags);
+    for (const osd_num in aff_osds)
+    {
+        aff_osds[osd_num] = true;
+    }
+    return aff_osds;
+}
+
+module.exports = {
+    get_osd_tree,
+    make_hier_tree,
+    filter_osds_by_root_node,
+    filter_osds_by_tags,
+    filter_osds_by_block_layout,
+    get_affinity_osds,
+};
--- a/mon/package.json
+++ b/mon/package.json
@@ -1,25 +1,24 @@
 {
  "name": "vitastor-mon",
-  "version": "1.6.1",
+  "version": "1.9.3",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "lint": "eslint *.js lp_optimizer/*.js scripts/*.js"
  },
  "author": "Vitaliy Filippov",
  "license": "UNLICENSED",
  "dependencies": {
+    "antietcd": "^1.1.0",
    "sprintf-js": "^1.1.2",
    "ws": "^7.2.5"
  },
  "devDependencies": {
    "eslint": "^8.0.0",
+    "eslint-plugin-import": "^2.29.1",
    "eslint-plugin-node": "^11.1.0"
  },
  "engines": {
    "node": ">=12.0.0"
-  },
-  "scripts": {
-    "lint": "eslint *.js"
  }
 }
--- a/mon/pg_gen.js
+++ b/mon/pg_gen.js
@@ -0,0 +1,267 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+const { RuleCombinator } = require('./lp_optimizer/dsl_pgs.js');
+const { SimpleCombinator, flatten_tree } = require('./lp_optimizer/simple_pgs.js');
+const { validate_pool_cfg, get_pg_rules } = require('./pool_config.js');
+const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
+const { scale_pg_count } = require('./pg_utils.js');
+const { make_hier_tree, filter_osds_by_root_node,
+    filter_osds_by_tags, filter_osds_by_block_layout, get_affinity_osds } = require('./osd_tree.js');
+
+// State of the deterministic pseudo-random generator used to pick primary OSDs
+let seed;
+
+// Restart the pseudo-random sequence from its fixed initial state
+function reset_rng()
+{
+    seed = 0x5f020e43;
+}
+
+// Advance the generator with three shift-and-xor steps (13, 17, 5) on a signed 32-bit
+// integer and return the new state shifted into the non-negative range [0, 2^32).
+// The right shift is the sign-propagating one (>>), so results depend on it
+function rng()
+{
+    seed = seed ^ (seed << 13);
+    seed = seed ^ (seed >> 17);
+    seed = seed ^ (seed << 5);
+    return 2147483648 + seed;
+}
+
+// Choose the primary OSD for a PG pseudo-randomly among alive OSDs of <osd_set>.
+// Candidate groups are tried in order of preference and the first non-empty one is used:
+// - replicated pools: "affinity" OSDs, then any up OSDs
+// - other pools: affinity data OSDs, any affinity OSDs, up data OSDs, any up OSDs
+// Returns 0 when <osd_set> has no alive OSDs. Consumes one rng() value otherwise.
+function pick_primary(pool_config, osd_set, up_osds, aff_osds)
+{
+    const select_alive = (osds, allowed) => osds.filter(osd_num => osd_num && allowed[osd_num]);
+    let attempts;
+    if (pool_config.scheme === 'replicated')
+    {
+        // Prefer "affinity" OSDs
+        attempts = [ [ osd_set, aff_osds ], [ osd_set, up_osds ] ];
+    }
+    else
+    {
+        // Prefer data OSDs for EC because they can actually read something without an additional network hop
+        const pg_data_size = (pool_config.pg_size||0) - (pool_config.parity_chunks||0);
+        const data_osds = osd_set.slice(0, pg_data_size);
+        attempts = [
+            [ data_osds, aff_osds ],
+            [ osd_set, aff_osds ],
+            [ data_osds, up_osds ],
+            [ osd_set, up_osds ],
+        ];
+    }
+    let alive_set = [];
+    for (const [ osds, allowed ] of attempts)
+    {
+        alive_set = select_alive(osds, allowed);
+        if (alive_set.length)
+            break;
+    }
+    if (!alive_set.length)
+    {
+        return 0;
+    }
+    return alive_set[rng() % alive_set.length];
+}
+
+// Re-select primary OSDs for all PGs of all valid pools using current up/affinity OSD sets.
+//
+// Returns a deep copy of state.pg.config with updated primaries if at least one PG
+// has to change its primary (every change is logged), or undefined if nothing changes.
+// state.pg.config itself is not modified.
+function recheck_primary(state, global_config, up_osds, osd_tree)
+{
+    let new_pg_config;
+    // PG configuration may have no "items" at all when no PGs are configured yet
+    const all_pg_items = state.pg.config.items || {};
+    for (const pool_id in state.config.pools)
+    {
+        const pool_cfg = state.config.pools[pool_id];
+        if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
+        {
+            continue;
+        }
+        const aff_osds = get_affinity_osds(pool_cfg, up_osds, osd_tree);
+        // Primary selection is pseudo-random, restart the sequence for every pool
+        reset_rng();
+        const pool_pgs = all_pg_items[pool_id];
+        if (!pool_pgs)
+        {
+            continue;
+        }
+        for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
+        {
+            const pg_cfg = pool_pgs[pg_num];
+            if (!pg_cfg)
+            {
+                continue;
+            }
+            const new_primary = pick_primary(pool_cfg, pg_cfg.osd_set, up_osds, aff_osds);
+            if (pg_cfg.primary != new_primary)
+            {
+                if (!new_pg_config)
+                {
+                    // Copy the configuration lazily, only when something really changes
+                    new_pg_config = JSON.parse(JSON.stringify(state.pg.config));
+                }
+                console.log(
+                    `Moving pool ${pool_id} (${pool_cfg.name || 'unnamed'}) PG ${pg_num}`+
+                    ` primary OSD from ${pg_cfg.primary} to ${new_primary}`
+                );
+                new_pg_config.items[pool_id][pg_num].primary = new_primary;
+            }
+        }
+    }
+    return new_pg_config;
+}
+
+// Put new PG configuration of one pool into <save_to> and add PG history updates to the etcd transaction.
+//
+// - save_to.items[pool_id] is replaced with { [pg_num]: { osd_set, primary } } built from <new_pgs>
+//   (or deleted when <new_pgs> is empty); LPOptimizer.NO_OSD is stored as 0
+// - previous OSD sets from <prev_pgs> which differ from the new ones and aren't empty
+//   are appended to pg_history[i].osd_sets; osd_sets are deduplicated
+// - for every PG index present in <new_pgs> or <prev_pgs>, request.compare gets a check that
+//   the history key is not modified at or after <etcd_watch_revision>, and request.success
+//   gets a put (when pg_history[i] is set) or a delete of that key
+// Modifies save_to, request and pg_history in place and returns nothing.
+function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
+{
+    const aff_osds = get_affinity_osds(state.config.pools[pool_id] || {}, up_osds, osd_tree);
+    const pg_items = {};
+    reset_rng();
+    new_pgs.forEach((raw_osd_set, i) =>
+    {
+        const osd_set = raw_osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
+        const primary = pick_primary(state.config.pools[pool_id], osd_set, up_osds, aff_osds);
+        pg_items[i+1] = { osd_set, primary };
+        const old_set = prev_pgs[i];
+        if (old_set && old_set.join(' ') != osd_set.join(' ') && old_set.some(osd_num => osd_num))
+        {
+            // Remember the previous non-empty OSD set in PG history
+            const hist = pg_history[i] = pg_history[i] || {};
+            hist.osd_sets = hist.osd_sets || [];
+            hist.osd_sets.push(old_set);
+        }
+        if (pg_history[i] && pg_history[i].osd_sets)
+        {
+            // Remove duplicate OSD sets
+            const uniq_sets = {};
+            pg_history[i].osd_sets.forEach(set => { uniq_sets[set.join(' ')] = set; });
+            pg_history[i].osd_sets = Object.values(uniq_sets);
+        }
+    });
+    const max_pg_count = Math.max(new_pgs.length, prev_pgs.length);
+    for (let i = 0; i < max_pg_count; i++)
+    {
+        // FIXME: etcd has max_txn_ops limit, and it's 128 by default
+        // Sooo we probably want to change our storage scheme for PG histories...
+        const history_key = b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1));
+        request.compare.push({
+            key: history_key,
+            target: 'MOD',
+            mod_revision: ''+etcd_watch_revision,
+            result: 'LESS',
+        });
+        request.success.push(pg_history[i]
+            ? { requestPut: { key: history_key, value: b64(JSON.stringify(pg_history[i])) } }
+            : { requestDeleteRange: { key: history_key } });
+    }
+    save_to.items = save_to.items || {};
+    if (new_pgs.length)
+    {
+        save_to.items[pool_id] = pg_items;
+    }
+    else
+    {
+        delete save_to.items[pool_id];
+    }
+}
+
+// Generate (or re-optimize) the PG distribution for one pool.
+//
+// Resolves to null when the pool configuration doesn't pass validate_pool_cfg(),
+// otherwise to { pool_id, pgs, stats } where <pgs> is optimize_result.int_pgs returned by
+// LPOptimizer and <stats> contains total_raw_tb, pg_real_size, raw_to_usable and space_efficiency.
+// Also prints pool name and change statistics to the console.
+async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels)
+{
+    const pool_cfg = state.config.pools[pool_id];
+    if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
+    {
+        return null;
+    }
+    // Filters delete entries in place, so work on a shallow copy of the OSD tree
+    let pool_tree = { ...osd_tree };
+    filter_osds_by_root_node(global_config, pool_tree, pool_cfg.root_node);
+    filter_osds_by_tags(pool_tree, pool_cfg.osd_tags);
+    filter_osds_by_block_layout(
+        pool_tree,
+        state.osd.stats,
+        pool_cfg.block_size || global_config.block_size || 131072,
+        pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096,
+        pool_cfg.immediate_commit || global_config.immediate_commit || 'all'
+    );
+    pool_tree = make_hier_tree(global_config, pool_tree);
+    // First try last_clean_pgs to minimize data movement
+    let prev_pgs = [];
+    for (const pg in ((state.history.last_clean_pgs.items||{})[pool_id]||{}))
+    {
+        // PG numbers are 1-based, array indexes are 0-based
+        prev_pgs[pg-1] = [ ...state.history.last_clean_pgs.items[pool_id][pg].osd_set ];
+    }
+    if (!prev_pgs.length)
+    {
+        // Fall back to pg/config if it's empty
+        for (const pg in ((state.pg.config.items||{})[pool_id]||{}))
+        {
+            prev_pgs[pg-1] = [ ...state.pg.config.items[pool_id][pg].osd_set ];
+        }
+    }
+    // NOTE(review): prev_pgs may contain holes if PG numbers aren't contiguous - TODO confirm it can't happen
+    const old_pg_count = prev_pgs.length;
+    const optimize_cfg = {
+        // OSD number => OSD size for all OSDs left in the pool tree
+        osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
+        // The old combinator is only used when it's requested globally and the pool has no placement rules
+        combinator: !global_config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement
+            // new algorithm:
+            ? new RuleCombinator(pool_tree, get_pg_rules(pool_id, pool_cfg, global_config.placement_levels), pool_cfg.max_osd_combinations)
+            // old algorithm:
+            : new SimpleCombinator(flatten_tree(pool_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
+        pg_count: pool_cfg.pg_count,
+        pg_size: pool_cfg.pg_size,
+        pg_minsize: pool_cfg.pg_minsize,
+        ordered: pool_cfg.scheme != 'replicated',
+    };
+    let optimize_result;
+    // Re-shuffle PGs if pg/config.hash is empty
+    if (old_pg_count > 0 && state.pg.config.hash)
+    {
+        if (prev_pgs.length != pool_cfg.pg_count)
+        {
+            // Scale PG count
+            // Do it even if old_pg_count is already equal to pool_cfg.pg_count,
+            // because last_clean_pgs may still contain the old number of PGs
+            // NOTE(review): old_pg_count equals prev_pgs.length here, so the condition above
+            // seems to contradict this comment - TODO confirm the intent
+            scale_pg_count(prev_pgs, pool_cfg.pg_count);
+        }
+        // Pad previous OSD sets with zeroes up to the current pg_size
+        for (const pg of prev_pgs)
+        {
+            while (pg.length < pool_cfg.pg_size)
+            {
+                pg.push(0);
+            }
+        }
+        optimize_result = await LPOptimizer.optimize_change({
+            prev_pgs,
+            ...optimize_cfg,
+        });
+    }
+    else
+    {
+        optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
+    }
+    console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
+    LPOptimizer.print_change_stats(optimize_result);
+    // Effective PG size is the smallest non-zero number of real OSDs among all generated PGs
+    let pg_effsize = pool_cfg.pg_size;
+    for (const pg of optimize_result.int_pgs)
+    {
+        const this_pg_size = pg.filter(osd => osd != LPOptimizer.NO_OSD).length;
+        if (this_pg_size && this_pg_size < pg_effsize)
+        {
+            pg_effsize = this_pg_size;
+        }
+    }
+    return {
+        pool_id,
+        pgs: optimize_result.int_pgs,
+        stats: {
+            total_raw_tb: optimize_result.space,
+            pg_real_size: pg_effsize || pool_cfg.pg_size,
+            // Effective PG size divided by 1 for replicated pools or by the number of data chunks for other schemes
+            raw_to_usable: (pg_effsize || pool_cfg.pg_size) / (pool_cfg.scheme === 'replicated'
+                ? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))),
+            space_efficiency: optimize_result.space/(optimize_result.total_space||1),
+        },
+    };
+}
+
+// Encode a string as base64 (etcd JSON API expects base64-encoded keys and values)
+function b64(str)
+{
+    const bytes = Buffer.from(str);
+    return bytes.toString('base64');
+}
+
+module.exports = {
+    recheck_primary,
+    save_new_pgs_txn,
+    generate_pool_pgs,
+};
--- a/mon/pg_utils.js
+++ b/mon/pg_utils.js
--- a/mon/pool_config.js
+++ b/mon/pool_config.js
@@ -0,0 +1,169 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+const { parse_level_indexes, parse_pg_dsl } = require('./lp_optimizer/dsl_pgs.js');
+
+// Normalize and validate a pool configuration.
+//
+// Converts pg_size, pg_minsize, parity_chunks, pg_count and max_osd_combinations of <pool_cfg>
+// to integers in place (parity_chunks becomes undefined when it's zero or not a number,
+// max_osd_combinations defaults to 10000), then checks pool ID, coding scheme, sizes,
+// name, root_node, tags and placement rules.
+// Returns true when the pool is valid and false otherwise.
+// The reason is printed to the console only when <warn> is true.
+function validate_pool_cfg(pool_id, pool_cfg, placement_levels, warn)
+{
+    // Report the problem (if requested) and return the "invalid" result
+    const invalid = (problem) =>
+    {
+        if (warn)
+            console.log(problem);
+        return false;
+    };
+    // Tags must be a string or an array of strings when specified
+    const bad_tags = (tags) => tags && typeof(tags) != 'string' &&
+        (!(tags instanceof Array) || tags.some(t => typeof t != 'string'));
+    pool_cfg.pg_size = Math.floor(pool_cfg.pg_size);
+    pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
+    pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
+    pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
+    pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
+    if (!/^[1-9]\d*$/.exec(''+pool_id))
+    {
+        return invalid('Pool ID '+pool_id+' is invalid');
+    }
+    if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' &&
+        pool_cfg.scheme !== 'ec' && pool_cfg.scheme !== 'jerasure')
+    {
+        return invalid('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated", "ec" and "jerasure" required)');
+    }
+    if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
+        pool_cfg.scheme !== 'replicated' && pool_cfg.pg_size < 3)
+    {
+        return invalid('Pool '+pool_id+' has invalid pg_size');
+    }
+    if (!pool_cfg.pg_minsize || pool_cfg.pg_minsize < 1 || pool_cfg.pg_minsize > pool_cfg.pg_size ||
+        pool_cfg.scheme === 'xor' && pool_cfg.pg_minsize < (pool_cfg.pg_size - 1))
+    {
+        return invalid('Pool '+pool_id+' has invalid pg_minsize');
+    }
+    if (pool_cfg.scheme === 'xor' && pool_cfg.parity_chunks != 0 && pool_cfg.parity_chunks != 1)
+    {
+        return invalid('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
+    }
+    // The range check is written as a negated positive condition so that
+    // a missing (undefined) parity_chunks is rejected too: comparisons with undefined are always false
+    if ((pool_cfg.scheme === 'ec' || pool_cfg.scheme === 'jerasure') &&
+        !(pool_cfg.parity_chunks >= 1 && pool_cfg.parity_chunks <= pool_cfg.pg_size-2))
+    {
+        return invalid('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
+    }
+    if (!pool_cfg.pg_count || pool_cfg.pg_count < 1)
+    {
+        return invalid('Pool '+pool_id+' has invalid pg_count');
+    }
+    if (!pool_cfg.name)
+    {
+        return invalid('Pool '+pool_id+' has empty name');
+    }
+    if (pool_cfg.max_osd_combinations < 100)
+    {
+        return invalid('Pool '+pool_id+' has invalid max_osd_combinations (must be at least 100)');
+    }
+    if (pool_cfg.root_node && typeof(pool_cfg.root_node) != 'string')
+    {
+        return invalid('Pool '+pool_id+' has invalid root_node (must be a string)');
+    }
+    if (bad_tags(pool_cfg.osd_tags))
+    {
+        return invalid('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
+    }
+    if (bad_tags(pool_cfg.primary_affinity_tags))
+    {
+        return invalid('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
+    }
+    // Placement rules print their own warnings, but only when the caller asked for them
+    if (!get_pg_rules(pool_id, pool_cfg, placement_levels, warn))
+    {
+        return false;
+    }
+    return true;
+}
+
+// Build PG placement rules for a pool.
+//
+// Rule sources in the order of priority:
+// 1) pool_cfg.level_placement - either an object { [level]: string | array } or a string of
+//    whitespace-separated "level=value" pairs. A rule for the failure domain (or 'host') requiring
+//    all items to differ is always added. Returns the result of parse_level_indexes(), or null
+//    when a level is unknown, a value has wrong type or its length isn't equal to pg_size
+// 2) pool_cfg.raw_placement (string) - returns the result of parse_pg_dsl(), or undefined
+//    when parsing throws
+// 3) otherwise - default rules: every next OSD must differ from all previous ones
+//    in the failure domain (or 'host')
+// Problems are printed to the console only when <warn> is true.
+// Side effect: in case (1) 'host' and 'osd' defaults are filled into <placement_levels> when it is set.
+function get_pg_rules(pool_id, pool_cfg, placement_levels, warn)
+{
+    if (pool_cfg.level_placement)
+    {
+        const pg_size = (0|pool_cfg.pg_size);
+        let rules;
+        if (typeof pool_cfg.level_placement === 'string')
+        {
+            rules = {};
+            for (const pair of pool_cfg.level_placement.split(/\s+/))
+            {
+                const [ level, value ] = pair.split(/=/, 2);
+                rules[level] = value;
+            }
+        }
+        else
+        {
+            rules = { ...pool_cfg.level_placement };
+        }
+        // Always add failure_domain to prevent rules from being totally incorrect
+        rules[pool_cfg.failure_domain || 'host'] = Array.from({ length: pg_size }, (_, idx) => idx+1);
+        placement_levels = placement_levels||{};
+        placement_levels.host = placement_levels.host || 100;
+        placement_levels.osd = placement_levels.osd || 101;
+        for (const k in rules)
+        {
+            const rule = rules[k];
+            const is_valid_type = typeof rule === 'string' || (rule instanceof Array &&
+                !rule.some(s => typeof s !== 'string' && typeof s !== 'number'));
+            if (!placement_levels[k] || !is_valid_type)
+            {
+                if (warn)
+                    console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
+                return null;
+            }
+            if (rule.length != pg_size)
+            {
+                if (warn)
+                    console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
+                return null;
+            }
+        }
+        return parse_level_indexes(rules);
+    }
+    if (typeof pool_cfg.raw_placement === 'string')
+    {
+        try
+        {
+            return parse_pg_dsl(pool_cfg.raw_placement);
+        }
+        catch (e)
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
+        }
+        // Nothing is returned (undefined) when raw_placement can't be parsed
+        return;
+    }
+    // Default rules: item N must be in a different failure domain than items 1..N-1
+    const default_rules = [ [] ];
+    for (let i = 1; i < pool_cfg.pg_size; i++)
+    {
+        const previous = Array.from({ length: i }, (_, idx) => idx+1);
+        default_rules.push([ [ pool_cfg.failure_domain||'host', '!=', previous ] ]);
+    }
+    return default_rules;
+}
+
+module.exports = {
+    validate_pool_cfg,
+    get_pg_rules,
+};
--- a/mon/prometheus.js
+++ b/mon/prometheus.js
@@ -0,0 +1,220 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1 (see README.md for details)
+
+// Static HELP/TYPE preamble which is put at the beginning of the Prometheus text output.
+// The text exposition format allows at most one HELP and one TYPE line per metric name,
+// so each metric below has exactly one of each. Counters are values which only grow,
+// everything that may also decrease (sizes, statuses, ratios, PG counts) is a gauge.
+const metric_help =
+`# HELP vitastor_object_bytes Total size of objects in cluster in bytes
+# TYPE vitastor_object_bytes gauge
+# HELP vitastor_object_count Total number of objects in cluster
+# TYPE vitastor_object_count gauge
+# HELP vitastor_stat_count Total operation count
+# TYPE vitastor_stat_count counter
+# HELP vitastor_stat_usec Total operation latency in usec
+# TYPE vitastor_stat_usec counter
+# HELP vitastor_stat_bytes Total operation size in bytes
+# TYPE vitastor_stat_bytes counter
+
+# HELP vitastor_image_raw_used Image raw used size in bytes
+# TYPE vitastor_image_raw_used gauge
+# HELP vitastor_image_stat_count Per-image total operation count
+# TYPE vitastor_image_stat_count counter
+# HELP vitastor_image_stat_usec Per-image total operation latency
+# TYPE vitastor_image_stat_usec counter
+# HELP vitastor_image_stat_bytes Per-image total operation size in bytes
+# TYPE vitastor_image_stat_bytes counter
+
+# HELP vitastor_osd_status OSD up/down status
+# TYPE vitastor_osd_status gauge
+# HELP vitastor_osd_size_bytes OSD total space in bytes
+# TYPE vitastor_osd_size_bytes gauge
+# HELP vitastor_osd_free_bytes OSD free space in bytes
+# TYPE vitastor_osd_free_bytes gauge
+# HELP vitastor_osd_stat_count Per-OSD total operation count
+# TYPE vitastor_osd_stat_count counter
+# HELP vitastor_osd_stat_usec Per-OSD total operation latency
+# TYPE vitastor_osd_stat_usec counter
+# HELP vitastor_osd_stat_bytes Per-OSD total operation size in bytes
+# TYPE vitastor_osd_stat_bytes counter
+
+# HELP vitastor_monitor_info Monitor info, 1 is master, 0 is standby
+# TYPE vitastor_monitor_info gauge
+
+# HELP vitastor_pool_info Pool configuration (in labels)
+# TYPE vitastor_pool_info gauge
+# HELP vitastor_pool_status Pool up/down status
+# TYPE vitastor_pool_status gauge
+# HELP vitastor_pool_raw_to_usable Raw to usable space ratio
+# TYPE vitastor_pool_raw_to_usable gauge
+# HELP vitastor_pool_space_efficiency Pool space usage efficiency
+# TYPE vitastor_pool_space_efficiency gauge
+# HELP vitastor_pool_total_raw_tb Total raw space in pool in TB
+# TYPE vitastor_pool_total_raw_tb gauge
+# HELP vitastor_pool_used_raw_tb Used raw space in pool in TB
+# TYPE vitastor_pool_used_raw_tb gauge
+# HELP vitastor_pg_count PG counts by state
+# TYPE vitastor_pg_count gauge
+
+`;
+
+// Builds the text metrics payload from the monitor state object <st>.
+// The result starts with the metric_help preamble which is followed by sample lines for
+// global statistics, per-image statistics, per-OSD statistics, monitors and pools
+// (pool info, pool space statistics, PG counts by state and pool up/down status).
+// Names, hostnames, IPs, pool scheme and failure domain are escaped with addslashes(),
+// all other label values are interpolated as is.
+// Returns the whole payload as a single string.
+function export_prometheus_metrics(st)
+{
+    // Output is collected in chunks and glued together at the end
+    const out = [ metric_help ];
+    const stat_keys = [ 'count', 'usec', 'bytes' ];
+
+    // Global statistics
+
+    for (const obj_type in st.stats.object_bytes)
+    {
+        out.push(`vitastor_object_bytes{object_type="${obj_type}"} ${st.stats.object_bytes[obj_type]}\n`);
+    }
+
+    for (const obj_type in st.stats.object_counts)
+    {
+        out.push(`vitastor_object_count{object_type="${obj_type}"} ${st.stats.object_counts[obj_type]}\n`);
+    }
+
+    for (const typ of [ 'op', 'subop', 'recovery' ])
+    {
+        const by_op = st.stats[typ+"_stats"]||{};
+        for (const op in by_op)
+        {
+            const op_stat = by_op[op];
+            for (const key of stat_keys)
+            {
+                out.push(`vitastor_stat_${key}{op="${op}",op_type="${typ}"} ${op_stat[key]||0}\n`);
+            }
+        }
+    }
+
+    // Per-image statistics
+
+    for (const pool in st.inode.stats)
+    {
+        const pool_inodes = st.inode.stats[pool];
+        for (const inode in pool_inodes)
+        {
+            const ist = pool_inodes[inode];
+            // Images without configuration get an empty name
+            const inode_cfg = (st.config.inode[pool]||{})[inode]||{};
+            const inode_name = inode_cfg.name||'';
+            const inode_label = `image_name="${addslashes(inode_name)}",inode_num="${inode}",pool_id="${pool}"`;
+            out.push(`vitastor_image_raw_used{${inode_label}} ${ist.raw_used||0}\n`);
+            for (const op of [ 'read', 'write', 'delete' ])
+            {
+                if (!ist[op])
+                {
+                    // No statistics for this operation type
+                    continue;
+                }
+                for (const k of stat_keys)
+                {
+                    out.push(`vitastor_image_stat_${k}{${inode_label},op="${op}"} ${ist[op][k]||0}\n`);
+                }
+            }
+        }
+    }
+
+    // Per-OSD statistics
+
+    for (const osd in st.osd.stats)
+    {
+        const osd_stat = st.osd.stats[osd];
+        const osd_state = st.osd.state[osd];
+        let up = 0;
+        if (osd_state && osd_state.state == 'up')
+        {
+            up = 1;
+        }
+        out.push(`vitastor_osd_status{host="${addslashes(osd_stat.host)}",osd_num="${osd}"} ${up}\n`);
+        out.push(`vitastor_osd_size_bytes{osd_num="${osd}"} ${osd_stat.size||0}\n`);
+        out.push(`vitastor_osd_free_bytes{osd_num="${osd}"} ${osd_stat.free||0}\n`);
+        // Primary operations first, then sub-operations
+        for (const op_type of [ 'op', 'subop' ])
+        {
+            const per_op = osd_stat[op_type+'_stats'];
+            for (const op in per_op)
+            {
+                const ist = per_op[op];
+                for (const k of stat_keys)
+                {
+                    out.push(`vitastor_osd_stat_${k}{osd_num="${osd}",op="${op}",op_type="${op_type}"} ${ist[k]||0}\n`);
+                }
+            }
+        }
+    }
+
+    // Monitor statistics
+
+    for (const mon_id in st.mon.member)
+    {
+        const mon = st.mon.member[mon_id];
+        // Loose comparison is intentional: mon_id is an object key, i.e. always a string
+        let master = 0;
+        if (st.mon.master && st.mon.master.id == mon_id)
+        {
+            master = 1;
+        }
+        // Only the first address is reported when the monitor has several
+        const first_ip = mon.ip instanceof Array ? mon.ip[0] : mon.ip;
+        const ip = first_ip || '';
+        out.push(`vitastor_monitor_info{monitor_hostname="${addslashes(mon.hostname)}",monitor_id="${mon_id}",monitor_ip="${addslashes(ip)}"} ${master}\n`);
+    }
+
+    // Per-pool statistics
+
+    for (const pool_id in st.config.pools)
+    {
+        const pool_cfg = st.config.pools[pool_id];
+        const pool_label = `pool_id="${pool_id}",pool_name="${addslashes(pool_cfg.name)}"`;
+        const pool_stat = st.pool.stats[pool_id];
+        const info_labels = [
+            pool_label,
+            `pool_scheme="${addslashes(pool_cfg.scheme)}"`,
+            `pg_size="${pool_cfg.pg_size||0}"`,
+            `pg_minsize="${pool_cfg.pg_minsize||0}"`,
+            `parity_chunks="${pool_cfg.parity_chunks||0}"`,
+            `pg_count="${pool_cfg.pg_count||0}"`,
+            `failure_domain="${addslashes(pool_cfg.failure_domain)}"`,
+        ].join(',');
+        out.push(`vitastor_pool_info{${info_labels}} 1\n`);
+        if (!pool_stat)
+        {
+            // Only pool_info is exported for pools without statistics
+            continue;
+        }
+        for (const field of [ 'raw_to_usable', 'space_efficiency', 'total_raw_tb', 'used_raw_tb' ])
+        {
+            out.push(`vitastor_pool_${field}{${pool_label}} ${pool_stat[field]||0}\n`);
+        }
+
+        // PG states and pool up/down status
+        // Number of configured PGs is preferred, pg_count from the pool configuration is the fallback
+        const pg_items = ((st.pg.config||{}).items||{})[pool_id]||{};
+        const real_pg_count = Object.keys(pg_items).length || (0|pool_cfg.pg_count);
+        // All known states are pre-filled with zeroes so they're always present in the output
+        const per_state = {};
+        for (const known_state of [
+            'active', 'starting', 'peering', 'incomplete', 'repeering', 'stopping', 'offline',
+            'degraded', 'has_inconsistent', 'has_corrupted', 'has_incomplete', 'has_degraded',
+            'has_misplaced', 'has_unclean', 'has_invalid', 'left_on_dead', 'scrubbing',
+        ])
+        {
+            per_state[known_state] = 0;
+        }
+        const pool_pg_states = st.pg.state[pool_id] || {};
+        for (let pg_num = 1; pg_num <= real_pg_count; pg_num++)
+        {
+            const pg = pool_pg_states[pg_num];
+            // PGs without a reported state are counted as offline
+            for (const st_name of (pg ? pg.state : [ 'offline' ]))
+            {
+                per_state[st_name] = 1 + (per_state[st_name]|0);
+            }
+        }
+        for (const st_name in per_state)
+        {
+            out.push(`vitastor_pg_count{pg_state="${st_name}",${pool_label}} ${per_state[st_name]}\n`);
+        }
+        // Pool is reported as up only when every PG is active
+        const pool_active = per_state['active'] >= real_pg_count ? 1 : 0;
+        out.push(`vitastor_pool_status{${pool_label}} ${pool_active}\n`);
+    }
+
+    return out.join('');
+}
+
+// Escapes <str> for usage as a Prometheus label value.
+// Falsy values (undefined, null, '', 0...) become an empty string, other values are converted to a string.
+// Double quote and backslash are prefixed with a backslash. Line feed is replaced with
+// the 2-character sequence \n - just prefixing it with a backslash would leave a raw
+// line break inside the label value and break the line-based exposition format.
+// Returns the escaped string.
+function addslashes(str)
+{
+    return ((str||'')+'').replace(/["\n\\]/g, c => (c === '\n' ? '\\n' : '\\'+c)); // escape " \n \
+}
+
+// Public API of this module: the Prometheus text metrics exporter
+module.exports = {
+    export_prometheus_metrics: export_prometheus_metrics,
+};
--- a/mon/scripts/90-vitastor.rules
+++ b/mon/scripts/90-vitastor.rules
--- a/mon/scripts/Vitastor-Grafana-6+.json
+++ b/mon/scripts/Vitastor-Grafana-6+.json
--- a/mon/scripts/afr.js
+++ b/mon/scripts/afr.js
--- a/mon/scripts/afr_test.js
+++ b/mon/scripts/afr_test.js
--- a/mon/scripts/make-etcd
+++ b/mon/scripts/make-etcd
--- a/Show More
+++ b/Show More