Save cached_read_* in superblock

Document cached_read_{data,meta,journal} parameters
Support using Linux page cache for reads
2023-07-17 00:34:58 +03:00 · 2023-07-17 00:34:58 +03:00 · 2023-07-17 00:34:58 +03:00 · 2023-07-17 00:34:58 +03:00 · 2023-07-17 00:34:58 +03:00 · 2023-07-15 02:34:20 +03:00
376 changed files with 5408 additions and 28396 deletions
--- a/.gitea/workflows/buildenv.Dockerfile
+++ b/.gitea/workflows/buildenv.Dockerfile
@@ -22,7 +22,7 @@ RUN apt-get update
 RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
    liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
 RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
-RUN apt-get -y install jq lp-solve sudo nfs-common
+RUN apt-get -y install jq lp-solve sudo
 RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`

 RUN set -ex; \
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@@ -64,13 +64,6 @@ jobs:
    # leak sanitizer sometimes crashes
    - run: cd /root/vitastor/build && ASAN_OPTIONS=detect_leaks=0 make -j16 test

-  npm_lint:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - run: cd /root/vitastor/mon && npm run lint
-
  test_add_osd:
    runs-on: ubuntu-latest
    needs: build
@@ -402,7 +395,7 @@ jobs:
    steps:
    - name: Run test
      id: test
-      timeout-minutes: 6
+      timeout-minutes: 3
      run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh
    - name: Print logs
      if: always() && steps.test.outcome == 'failure'
@@ -539,42 +532,6 @@ jobs:
          echo ""
        done

-  test_root_node:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_root_node.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_switch_primary:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_switch_primary.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
  test_write:
    runs-on: ubuntu-latest
    needs: build
@@ -773,96 +730,6 @@ jobs:
          echo ""
        done

-  test_osd_tags:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_osd_tags.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_enospc:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_enospc.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_enospc_xor:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: SCHEME=xor /root/vitastor/tests/test_enospc.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_enospc_imm:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_enospc.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
-  test_enospc_imm_xor:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: IMMEDIATE_COMMIT=1 SCHEME=xor /root/vitastor/tests/test_enospc.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
  test_scrub:
    runs-on: ubuntu-latest
    needs: build
@@ -971,21 +838,3 @@ jobs:
          echo ""
        done

-  test_nfs:
-    runs-on: ubuntu-latest
-    needs: build
-    container: ${{env.TEST_IMAGE}}:${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: 3
-      run: /root/vitastor/tests/test_nfs.sh
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- $i --------"
-          cat $i
-          echo ""
-        done
-
--- a/.gitea/workflows/tests-to-yaml.pl
+++ b/.gitea/workflows/tests-to-yaml.pl
@@ -39,10 +39,6 @@ for my $line (<>)
                $test_name .= '_'.lc($1).'_'.$2;
            }
        }
-        if ($test_name eq 'test_snapshot_chain_ec')
-        {
-            $timeout = 6;
-        }
        $line =~ s!\./test_!/root/vitastor/tests/test_!;
        # Gitea CI doesn't support artifacts yet, lol
        #- name: Upload results
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,16 @@
 package-lock.json
 fio
 qemu
+osd
+stub_osd
+stub_uring_osd
+stub_bench
+osd_test
+osd_peering_pg_test
+dump_journal
+nbd_proxy
+rm_inode
+test_allocator
+test_blockstore
+test_shit
+osd_rmw_test
--- a/CLA-en.md
+++ b/CLA-en.md
@@ -1,115 +0,0 @@
-## Contributor License Agreement
-
-> This Agreement is made in the Russian and English languages. **The English
-text of Agreement is for informational purposes only** and is not binding
-for the Parties.
->
-> In the event of a conflict between the provisions of the Russian and
-English versions of this Agreement, the **Russian version shall prevail**.
->
-> Russian version is published at https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md
-
-This document represents the offer of Filippov Vitaliy Vladimirovich
-("Author"), author and copyright holder of Vitastor software ("Program"),
-acknowledged by a certificate of Federal Service for Intellectual
-Property of Russian Federation (Rospatent) # 2021617829 dated 20 May 2021,
-to "Contributors" to conclude this license agreement as follows
-("Agreement" or "Offer").
-
-In accordance with Art. 435, Art. 438 of the Civil Code of the Russian
-Federation, this Agreement is an offer and in case of acceptance of the
-offer, an agreement is considered concluded on the conditions specified
-in the offer.
-
-1. Applicable Terms. \
-   1.1. "Official Repository" shall mean the computer storage, operated by
-        the Author, containing all prior and future versions of the Source
-        Code of the Program, at Internet addresses https://git.yourcmc.ru/vitalif/vitastor/
-        or https://github.com/vitalif/vitastor/. \
-   1.2. "Contributions" shall mean results of intellectual activity
-        (including, but not limited to, source code, libraries, components,
-        texts, documentation) which can be software or elements of the software
-        and which are provided by Contributors to the Author for inclusion
-        in the Program. \
-   1.3. "Contributor" shall mean a person who provides Contributions to
-        the Author and agrees with all provisions of this Agreement.
-        A Сontributor can be: 1) an individual; or 2) a legal entity or an
-        individual entrepreneur in case when an individual provides Contributions
-        on behalf of third parties, including on behalf of his employer.
-
-2. Subject of the Agreement. \
-   2.1. Subject of the Agreement shall be the Contributions sent to the Author by Contributors. \
-   2.2. The Contributor grants to the Author the right to use Contributions at his own
-        discretion and without any necessity to get a prior approval from Contributor or
-        any other third party in any way, under a simple (non-exclusive), royalty-free,
-        irrevocable license throughout the world by all means not contrary to law, in whole
-        or as a part of the Program, or other open-source or closed-source computer programs,
-        products or services (hereinafter -- the "License"), including, but not limited to: \
-        2.2.1. to execute Contributions and use them for any tasks; \
-        2.2.2. to publish and distribute Contributions in modified or unmodified form and/or to rent them; \
-        2.2.3. to modify Contributions, add comments, illustrations or any explanations to Contributions while using them; \
-        2.2.4. to create other results of intellectual activity based on Contributions, including derivative works and composite works; \
-        2.2.5. to translate Contributions into other languages, including other programming languages; \
-        2.2.6. to carry out rental and public display of Contributions; \
-        2.2.7. to use Contributions under the trade name and/or any trademark or any other label, or without it, as the Author thinks fit; \
-   2.3. The Contributor grants to the Author the right to sublicense any of the aforementioned
-        rights to third parties on any terms at the Author's discretion. \
-   2.4. The License is provided for the entire duration of Contributor's
-        exclusive intellectual property rights to the Contributions. \
-   2.5. The Contributor grants to the Author the right to decide how and where to mention,
-        or to not mention at all, the fact of his authorship, name, nickname and/or company
-        details when including Contributions into the Program or in any other computer
-        programs, products or services.
-
-3. Acceptance of the Offer \
-   3.1. The Contributor may provide Contributions to the Author in the form of
-        a "Pull Request" in an Official Repository of the Program or by any
-        other electronic means of communication, including, but not limited to,
-        E-mail or messenger applications. \
-   3.2. The acceptance of the Offer shall be the fact of provision of Contributions
-        to the Author by the Contributor by any means with the following remark:
-        “I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md”
-        or “Я принимаю соглашение Vitastor CLA: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md”. \
-   3.3. Date of acceptance of the Offer shall be the date of such provision.
-
-4. Rights and obligations of the parties. \
-   4.1. The Contributor reserves the right to use Contributions by any lawful means
-        not contrary to this Agreement. \
-   4.2. The Author has the right to refuse to include Contributions into the Program
-        at any moment with no explanation to the Contributor.
-
-5. Representations and Warranties. \
-   5.1. The person providing Contributions for the purpose of their inclusion
-        in the Program represents and warrants that he is the Contributor
-        or legally acts on the Contributor's behalf. Name or company details
-        of the Contributor shall be provided with the Contribution at the moment
-        of their provision to the Author. \
-   5.2. The Contributor represents and warrants that he legally owns exclusive
-        intellectual property rights to the Contributions. \
-   5.3. The Contributor represents and warrants that any further use of
-        Contributions by the Author as provided by Contributor under the terms
-        of the Agreement does not infringe on intellectual and other rights and
-        legitimate interests of third parties. \
-   5.4. The Contributor represents and warrants that he has all rights and legal
-        capacity needed to accept this Offer; \
-   5.5. The Contributor represents and warrants that Contributions don't
-        contain malware or any information considered illegal under the law
-        of Russian Federation.
-
-6. Termination of the Agreement \
-   6.1. The Agreement may be terminated at will of both Author and Contributor,
-        formalised in the written form or if the Agreement is terminated on
-        reasons prescribed by the law of Russian Federation.
-
-7. Final Clauses \
-   7.1. The Contributor may optionally sign the Agreement in the written form. \
-   7.2. The Agreement is deemed to become effective from the Date of signing of
-        the Agreement and until the expiration of Contributor's exclusive
-        intellectual property rights to the Contributions. \
-   7.3. The Author may unilaterally alter the Agreement without informing Contributors.
-        The new version of the document shall come into effect 3 (three) days after
-        being published in the Official Repository of the Program at Internet address
-        [https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md).
-        Contributors should keep informed about the actual version of the Agreement themselves. \
-   7.4. If the Author and the Contributor fail to agree on disputable issues,
-        disputes shall be referred to the Moscow Arbitration court.
--- a/CLA-ru.md
+++ b/CLA-ru.md
@@ -1,108 +0,0 @@
-## Лицензионное соглашение с участником
-
-> Данная Оферта написана в Русской и Английской версиях. **Версия на английском
-языке предоставляется в информационных целях** и не связывает стороны договора.
->
-> В случае несоответствий между положениями Русской и Английской версий Договора,
-**Русская версия имеет приоритет**.
->
-> Английская версия опубликована по адресу https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md
-
-Настоящий договор-оферта (далее по тексту – Оферта, Договор) адресована физическим
-и юридическим лицам (далее – Участникам) и является официальным публичным предложением
-Филиппова Виталия Владимировича (далее – Автора) программного обеспечения Vitastor,
-свидетельство Федеральной службы по интеллектуальной собственности (Роспатент) № 2021617829
-от 20 мая 2021 г. (далее – Программа) о нижеследующем:
-
-1. Термины и определения \
-   1.1. Репозиторий – электронное хранилище, содержащее исходный код Программы. \
-   1.2. Доработка – результат интеллектуальной деятельности Участника, включающий
-        в себя изменения или дополнения к исходному коду Программы, которые Участник
-        желает включить в состав Программы для дальнейшего использования и распространения
-        Автором и для этого направляет их Автору. \
-   1.3. Участник – физическое или юридическое лицо, вносящее Доработки в код Программы. \
-   1.4. ГК РФ – Гражданский кодекс Российской Федерации.
-
-2. Предмет оферты \
-   2.1. Предметом настоящей оферты являются Доработки, отправляемые Участником Автору. \
-   2.2. Участник предоставляет Автору право использовать Доработки по собственному усмотрению
-        и без необходимости предварительного согласования с Участником или иным третьим лицом
-        на условиях простой (неисключительной) безвозмездной безотзывной лицензии, полностью
-        или фрагментарно, в составе Программы или других программ, продуктов или сервисов
-        как с открытым, так и с закрытым исходным кодом, любыми способами, не противоречащими
-        закону, включая, но не ограничиваясь следующими: \
-        2.2.1. Запускать и использовать Доработки для выполнения любых задач; \
-        2.2.2. Распространять, импортировать и доводить Доработки до всеобщего сведения; \
-        2.2.3. Вносить в Доработки изменения, сокращения и дополнения, снабжать Доработки
-               при их использовании комментариями, иллюстрациями или пояснениями; \
-        2.2.4. Создавать на основе Доработок иные результаты интеллектуальной деятельности,
-               в том числе производные и составные произведения; \
-        2.2.5. Переводить Доработки на другие языки, в том числе на другие языки программирования; \
-        2.2.6. Осуществлять прокат и публичный показ Доработок; \
-        2.2.7. Использовать Доработки под любым фирменным наименованием, товарным знаком
-               (знаком обслуживания) или иным обозначением, или без такового. \
-   2.3. Участник предоставляет Автору право сублицензировать полученные права на Доработки
-        третьим лицам на любых условиях на усмотрение Автора. \
-   2.4. Участник предоставляет Автору права на Доработки на территории всего мира. \
-   2.5. Участник предоставляет Автору права на весь срок действия исключительного права
-        Участника на Доработки. \
-   2.6. Участник предоставляет Автору права на Доработки на безвозмездной основе. \
-   2.7. Участник разрешает Автору самостоятельно определять порядок, способ и
-        место указания его имени, реквизитов и/или псевдонима при включении
-        Доработок в состав Программы или других программ, продуктов или сервисов.
-
-3. Акцепт Оферты \
-   3.1. Участник может передавать Доработки в адрес Автора через зеркала официального
-        Репозитория Программы по адресам https://git.yourcmc.ru/vitalif/vitastor/ или
-        https://github.com/vitalif/vitastor/ в виде “запроса на слияние” (pull request),
-        либо в письменном виде или с помощью любых других электронных средств коммуникации,
-        например, электронной почты или мессенджеров. \
-   3.2. Факт передачи Участником Доработок в адрес Автора любым способом с одной из пометок
-        “I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md”
-        или “Я принимаю соглашение Vitastor CLA: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md”
-        является полным и безоговорочным акцептом (принятием) Участником условий настоящей
-        Оферты, т.е. Участник считается ознакомившимся с настоящим публичным договором и
-        в соответствии с ГК РФ признается лицом, вступившим с Автором в договорные отношения
-        на основании настоящей Оферты. \
-   3.3. Датой акцептирования настоящей Оферты считается дата такой передачи.
-
-4. Права и обязанности Сторон \
-   4.1. Участник сохраняет за собой право использовать Доработки любым законным
-        способом, не противоречащим настоящему Договору. \
-   4.2. Автор вправе отказать Участнику во включении Доработок в состав
-        Программы без объяснения причин в любой момент по своему усмотрению.
-
-5. Гарантии и заверения \
-   5.1. Лицо, направляющее Доработки для целей их включения в состав Программы,
-        гарантирует, что является Участником или представителем Участника. Имя или реквизиты
-        Участника должны быть указаны при их передаче в адрес Автора Программы. \
-   5.2. Участник гарантирует, что является законным обладателем исключительных прав
-        на Доработки. \
-   5.3. Участник гарантирует, что на момент акцептирования настоящей Оферты ему
-        ничего не известно (и не могло быть известно) о правах третьих лиц на
-        передаваемые Автору Доработки или их часть, которые могут быть нарушены
-        в связи с передачей Доработок по настоящему Договору. \
-   5.4. Участник гарантирует, что является дееспособным лицом и обладает всеми
-        необходимыми правами для заключения Договора. \
-   5.5. Участник гарантирует, что Доработки не содержат вредоносного ПО, а также
-        любой другой информации, запрещённой к распространению по законам Российской
-        Федерации.
-
-6. Прекращение действия оферты \
-   6.1. Действие настоящего договора может быть прекращено по соглашению сторон,
-        оформленному в письменном виде, а также вследствие его расторжения по основаниям,
-        предусмотренным законом.
-
-7. Заключительные положения \
-   7.1. Участник вправе по желанию подписать настоящий Договор в письменном виде. \
-   7.2. Настоящий договор действует с момента его заключения и до истечения срока
-        действия исключительных прав Участника на Доработки. \
-   7.3. Автор имеет право в одностороннем порядке вносить изменения и дополнения в договор
-        без специального уведомления об этом Участников. Новая редакция документа вступает
-        в силу через 3 (Три) календарных дня со дня опубликования в официальном Репозитории
-        Программы по адресу в сети Интернет
-        [https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md).
-        Участники самостоятельно отслеживают действующие условия Оферты. \
-   7.4. Все споры, возникающие между сторонами в процессе их взаимодействия по настоящему
-        договору, решаются путём переговоров. В случае невозможности урегулирования споров
-        переговорным порядком стороны разрешают их в Арбитражном суде г.Москвы.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "1.6.1")
+set(VERSION "0.9.3")

 add_subdirectory(src)
--- a/README-ru.md
+++ b/README-ru.md
@@ -6,8 +6,8 @@

 Вернём былую скорость кластерному блочному хранилищу!

-Vitastor - распределённая блочная и файловая SDS (программная СХД), прямой аналог Ceph RBD и CephFS,
-а также внутренних СХД популярных облачных провайдеров. Однако, в отличие от них, Vitastor
+Vitastor - распределённая блочная SDS (программная СХД), прямой аналог Ceph RBD и
+внутренних СХД популярных облачных провайдеров. Однако, в отличие от них, Vitastor
 быстрый и при этом простой. Только пока маленький :-).

 Vitastor архитектурно похож на Ceph, что означает атомарность и строгую консистентность,
@@ -50,7 +50,6 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
  - Параметры
    - [Общие](docs/config/common.ru.md)
    - [Сетевые](docs/config/network.ru.md)
-    - [Клиентский код](docs/config/client.en.md)
    - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
    - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
    - [Прочие параметры OSD](docs/config/osd.ru.md)
@@ -63,13 +62,11 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
  - [fio](docs/usage/fio.ru.md) для тестов производительности
  - [NBD](docs/usage/nbd.ru.md) для монтирования ядром
  - [QEMU и qemu-img](docs/usage/qemu.ru.md)
-  - [NFS](docs/usage/nfs.ru.md) кластерная файловая система и псевдо-ФС прокси
-  - [Администрирование](docs/usage/admin.ru.md)
+  - [NFS](docs/usage/nfs.ru.md)-прокси для VMWare и подобных
 - Производительность
  - [Понимание сути производительности](docs/performance/understanding.ru.md)
  - [Теоретический максимум](docs/performance/theoretical.ru.md)
  - [Пример сравнения с Ceph](docs/performance/comparison1.ru.md)
-  - [Более новый тест Vitastor 1.3.1](docs/performance/bench2.ru.md)

 ## Автор и лицензия

--- a/README.md
+++ b/README.md
@@ -6,9 +6,9 @@

 Make Clustered Block Storage Fast Again.

-Vitastor is a distributed block and file SDS, direct replacement of Ceph RBD and CephFS,
-and also internal SDS's of public clouds. However, in contrast to them, Vitastor is fast
-and simple at the same time. The only thing is it's slightly young :-).
+Vitastor is a distributed block SDS, direct replacement of Ceph RBD and internal SDS's
+of public clouds. However, in contrast to them, Vitastor is fast and simple at the same time.
+The only thing is it's slightly young :-).

 Vitastor is architecturally similar to Ceph which means strong consistency,
 primary-replication, symmetric clustering and automatic data distribution over any
@@ -50,7 +50,6 @@ Read more details below in the documentation.
  - Parameter Reference
    - [Common](docs/config/common.en.md)
    - [Network](docs/config/network.en.md)
-    - [Client](docs/config/client.en.md)
    - [Global Disk Layout](docs/config/layout-cluster.en.md)
    - [OSD Disk Layout](docs/config/layout-osd.en.md)
    - [OSD Runtime Parameters](docs/config/osd.en.md)
@@ -63,13 +62,11 @@ Read more details below in the documentation.
  - [fio](docs/usage/fio.en.md) for benchmarks
  - [NBD](docs/usage/nbd.en.md) for kernel mounts
  - [QEMU and qemu-img](docs/usage/qemu.en.md)
-  - [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
-  - [Administration](docs/usage/admin.en.md)
+  - [NFS](docs/usage/nfs.en.md) emulator for VMWare and similar
 - Performance
  - [Understanding storage performance](docs/performance/understanding.en.md)
  - [Theoretical performance](docs/performance/theoretical.en.md)
  - [Example comparison with Ceph](docs/performance/comparison1.en.md)
-  - [Newer benchmark of Vitastor 1.3.1](docs/performance/bench2.en.md)

 ## Author and License

--- a/copy-fio-includes.sh
+++ b/copy-fio-includes.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-gcc -I. -E -o fio_headers.i src/util/fio_headers.h
+gcc -I. -E -o fio_headers.i src/fio_headers.h

 rm -rf fio-copy
 for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do
--- a/copy-qemu-includes.sh
+++ b/copy-qemu-includes.sh
@@ -5,7 +5,7 @@
 #cd b/qemu; make qapi

 gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-    -I qemu/include -E -o qemu_driver.i src/client/qemu_driver.c
+    -I qemu/include -E -o qemu_driver.i src/qemu_driver.c

 rm -rf qemu-copy
 for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
--- a/2
+++ b/2
@@ -1 +1 @@
-Subproject commit 8de8b467acbca50cfd8835c20e0e379110f3b32b
+Subproject commit 45e6d1f13196a0824e2089a586c53b9de0283f17
--- a/csi/Dockerfile
+++ b/csi/Dockerfile
@@ -1,15 +1,14 @@
 # Compile stage
-FROM golang:bookworm AS build
+FROM golang:buster AS build

 ADD go.sum go.mod /app/
 RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
 ADD . /app
-RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
-    cd /app && \
-    CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
+RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'`
+RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi

 # Final stage
-FROM debian:bookworm
+FROM debian:buster

 LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
 LABEL description="Vitastor CSI Driver"
@@ -19,30 +18,19 @@ ENV CSI_ENDPOINT=""

 RUN apt-get update && \
    apt-get install -y wget && \
+    (echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
    apt-get update && \
-    apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
-        # dependencies of qemu-storage-daemon
-        libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
-        libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
+    apt-get install -y e2fsprogs xfsprogs kmod && \
    apt-get clean && \
    (echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)

 COPY --from=build /app/vitastor-csi /bin/

-RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/vitastor.list) && \
-    ((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
+RUN (echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
    apt-get update && \
    apt-get install -y vitastor-client && \
-    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-utils_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
-    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-block-extra_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
-    dpkg -x qemu-utils*.deb tmp1 && \
-    dpkg -x qemu-block-extra*.deb tmp1 && \
-    cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
-    mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
-    cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
-    rm -rf tmp1 *.deb && \
    apt-get clean

 ENTRYPOINT ["/bin/vitastor-csi"]
--- a/csi/Makefile
+++ b/csi/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= v1.6.1
+VERSION ?= v0.9.3

 all: build push

--- a/csi/deploy/001-csi-config-map.yaml
+++ b/csi/deploy/001-csi-config-map.yaml
@@ -2,7 +2,6 @@
 apiVersion: v1
 kind: ConfigMap
 data:
-  # You can add multiple configuration files here to use a multi-cluster setup
  vitastor.conf: |-
    {"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
 metadata:
--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.6.1
+          image: vitalif/vitastor-csi:v0.9.3
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
@@ -82,8 +82,6 @@ spec:
              name: host-sys
            - mountPath: /run/mount
              name: host-mount
-            - mountPath: /run/vitastor-csi
-              name: run-vitastor-csi
            - mountPath: /lib/modules
              name: lib-modules
              readOnly: true
@@ -134,9 +132,6 @@ spec:
        - name: host-mount
          hostPath:
            path: /run/mount
-        - name: run-vitastor-csi
-          hostPath:
-            path: /run/vitastor-csi
        - name: lib-modules
          hostPath:
            path: /lib/modules
--- a/csi/deploy/005-csi-provisioner-rbac.yaml
+++ b/csi/deploy/005-csi-provisioner-rbac.yaml
@@ -35,13 +35,10 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshots"]
-    verbs: ["get", "list", "patch"]
-  - apiGroups: ["snapshot.storage.k8s.io"]
-    resources: ["volumesnapshots/status"]
-    verbs: ["get", "list", "patch"]
+    verbs: ["get", "list"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents"]
-    verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
+    verbs: ["create", "get", "list", "watch", "update", "delete"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotclasses"]
    verbs: ["get", "list", "watch"]
@@ -56,7 +53,7 @@ rules:
    verbs: ["get", "list", "watch"]
  - apiGroups: ["snapshot.storage.k8s.io"]
    resources: ["volumesnapshotcontents/status"]
-    verbs: ["update", "patch"]
+    verbs: ["update"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get"]
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@@ -23,11 +23,6 @@ metadata:
  name: csi-vitastor-provisioner
 spec:
  replicas: 3
-  strategy:
-    type: RollingUpdate
-    rollingUpdate:
-      maxUnavailable: 1
-      maxSurge: 0
  selector:
    matchLabels:
      app: csi-vitastor-provisioner
@ -51,7 +46,7 @@ spec:
      priorityClassName: system-cluster-critical
      containers:
        - name: csi-provisioner
-          image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
+          image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
          args:
            - "--csi-address=$(ADDRESS)"
            - "--v=5"
@ -121,7 +116,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.6.1
+          image: vitalif/vitastor-csi:v0.9.3
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/009-storage-class.yaml
+++ b/csi/deploy/009-storage-class.yaml
@ -12,6 +12,8 @@ parameters:
  etcdVolumePrefix: ""
  poolId: "1"
  # you can choose other configuration file if you have it in the config map
-  # different etcd URLs and prefixes should also be put in the config
  #configPath: "/etc/vitastor/vitastor.conf"
-allowVolumeExpansion: true
+  # you can also specify etcdUrl here, e.g. to connect to another Vitastor cluster
+  # multiple etcd URLs may be specified, separated by commas
+  #etcdUrl: "http://192.168.7.2:2379"
+  #etcdPrefix: "/vitastor"
--- a/csi/deploy/example-snapshot-class.yaml
+++ b/csi/deploy/example-snapshot-class.yaml
@ -1,7 +0,0 @@
-apiVersion: snapshot.storage.k8s.io/v1
-kind: VolumeSnapshotClass
-metadata:
-  name: vitastor-snapclass
-driver: csi.vitastor.io
-deletionPolicy: Delete
-parameters:
--- a/csi/deploy/example-snapshot-clone.yaml
+++ b/csi/deploy/example-snapshot-clone.yaml
@ -1,16 +0,0 @@
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: test-vitastor-clone
-spec:
-  storageClassName: vitastor
-  dataSource:
-    name: snap1
-    kind: VolumeSnapshot
-    apiGroup: snapshot.storage.k8s.io
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 10Gi
--- a/csi/deploy/example-snapshot.yaml
+++ b/csi/deploy/example-snapshot.yaml
@ -1,8 +0,0 @@
-apiVersion: snapshot.storage.k8s.io/v1
-kind: VolumeSnapshot
-metadata:
-  name: snap1
-spec:
-  volumeSnapshotClassName: vitastor-snapclass
-  source:
-    persistentVolumeClaimName: test-vitastor-pvc
--- a/csi/go.mod
+++ b/csi/go.mod
@ -9,7 +9,6 @@ require (
 	golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
 	google.golang.org/grpc v1.33.1
-	google.golang.org/protobuf v1.24.0
 	k8s.io/klog v1.0.0
 	k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
 )
--- a/csi/src/config.go
+++ b/csi/src/config.go
@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.6.1"
+    vitastorCSIDriverVersion = "0.9.3"
 )

 // Config struct fills the parameters of request or user input
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@ -20,7 +20,6 @@ import (

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
-    "google.golang.org/protobuf/types/known/timestamppb"

    "github.com/container-storage-interface/spec/lib/go/csi"
 )
@ -46,7 +45,6 @@ type InodeConfig struct
    ParentPool uint64 `json:"parent_pool,omitempty"`
    ParentId uint64 `json:"parent_id,omitempty"`
    Readonly bool `json:"readonly,omitempty"`
-    CreateTs uint64 `json:"create_ts,omitempty"`
 }

 type ControllerServer struct
@ -62,7 +60,7 @@ func NewControllerServer(driver *Driver) *ControllerServer
    }
 }

-func GetConnectionParams(params map[string]string) (map[string]string, error)
+func GetConnectionParams(params map[string]string) (map[string]string, []string, string)
 {
    ctxVars := make(map[string]string)
    configPath := params["configPath"]
@ -75,69 +73,71 @@ func GetConnectionParams(params map[string]string) (map[string]string, error)
        ctxVars["configPath"] = configPath
    }
    config := make(map[string]interface{})
-    configFD, err := os.Open(configPath)
-    if (err != nil)
+    if configFD, err := os.Open(configPath); err == nil
    {
-        return nil, err
+        defer configFD.Close()
+        data, _ := ioutil.ReadAll(configFD)
+        json.Unmarshal(data, &config)
    }
-    defer configFD.Close()
-    data, _ := ioutil.ReadAll(configFD)
-    json.Unmarshal(data, &config)
-    // Check etcd URL in the config, but do not use the explicit etcdUrl
-    // parameter for CLI calls, otherwise users won't be able to later
-    // change them - storage class parameters are saved in volume IDs
+    // Try to load prefix & etcd URL from the config
    var etcdUrl []string
-    switch config["etcd_address"].(type)
+    if (params["etcdUrl"] != "")
    {
-    case string:
-        url := strings.TrimSpace(config["etcd_address"].(string))
-        if (url != "")
-        {
-            etcdUrl = strings.Split(url, ",")
-        }
-    case []string:
-        etcdUrl = config["etcd_address"].([]string)
-    case []interface{}:
-        for _, url := range config["etcd_address"].([]interface{})
-        {
-            s, ok := url.(string)
-            if (ok)
-            {
-                etcdUrl = append(etcdUrl, s)
-            }
-        }
+        ctxVars["etcdUrl"] = params["etcdUrl"]
+        etcdUrl = strings.Split(params["etcdUrl"], ",")
    }
    if (len(etcdUrl) == 0)
    {
-        return nil, status.Error(codes.InvalidArgument, "etcd_address is missing in "+configPath)
+        switch config["etcd_address"].(type)
+        {
+        case string:
+            etcdUrl = strings.Split(config["etcd_address"].(string), ",")
+        case []string:
+            etcdUrl = config["etcd_address"].([]string)
+        }
    }
-    return ctxVars, nil
-}
-
-func system(program string, args ...string) ([]byte, []byte, error)
-{
-    klog.Infof("Running "+program+" "+strings.Join(args, " "))
-    c := exec.Command(program, args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout, c.Stderr = &stdout, &stderr
-    err := c.Run()
-    if (err != nil)
+    etcdPrefix := params["etcdPrefix"]
+    if (etcdPrefix == "")
    {
-        stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
-        klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
-        return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
+        etcdPrefix, _ = config["etcd_prefix"].(string)
+        if (etcdPrefix == "")
+        {
+            etcdPrefix = "/vitastor"
+        }
    }
-    return stdout.Bytes(), stderr.Bytes(), nil
+    else
+    {
+        ctxVars["etcdPrefix"] = etcdPrefix
+    }
+    return ctxVars, etcdUrl, etcdPrefix
 }

 func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
 {
+    if (ctxVars["etcdUrl"] != "")
+    {
+        args = append(args, "--etcd_address", ctxVars["etcdUrl"])
+    }
+    if (ctxVars["etcdPrefix"] != "")
+    {
+        args = append(args, "--etcd_prefix", ctxVars["etcdPrefix"])
+    }
    if (ctxVars["configPath"] != "")
    {
        args = append(args, "--config_path", ctxVars["configPath"])
    }
-    stdout, _, err := system("/usr/bin/vitastor-cli", args...)
-    return stdout, err
+    c := exec.Command("/usr/bin/vitastor-cli", args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout = &stdout
+    c.Stderr = &stderr
+    err := c.Run()
+    stderrStr := string(stderr.Bytes())
+    if (err != nil)
+    {
+        klog.Errorf("vitastor-cli %s failed: %s, status %s\n", strings.Join(args, " "), stderrStr, err)
+        return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
+    }
+    return stdout.Bytes(), nil
 }

 // Create the volume
@ -172,49 +172,33 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
        volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
    }

-    ctxVars, err := GetConnectionParams(req.Parameters)
-    if (err != nil)
+    ctxVars, etcdUrl, _ := GetConnectionParams(req.Parameters)
+    if (len(etcdUrl) == 0)
    {
-        return nil, err
-    }
-
-    args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
-
-    // Support creation from snapshot
-    var src *csi.VolumeContentSource
-    if (req.VolumeContentSource.GetSnapshot() != nil)
-    {
-        snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
-        if (snapId != "")
-        {
-            snapVars := make(map[string]string)
-            err := json.Unmarshal([]byte(snapId), &snapVars)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-            }
-            args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
-            src = &csi.VolumeContentSource{
-                Type: &csi.VolumeContentSource_Snapshot{
-                    Snapshot: &csi.VolumeContentSource_SnapshotSource{
-                        SnapshotId: snapId,
-                    },
-                },
-            }
-        }
+        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
    }

    // Create image using vitastor-cli
-    _, err = invokeCLI(ctxVars, args)
+    _, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
    if (err != nil)
    {
        if (strings.Index(err.Error(), "already exists") > 0)
        {
-            inodeCfg, err := invokeList(ctxVars, volName, true)
+            stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
            if (err != nil)
            {
                return nil, err
            }
+            var inodeCfg []InodeConfig
+            err = json.Unmarshal(stat, &inodeCfg)
+            if (err != nil)
+            {
+                return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
+            }
+            if (len(inodeCfg) == 0)
+            {
+                return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
+            }
            if (inodeCfg[0].Size < uint64(volSize))
            {
                return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@ -233,7 +217,6 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
            // Ugly, but VolumeContext isn't passed to DeleteVolume :-(
            VolumeId: string(volumeIdJson),
            CapacityBytes: volSize,
-            ContentSource: src,
        },
    }, nil
 }
@ -247,19 +230,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
    }

-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
+    ctxVars := make(map[string]string)
+    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
    if (err != nil)
    {
        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
    }
-    volName := volVars["name"]
+    volName := ctxVars["name"]

-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
+    ctxVars, _, _ = GetConnectionParams(ctxVars)

    _, err = invokeCLI(ctxVars, []string{ "rm", volName })
    if (err != nil)
@ -365,8 +344,6 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
        csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
        csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
        csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
-        csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
-        // TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
    } {
        controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
    }
@ -376,226 +353,28 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
    }, nil
 }

-func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
-{
-    stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
-    if (err != nil)
-    {
-        return nil, err
-    }
-    var inodeCfg []InodeConfig
-    err = json.Unmarshal(stat, &inodeCfg)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
-    }
-    if (expectExist && len(inodeCfg) == 0)
-    {
-        return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
-    }
-    return inodeCfg, nil
-}
-
 // CreateSnapshot creates a snapshot of an existing PV
 func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
 {
-    klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.SourceVolumeId == "" || req.Name == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
-    }
-
-    // snapshot name
-    snapName := req.Name
-
-    // req.VolumeId is an ugly json string in our case :)
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := ctxVars["name"]
-
-    // Create image using vitastor-cli
-    _, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
-    if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
-    {
-        return nil, err
-    }
-
-    // Check created snapshot
-    inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    // Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
-    ctxVars["snapshot"] = snapName
-    snapIdJson, _ := json.Marshal(ctxVars)
-    return &csi.CreateSnapshotResponse{
-        Snapshot: &csi.Snapshot{
-            SizeBytes: int64(inodeCfg[0].Size),
-            SnapshotId: string(snapIdJson),
-            SourceVolumeId: req.SourceVolumeId,
-            CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
-            ReadyToUse: true,
-        },
-    }, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

 // DeleteSnapshot deletes the provided snapshot of a PV
 func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
 {
-    klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.SnapshotId == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
-    }
-    volName := volVars["name"]
-    snapName := volVars["snapshot"]
-
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    return &csi.DeleteSnapshotResponse{}, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

 // ListSnapshots lists the snapshots of a PV
 func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
 {
-    klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := volVars["name"]
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    resp := &csi.ListSnapshotsResponse{}
-    for _, ino := range inodeCfg
-    {
-        snapName := ino.Name[len(volName)+1:]
-        if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
-        {
-        }
-        else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
-        {
-            volVars["snapshot"] = snapName
-            snapIdJson, _ := json.Marshal(volVars)
-            resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
-                Snapshot: &csi.Snapshot{
-                    SizeBytes: int64(ino.Size),
-                    SnapshotId: string(snapIdJson),
-                    SourceVolumeId: req.SourceVolumeId,
-                    CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
-                    ReadyToUse: true,
-                },
-            })
-        }
-        else
-        {
-            resp.NextToken = snapName
-            break
-        }
-    }
-
-    return resp, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

-// ControllerExpandVolume increases the size of a volume
+// ControllerExpandVolume resizes a volume
 func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
 {
-    klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := volVars["name"]
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    inodeCfg, err := invokeList(ctxVars, volName, true)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
-    {
-        sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
-        _, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
-        if (err != nil)
-        {
-            return nil, err
-        }
-        inodeCfg, err = invokeList(ctxVars, volName, true)
-        if (err != nil)
-        {
-            return nil, err
-        }
-    }
-
-    return &csi.ControllerExpandVolumeResponse{
-        CapacityBytes: int64(inodeCfg[0].Size),
-        NodeExpansionRequired: false,
-    }, nil
+    return nil, status.Error(codes.Unimplemented, "")
 }

 // ControllerGetVolume gets volume info
--- a/csi/src/identityserver.go
+++ b/csi/src/identityserver.go
@ -49,13 +49,6 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
                    },
                },
            },
-            {
-                Type: &csi.PluginCapability_VolumeExpansion_{
-                    VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
-                        Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
-                    },
-                },
-            },
        },
    }, nil
 }
--- a/csi/src/nodeserver.go
+++ b/csi/src/nodeserver.go
@ -5,15 +5,11 @@ package vitastor

 import (
    "context"
-    "encoding/json"
-    "fmt"
    "os"
    "os/exec"
-    "path/filepath"
+    "encoding/json"
    "strings"
-    "sync"
-    "syscall"
-    "time"
+    "bytes"

    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
@ -29,440 +25,55 @@ import (
 type NodeServer struct
 {
    *Driver
-    useVduse bool
-    stateDir string
    mounter mount.Interface
-    restartInterval time.Duration
-    mu sync.Mutex
-    cond *sync.Cond
-    volumeLocks map[string]bool
-}
-
-type DeviceState struct
-{
-    ConfigPath string `json:"configPath"`
-    VdpaId     string `json:"vdpaId"`
-    Image      string `json:"image"`
-    Blockdev   string `json:"blockdev"`
-    Readonly   bool   `json:"readonly"`
-    PidFile    string `json:"pidFile"`
 }

 // NewNodeServer creates a new node server instance
 func NewNodeServer(driver *Driver) *NodeServer
 {
-    stateDir := os.Getenv("STATE_DIR")
-    if (stateDir == "")
-    {
-        stateDir = "/run/vitastor-csi"
-    }
-    if (stateDir[len(stateDir)-1] != '/')
-    {
-        stateDir += "/"
-    }
-    ns := &NodeServer{
+    return &NodeServer{
        Driver: driver,
-        useVduse: checkVduseSupport(),
-        stateDir: stateDir,
        mounter: mount.New(""),
-        volumeLocks: make(map[string]bool),
-    }
-    ns.cond = sync.NewCond(&ns.mu)
-    if (ns.useVduse)
-    {
-        ns.restoreVduseDaemons()
-        dur, err := time.ParseDuration(os.Getenv("RESTART_INTERVAL"))
-        if (err != nil)
-        {
-            dur = 10 * time.Second
-        }
-        ns.restartInterval = dur
-        if (ns.restartInterval != time.Duration(0))
-        {
-            go ns.restarter()
-        }
-    }
-    return ns
-}
-
-func (ns *NodeServer) lockVolume(lockId string)
-{
-    ns.mu.Lock()
-    defer ns.mu.Unlock()
-    for (ns.volumeLocks[lockId])
-    {
-        ns.cond.Wait()
-    }
-    ns.volumeLocks[lockId] = true
-    ns.cond.Broadcast()
-}
-
-func (ns *NodeServer) unlockVolume(lockId string)
-{
-    ns.mu.Lock()
-    defer ns.mu.Unlock()
-    delete(ns.volumeLocks, lockId)
-    ns.cond.Broadcast()
-}
-
-func (ns *NodeServer) restarter()
-{
-    // Restart dead VDUSE daemons at regular intervals
-    // Otherwise volume I/O may hang in case of a qemu-storage-daemon crash
-    // Moreover, it may lead to a kernel panic if the kernel is configured to
-    // panic on hung tasks
-    ticker := time.NewTicker(ns.restartInterval)
-    defer ticker.Stop()
-    for
-    {
-        <-ticker.C
-        ns.restoreVduseDaemons()
-    }
-}
-
-func (ns *NodeServer) restoreVduseDaemons()
-{
-    pattern := ns.stateDir+"vitastor-vduse-*.json"
-    matches, err := filepath.Glob(pattern)
-    if (err != nil)
-    {
-        klog.Errorf("failed to list %s: %v", pattern, err)
-    }
-    if (len(matches) == 0)
-    {
-        return
-    }
-    devList := make(map[string]interface{})
-    // example output: {"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}
-    devListJSON, _, err := system("/sbin/vdpa", "-j", "dev", "list")
-    if (err != nil)
-    {
-        return
-    }
-    err = json.Unmarshal(devListJSON, &devList)
-    devs, ok := devList["dev"].(map[string]interface{})
-    if (err != nil || !ok)
-    {
-        klog.Errorf("/sbin/vdpa -j dev list returned bad JSON (error %v): %v", err, string(devListJSON))
-        return
-    }
-    for _, stateFile := range matches
-    {
-        vdpaId := filepath.Base(stateFile)
-        vdpaId = vdpaId[0:len(vdpaId)-5]
-        // Check if VDPA device is still added to the bus
-        if (devs[vdpaId] == nil)
-        {
-            // Unused, clean it up
-            unmapVduseById(ns.stateDir, vdpaId)
-            continue
-        }
-
-        stateJSON, err := os.ReadFile(stateFile)
-        if (err != nil)
-        {
-            klog.Warningf("error reading state file %v: %v", stateFile, err)
-            continue
-        }
-        var state DeviceState
-        err = json.Unmarshal(stateJSON, &state)
-        if (err != nil)
-        {
-            klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
-            continue
-        }
-
-        ns.lockVolume(state.ConfigPath+":"+state.Image)
-
-        // Recheck state file after locking
-        _, err = os.ReadFile(stateFile)
-        if (err != nil)
-        {
-            klog.Warningf("state file %v disappeared, skipping volume", stateFile)
-            ns.unlockVolume(state.ConfigPath+":"+state.Image)
-            continue
-        }
-
-        // Check if the storage daemon is still active
-        pidFile := ns.stateDir + vdpaId + ".pid"
-        exists := false
-        proc, err := findByPidFile(pidFile)
-        if (err == nil)
-        {
-            exists = proc.Signal(syscall.Signal(0)) == nil
-        }
-        if (!exists)
-        {
-            // Restart daemon
-            klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
-            _ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
-        }
-
-        ns.unlockVolume(state.ConfigPath+":"+state.Image)
    }
 }

 // NodeStageVolume mounts the volume to a staging path on the node.
 func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error)
 {
-    klog.Infof("received node stage volume request %+v", protosanitizer.StripSecrets(req))
-
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    _, err = GetConnectionParams(ctxVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    volName := ctxVars["name"]
-
-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
-
-    targetPath := req.GetStagingTargetPath()
-    isBlock := req.GetVolumeCapability().GetBlock() != nil
-
-    // Check that it's not already mounted
-    _, err = mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (err != nil)
-    {
-        if (os.IsNotExist(err))
-        {
-            if (isBlock)
-            {
-                pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-                err = pathFile.Close()
-                if (err != nil)
-                {
-                    klog.Errorf("failed to close %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-            }
-            else
-            {
-                err := os.MkdirAll(targetPath, 0777)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-            }
-        }
-        else
-        {
-            return nil, err
-        }
-    }
-
-    var devicePath, vdpaId string
-    if (!ns.useVduse)
-    {
-        devicePath, err = mapNbd(volName, ctxVars, false)
-    }
-    else
-    {
-        devicePath, vdpaId, err = mapVduse(ns.stateDir, volName, ctxVars, false)
-    }
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
-    if (isBlock)
-    {
-        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
-    }
-    else
-    {
-        // Check existing format
-        existingFormat, err := diskMounter.GetDiskFormat(devicePath)
-        if (err != nil)
-        {
-            klog.Errorf("failed to get disk format for path %s, error: %v", err)
-            goto unmap
-        }
-
-        // Format the device (ext4 or xfs)
-        fsType := req.GetVolumeCapability().GetMount().GetFsType()
-        opt := req.GetVolumeCapability().GetMount().GetMountFlags()
-        opt = append(opt, "_netdev")
-        if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
-            req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
-            !Contains(opt, "ro"))
-        {
-            opt = append(opt, "ro")
-        }
-        if (fsType == "xfs")
-        {
-            opt = append(opt, "nouuid")
-        }
-        readOnly := Contains(opt, "ro")
-        if (existingFormat == "" && !readOnly)
-        {
-            var cmdOut []byte
-            switch fsType
-            {
-                case "ext4":
-                    args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
-                    cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
-                case "xfs":
-                    cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
-            }
-            if (err != nil)
-            {
-                klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
-                goto unmap
-            }
-        }
-
-        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
-
-        // Try to run online resize on mount.
-        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
-        if (err == nil && existingFormat != "" && !readOnly)
-        {
-            var cmdOut []byte
-            switch (fsType)
-            {
-                case "ext4":
-                    cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
-                case "xfs":
-                    cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
-            }
-            if (err != nil)
-            {
-                klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
-                goto unmap
-            }
-        }
-    }
-    if (err != nil)
-    {
-        klog.Errorf(
-            "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
-            devicePath, targetPath, volName, err,
-        )
-        goto unmap
-    }
    return &csi.NodeStageVolumeResponse{}, nil
-
-unmap:
-    if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
-    {
-        unmapNbd(devicePath)
-    }
-    else
-    {
-        unmapVduseById(ns.stateDir, vdpaId)
-    }
-    return nil, err
 }

 // NodeUnstageVolume unstages the volume from the staging path
 func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error)
 {
-    klog.Infof("received node unstage volume request %+v", protosanitizer.StripSecrets(req))
-
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := ctxVars["name"]
-
-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
-
-    targetPath := req.GetStagingTargetPath()
-    devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
-    if (err != nil)
-    {
-        if (os.IsNotExist(err))
-        {
-            return nil, status.Error(codes.NotFound, "Target path not found")
-        }
-        return nil, err
-    }
-    if (devicePath == "")
-    {
-        // volume not mounted
-        klog.Warningf("%s is not a mountpoint, deleting", targetPath)
-        os.Remove(targetPath)
-        return &csi.NodeUnstageVolumeResponse{}, nil
-    }
-
-    // unmount
-    err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    // unmap device
-    if (refCount == 1)
-    {
-        if (!ns.useVduse)
-        {
-            unmapNbd(devicePath)
-        }
-        else
-        {
-            unmapVduse(ns.stateDir, devicePath)
-        }
-    }
-
    return &csi.NodeUnstageVolumeResponse{}, nil
 }

+// Contains reports whether s is equal to any element of list
+func Contains(list []string, s string) bool
+{
+    for _, item := range list
+    {
+        if (item == s)
+        {
+            return true
+        }
+    }
+    return false
+}
+
 // NodePublishVolume mounts the volume mounted to the staging path to the target path
 func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
 {
+    // Flow: make sure targetPath exists, map the image as an NBD device with
+    // vitastor-nbd, run mkfs if the device has no filesystem and is writable,
+    // then mount it at targetPath. Every failure after a successful map
+    // unmaps the NBD device again before returning.
    klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))

-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    _, err = GetConnectionParams(ctxVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    volName := ctxVars["name"]
-
-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
-
-    stagingTargetPath := req.GetStagingTargetPath()
    targetPath := req.GetTargetPath()
    isBlock := req.GetVolumeCapability().GetBlock() != nil

-    // Check that stagingTargetPath is mounted
-    _, err = mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
-    if (err != nil)
+    // Check that it's not already mounted
+    // (the result variable is not called "error" so that the built-in type is not shadowed)
+    _, mntErr := mount.IsNotMountPoint(ns.mounter, targetPath)
+    if (mntErr != nil)
    {
-        klog.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
-        return nil, fmt.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
-    }
-
-    // Check that targetPath is not already mounted
-    _, err = mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (err != nil)
-    {
-        if (os.IsNotExist(err))
+        if (os.IsNotExist(mntErr))
        {
            if (isBlock)
            {
@ -470,13 +81,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
                if (err != nil)
                {
                    klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
-                    return nil, err
+                    return nil, status.Error(codes.Internal, err.Error())
                }
                err = pathFile.Close()
                if (err != nil)
                {
                    klog.Errorf("failed to close %s with error: %v", targetPath, err)
-                    return nil, err
+                    return nil, status.Error(codes.Internal, err.Error())
                }
            }
            else
@ -485,38 +96,16 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
                if (err != nil)
                {
                    klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
-                    return nil, err
+                    return nil, status.Error(codes.Internal, err.Error())
                }
            }
        }
        else
        {
-            return nil, err
+            return nil, status.Error(codes.Internal, mntErr.Error())
        }
    }

-    execArgs := []string{"--bind", stagingTargetPath, targetPath}
-    if (req.GetReadonly())
-    {
-        execArgs = append(execArgs, "-o", "ro")
-    }
-    cmd := exec.Command("mount", execArgs...)
-    cmd.Stderr = os.Stderr
-    klog.Infof("binding volume %v (%v) from %v to %v", volName, ctxVars["configPath"], stagingTargetPath, targetPath)
-    out, err := cmd.Output()
-    if (err != nil)
-    {
-        return nil, fmt.Errorf("Error running mount %v: %s", strings.Join(execArgs, " "), out)
-    }
-
-    return &csi.NodePublishVolumeResponse{}, nil
-}
-
-// NodeUnpublishVolume unmounts the volume from the target path
-func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error)
-{
-    klog.Infof("received node unpublish volume request %+v", protosanitizer.StripSecrets(req))
-
    ctxVars := make(map[string]string)
    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
    if (err != nil)
@ -525,34 +114,154 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
    }
    volName := ctxVars["name"]

-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
+    _, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
+    if (len(etcdUrl) == 0)
+    {
+        return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
+    }

+    // Map NBD device
+    // FIXME: Check if already mapped
+    args := []string{
+        "map", "--etcd_address", strings.Join(etcdUrl, ","),
+        "--etcd_prefix", etcdPrefix,
+        "--image", volName,
+    };
+    if (ctxVars["configPath"] != "")
+    {
+        args = append(args, "--config_path", ctxVars["configPath"])
+    }
+    if (req.GetReadonly())
+    {
+        args = append(args, "--readonly", "1")
+    }
+    c := exec.Command("/usr/bin/vitastor-nbd", args...)
+    var stdout, stderr bytes.Buffer
+    c.Stdout, c.Stderr = &stdout, &stderr
+    err = c.Run()
+    stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
+    if (err != nil)
+    {
+        klog.Errorf("vitastor-nbd map failed: %s, status %s\n", stdoutStr+stderrStr, err)
+        return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
+    }
+    // The trimmed stdout of "vitastor-nbd map" is used as the device path
+    devicePath := strings.TrimSpace(stdoutStr)
+
+    // Check existing format
+    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
+    existingFormat, err := diskMounter.GetDiskFormat(devicePath)
+    if (err != nil)
+    {
+        klog.Errorf("failed to get disk format for path %s, error: %v", devicePath, err)
+        // unmap NBD device
+        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+        if (unmapErr != nil)
+        {
+            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+        }
+        return nil, status.Error(codes.Internal, err.Error())
+    }
+
+    // Format the device (ext4 or xfs)
+    fsType := req.GetVolumeCapability().GetMount().GetFsType()
+    opt := req.GetVolumeCapability().GetMount().GetMountFlags()
+    opt = append(opt, "_netdev")
+    if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
+        req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
+        !Contains(opt, "ro"))
+    {
+        opt = append(opt, "ro")
+    }
+    if (fsType == "xfs")
+    {
+        opt = append(opt, "nouuid")
+    }
+    // mkfs is only attempted on a device without a detected filesystem that is not mounted read-only
+    readOnly := Contains(opt, "ro")
+    if (existingFormat == "" && !readOnly)
+    {
+        args := []string{}
+        switch fsType
+        {
+            case "ext4":
+                args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
+            case "xfs":
+                args = []string{"-K", devicePath}
+        }
+        // Any other fsType leaves args empty, so no explicit mkfs is run here
+        if (len(args) > 0)
+        {
+            cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
+            if (cmdErr != nil)
+            {
+                klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
+                // unmap NBD device
+                unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+                if (unmapErr != nil)
+                {
+                    klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+                }
+                return nil, status.Error(codes.Internal, cmdErr.Error())
+            }
+        }
+    }
+    if (isBlock)
+    {
+        // Block volumes: bind-mount the device itself onto targetPath
+        opt = append(opt, "bind")
+        err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
+    }
+    else
+    {
+        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
+    }
+    if (err != nil)
+    {
+        klog.Errorf(
+            "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
+            devicePath, targetPath, volName, err,
+        )
+        // unmap NBD device
+        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+        if (unmapErr != nil)
+        {
+            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+        }
+        return nil, status.Error(codes.Internal, err.Error())
+    }
+    return &csi.NodePublishVolumeResponse{}, nil
+}
+
+// NodeUnpublishVolume unmounts the volume from the target path
+// and then runs "vitastor-nbd unmap" on the backing device when
+// GetDeviceNameFromMount reported a reference count of exactly 1.
+// It returns NotFound when the target path does not exist or when no device
+// is mounted there, and Internal for other lookup or unmount failures.
+func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error)
+{
+    klog.Infof("received node unpublish volume request %+v", protosanitizer.StripSecrets(req))
    targetPath := req.GetTargetPath()
-    devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
+    devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
    if (err != nil)
    {
        if (os.IsNotExist(err))
        {
            return nil, status.Error(codes.NotFound, "Target path not found")
        }
-        return nil, err
+        return nil, status.Error(codes.Internal, err.Error())
    }
    if (devicePath == "")
    {
-        // volume not mounted
-        klog.Warningf("%s is not a mountpoint, deleting", targetPath)
-        os.Remove(targetPath)
-        return &csi.NodeUnpublishVolumeResponse{}, nil
+        return nil, status.Error(codes.NotFound, "Volume not mounted")
    }
-
    // unmount
    err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
    if (err != nil)
    {
-        return nil, err
+        return nil, status.Error(codes.Internal, err.Error())
+    }
+    // unmap NBD device
+    // NOTE(review): presumably refCount == 1 means targetPath was the only
+    // mount of this device - confirm against mount.GetDeviceNameFromMount
+    if (refCount == 1)
+    {
+        unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
+        if (unmapErr != nil)
+        {
+            // An unmap failure is only logged; the RPC still returns success
+            klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
+        }
    }
-
    return &csi.NodeUnpublishVolumeResponse{}, nil
 }

@ -571,17 +280,7 @@ func (ns *NodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpandV
 // NodeGetCapabilities returns the supported capabilities of the node server
 func (ns *NodeServer) NodeGetCapabilities(ctx context.Context, req *csi.NodeGetCapabilitiesRequest) (*csi.NodeGetCapabilitiesResponse, error)
 {
-    return &csi.NodeGetCapabilitiesResponse{
-        Capabilities: []*csi.NodeServiceCapability{
-            &csi.NodeServiceCapability{
-                Type: &csi.NodeServiceCapability_Rpc{
-                    Rpc: &csi.NodeServiceCapability_RPC{
-                        Type: csi.NodeServiceCapability_RPC_STAGE_UNSTAGE_VOLUME,
-                    },
-                },
-            },
-        },
-    }, nil
+    // The response lists no capabilities; the removed implementation above
+    // listed NodeServiceCapability_RPC_STAGE_UNSTAGE_VOLUME.
+    return &csi.NodeGetCapabilitiesResponse{}, nil
 }

 // NodeGetInfo returns NodeGetInfoResponse for CO.
--- a/csi/src/utils.go
+++ b/csi/src/utils.go
@ -1,301 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package vitastor
-
-import (
-    "errors"
-    "encoding/json"
-    "fmt"
-    "os"
-    "os/exec"
-    "path/filepath"
-    "strconv"
-    "strings"
-    "syscall"
-
-    "k8s.io/klog"
-)
-
-func Contains(list []string, s string) bool
-{
-    for i := 0; i < len(list); i++
-    {
-        if (list[i] == s)
-        {
-            return true
-        }
-    }
-    return false
-}
-
-func checkVduseSupport() bool
-{
-    // Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
-    vduse := true
-    for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
-    {
-        _, err := os.Stat("/sys/module/"+mod)
-        if (err != nil)
-        {
-            if (!errors.Is(err, os.ErrNotExist))
-            {
-                klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
-            }
-            c := exec.Command("/sbin/modprobe", mod)
-            c.Stdout = os.Stderr
-            c.Stderr = os.Stderr
-            err := c.Run()
-            if (err != nil)
-            {
-                klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
-                vduse = false
-                break
-            }
-        }
-    }
-    // Check that vdpa tool functions
-    if (vduse)
-    {
-        c := exec.Command("/sbin/vdpa", "-j", "dev")
-        c.Stderr = os.Stderr
-        err := c.Run()
-        if (err != nil)
-        {
-            klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
-            vduse = false
-        }
-    }
-    if (!vduse)
-    {
-        klog.Errorf(
-            "Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
-            " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
-        )
-    }
-    return vduse
-}
-
-func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
-{
-    // Map NBD device
-    // FIXME: Check if already mapped
-    args := []string{
-        "map", "--image", volName,
-    }
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    if (readonly)
-    {
-        args = append(args, "--readonly", "1")
-    }
-    stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
-    dev := strings.TrimSpace(string(stdout))
-    if (dev == "")
-    {
-        return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
-    }
-    return dev, err
-}
-
-func unmapNbd(devicePath string)
-{
-    // unmap NBD device
-    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-    if (unmapErr != nil)
-    {
-        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-    }
-}
-
-func findByPidFile(pidFile string) (*os.Process, error)
-{
-    pidBuf, err := os.ReadFile(pidFile)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    proc, err := os.FindProcess(int(pid))
-    if (err != nil)
-    {
-        return nil, err
-    }
-    return proc, nil
-}
-
-func killByPidFile(pidFile string) error
-{
-    klog.Infof("killing process with PID from file %s", pidFile)
-    proc, err := findByPidFile(pidFile)
-    if (err != nil)
-    {
-        return err
-    }
-    return proc.Signal(syscall.SIGTERM)
-}
-
-func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
-{
-    // Start qemu-storage-daemon
-    blockSpec := map[string]interface{}{
-        "node-name": "disk1",
-        "driver": "vitastor",
-        "image": volName,
-        "cache": map[string]bool{
-            "direct": true,
-            "no-flush": false,
-        },
-        "discard": "unmap",
-    }
-    if (configPath != "")
-    {
-        blockSpec["config-path"] = configPath
-    }
-    blockSpecJson, _ := json.Marshal(blockSpec)
-    writable := "true"
-    if (readonly)
-    {
-        writable = "false"
-    }
-    _, _, err := system(
-        "/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
-        "--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
-    )
-    return err
-}
-
-func mapVduse(stateDir string, volName string, ctxVars map[string]string, readonly bool) (string, string, error)
-{
-    // Generate state file
-    stateFd, err := os.CreateTemp(stateDir, "vitastor-vduse-*.json")
-    if (err != nil)
-    {
-        return "", "", err
-    }
-    stateFile := stateFd.Name()
-    stateFd.Close()
-    vdpaId := filepath.Base(stateFile)
-    vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
-    pidFile := stateDir + vdpaId + ".pid"
-    // Map VDUSE device via qemu-storage-daemon
-    err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
-    if (err == nil)
-    {
-        // Add device to VDPA bus
-        _, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
-        if (err == nil)
-        {
-            // Find block device name
-            var matches []string
-            matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
-            if (err == nil && len(matches) == 0)
-            {
-                err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
-            }
-            if (err == nil)
-            {
-                blockdev := "/dev/"+filepath.Base(matches[0])
-                _, err = os.Stat(blockdev)
-                if (err == nil)
-                {
-                    // Generate state file
-                    stateJSON, _ := json.Marshal(&DeviceState{
-                        ConfigPath: ctxVars["configPath"],
-                        VdpaId:     vdpaId,
-                        Image:      volName,
-                        Blockdev:   blockdev,
-                        Readonly:   readonly,
-                        PidFile:    pidFile,
-                    })
-                    err = os.WriteFile(stateFile, stateJSON, 0600)
-                    if (err == nil)
-                    {
-                        return blockdev, vdpaId, nil
-                    }
-                }
-            }
-        }
-        killErr := killByPidFile(pidFile)
-        if (killErr != nil)
-        {
-            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
-        }
-        os.Remove(stateFile)
-        os.Remove(pidFile)
-    }
-    return "", "", err
-}
-
-func unmapVduse(stateDir, devicePath string)
-{
-    if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
-    {
-        klog.Errorf("%s does not start with /dev/v", devicePath)
-        return
-    }
-    vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
-    if (err != nil)
-    {
-        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
-        return
-    }
-    vdpaId := ""
-    p := strings.Index(vduseDev, "/vduse/")
-    if (p >= 0)
-    {
-        vduseDev = vduseDev[p+7:]
-        p = strings.Index(vduseDev, "/")
-        if (p >= 0)
-        {
-            vdpaId = vduseDev[0:p]
-        }
-    }
-    if (vdpaId == "")
-    {
-        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
-        return
-    }
-    unmapVduseById(stateDir, vdpaId)
-}
-
-func unmapVduseById(stateDir, vdpaId string)
-{
-    _, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
-    if (err != nil)
-    {
-        klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
-    }
-    else
-    {
-        _, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
-    }
-    stateFile := stateDir + vdpaId + ".json"
-    os.Remove(stateFile)
-    pidFile := stateDir + vdpaId + ".pid"
-    _, err = os.Stat(pidFile)
-    if (os.IsNotExist(err))
-    {
-        // ok, already killed
-    }
-    else if (err != nil)
-    {
-        klog.Errorf("Failed to stat %v: %v", pidFile, err)
-        return
-    }
-    else
-    {
-        err = killByPidFile(pidFile)
-        if (err != nil)
-        {
-            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
-        }
-        os.Remove(pidFile)
-    }
-}
--- a/debian/build-vitastor-bookworm.sh
+++ b/debian/build-vitastor-bookworm.sh
@ -3,5 +3,5 @@
 cat < vitastor.Dockerfile > ../Dockerfile
 cd ..
 mkdir -p packages
-sudo podman build --build-arg DISTRO=debian --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
+sudo podman build --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
 rm Dockerfile
--- a/debian/build-vitastor-bullseye.sh
+++ b/debian/build-vitastor-bullseye.sh
@ -3,5 +3,5 @@
 cat < vitastor.Dockerfile > ../Dockerfile
 cd ..
 mkdir -p packages
-sudo podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
+sudo podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
 rm Dockerfile
--- a/debian/build-vitastor-buster.sh
+++ b/debian/build-vitastor-buster.sh
@ -3,5 +3,5 @@
 cat < vitastor.Dockerfile > ../Dockerfile
 cd ..
 mkdir -p packages
-sudo podman build --build-arg DISTRO=debian --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
+sudo podman build --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
 rm Dockerfile
--- a/debian/build-vitastor-ubuntu-jammy.sh
+++ b/debian/build-vitastor-ubuntu-jammy.sh
@ -1,7 +0,0 @@
-#!/bin/bash
-
-cat < vitastor.Dockerfile > ../Dockerfile
-cd ..
-mkdir -p packages
-sudo podman build --build-arg DISTRO=ubuntu --build-arg REL=jammy -v `pwd`/packages:/root/packages -f Dockerfile .
-rm Dockerfile
--- a/debian/changelog
+++ b/debian/changelog
@ -1,10 +1,10 @@
-vitastor (1.6.1-1) unstable; urgency=medium
+vitastor (0.9.3-1) unstable; urgency=medium

  * Bugfixes

 -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300

-vitastor (0.7.0-1) unstable; urgency=medium
+vitastor (0.9.3-1) unstable; urgency=medium

  * Implement NFS proxy
  * Add documentation
--- a/debian/control
+++ b/debian/control
@ -2,7 +2,7 @@ Source: vitastor
 Section: admin
 Priority: optional
 Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev
+Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
 Standards-Version: 4.5.0
 Homepage: https://vitastor.io/
 Rules-Requires-Root: no
--- a/debian/libvirt.Dockerfile
+++ b/debian/libvirt.Dockerfile
@ -1,14 +1,13 @@
 # Build patched libvirt for Debian Buster or Bullseye/Sid inside a container
-# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .
+# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .

-ARG DISTRO=
 ARG REL=
-FROM $DISTRO:$REL
+FROM debian:$REL
 ARG REL=

 WORKDIR /root

-RUN if ([ "${DISTRO}" = "debian" ]) && ( [ "${REL}" = "buster" -o "${REL}" = "bullseye" ] ); then \
+RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
        echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
        echo >> /etc/apt/preferences; \
        echo 'Package: *' >> /etc/apt/preferences; \
@ -24,7 +23,7 @@ RUN apt-get -y build-dep libvirt0
 RUN apt-get -y install libglusterfs-dev
 RUN apt-get --download-only source libvirt

-ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff patches/libvirt-8.0-vitastor.diff /root
+ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff /root
 RUN set -e; \
    mkdir -p /root/packages/libvirt-$REL; \
    rm -rf /root/packages/libvirt-$REL/*; \
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@ -7,7 +7,7 @@ ARG REL=

 WORKDIR /root

-RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
+RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
        echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
        echo >> /etc/apt/preferences; \
        echo 'Package: *' >> /etc/apt/preferences; \
@ -27,35 +27,28 @@ RUN apt-get -y build-dep qemu
 RUN apt-get --download-only source qemu

 ADD patches /root/vitastor/patches
-ADD src/client/qemu_driver.c /root/qemu_driver.c
-
-#RUN set -e; \
-#    apt-get install -y wget; \
-#    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
-#    (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
-#    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
-#    apt-get update; \
-#    apt-get install -y vitastor-client vitastor-client-dev quilt
-
+ADD src/qemu_driver.c /root/vitastor/src/qemu_driver.c
 RUN set -e; \
-    dpkg -i /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
+    apt-get install -y wget; \
+    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
+    (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
+    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
    apt-get update; \
-    apt-get install -y quilt; \
+    apt-get install -y vitastor-client vitastor-client-dev quilt; \
    mkdir -p /root/packages/qemu-$REL; \
    rm -rf /root/packages/qemu-$REL/*; \
    cd /root/packages/qemu-$REL; \
    dpkg-source -x /root/qemu*.dsc; \
-    QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*?(\d+\.\d+).*!$1!'); \
+    QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*(\d+\.\d+).*!$1!'); \
    D=$(ls -d qemu*/); \
    cp /root/vitastor/patches/qemu-$QEMU_VER-vitastor.patch ./qemu-*/debian/patches; \
    echo qemu-$QEMU_VER-vitastor.patch >> $D/debian/patches/series; \
    cd /root/packages/qemu-$REL/qemu-*/; \
    quilt push -a; \
    quilt add block/vitastor.c; \
-    cp /root/qemu_driver.c block/vitastor.c; \
+    cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
    quilt refresh; \
-    V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
-    if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
+    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
    DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
    rm -rf /root/packages/qemu-$REL/qemu-*/
--- a/debian/vitastor-client.install
+++ b/debian/vitastor-client.install
@ -3,6 +3,4 @@ usr/bin/vitastor-cli
 usr/bin/vitastor-rm
 usr/bin/vitastor-nbd
 usr/bin/vitastor-nfs
-usr/bin/vitastor-kv
-usr/bin/vitastor-kv-stress
 usr/lib/*/libvitastor*.so*
--- a/debian/vitastor-mon.install
+++ b/debian/vitastor-mon.install
@ -1,3 +1,2 @@
-mon usr/lib/vitastor/mon
-mon/scripts/make-etcd usr/lib/vitastor/mon
-mon/scripts/vitastor-mon.service /lib/systemd/system
+mon usr/lib/vitastor
+mon/vitastor-mon.service /lib/systemd/system
--- a/debian/vitastor-osd.install
+++ b/debian/vitastor-osd.install
@ -1,6 +1,6 @@
 usr/bin/vitastor-osd
 usr/bin/vitastor-disk
 usr/bin/vitastor-dump-journal
-mon/scripts/vitastor-osd@.service /lib/systemd/system
-mon/scripts/vitastor.target /lib/systemd/system
-mon/scripts/90-vitastor.rules /lib/udev/rules.d
+mon/vitastor-osd@.service /lib/systemd/system
+mon/vitastor.target /lib/systemd/system
+mon/90-vitastor.rules /lib/udev/rules.d
--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@ -1,10 +1,8 @@
 # Build Vitastor packages for Debian inside a container
-# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
+# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .

-ARG DISTRO=debian
 ARG REL=
-FROM $DISTRO:$REL
-ARG DISTRO=debian
+FROM debian:$REL
 ARG REL=

 WORKDIR /root
@ -25,7 +23,7 @@ RUN apt-get update
 RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
 RUN apt-get -y build-dep fio
 RUN apt-get --download-only source fio
-RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev
+RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev

 ADD . /root/vitastor
 RUN set -e -x; \
@ -37,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.6.1; \
-    cd vitastor-1.6.1; \
+    cp -r /root/vitastor vitastor-0.9.3; \
+    cd vitastor-0.9.3; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@ -51,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.6.1.orig.tar.xz vitastor-1.6.1; \
-    cd vitastor-1.6.1; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.3.orig.tar.xz vitastor-0.9.3; \
+    cd vitastor-0.9.3; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/config.en.md
+++ b/docs/config.en.md
@ -33,7 +33,6 @@ In the future, additional configuration methods may be added:

 - [Common](config/common.en.md)
 - [Network](config/network.en.md)
- [Client](config/client.en.md)
 - [Global Disk Layout](config/layout-cluster.en.md)
 - [OSD Disk Layout](config/layout-osd.en.md)
 - [OSD Runtime Parameters](config/osd.en.md)
--- a/docs/config.ru.md
+++ b/docs/config.ru.md
@ -36,7 +36,6 @@

 - [Общие](config/common.ru.md)
 - [Сеть](config/network.ru.md)
- [Клиентский код](config/client.ru.md)
 - [Глобальные дисковые параметры](config/layout-cluster.ru.md)
 - [Дисковые параметры OSD](config/layout-osd.ru.md)
 - [Прочие параметры OSD](config/osd.ru.md)
--- a/docs/config/client.en.md
+++ b/docs/config/client.en.md
@ -1,185 +0,0 @@
-[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters
-
-----
-
-[Читать на русском](client.ru.md)
-
-# Client Parameters
-
-These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
-affect their interaction with the cluster.
-
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
-
-## client_retry_interval
-
- Type: milliseconds
- Default: 50
- Minimum: 10
- Can be changed online: yes
-
-Retry time for I/O requests failed due to inactive PGs or network
-connectivity errors.
-
-## client_eio_retry_interval
-
- Type: milliseconds
- Default: 1000
- Can be changed online: yes
-
-Retry time for I/O requests failed due to data corruption or unfinished
-EC object deletions (has_incomplete PG state). 0 disables such retries
-and clients are not blocked and just get EIO error code instead.
-
-## client_retry_enospc
-
- Type: boolean
- Default: true
- Can be changed online: yes
-
-Retry writes on out of space errors to wait until some space is freed on
-OSDs.
-
-## client_max_dirty_bytes
-
- Type: integer
- Default: 33554432
- Can be changed online: yes
-
-Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
-(not committed by fsync) data allowed by the client before forcing an
-additional fsync and committing the data. Also note that the client always
-holds a copy of uncommitted data in memory so this setting also affects
-RAM usage of clients.
-
-## client_max_dirty_ops
-
- Type: integer
- Default: 1024
- Can be changed online: yes
-
-Same as client_max_dirty_bytes, but instead of total size, limits the number
-of uncommitted write operations.
-
-## client_enable_writeback
-
- Type: boolean
- Default: false
- Can be changed online: yes
-
-This parameter enables client-side write buffering. This means that write
-requests are accumulated in memory for a short time before being sent to
-a Vitastor cluster which allows to send them in parallel and increase
-performance of some applications. Writes are buffered until client forces
-a flush with fsync() or until the amount of buffered writes exceeds the
-limit.
-
-Write buffering significantly increases performance of some applications,
-for example, CrystalDiskMark under Windows (LOL :-D), but also any other
-applications if they do writes in one of two non-optimal ways: either if
-they do a lot of small (4 kb or so) sequential writes, or if they do a lot
-of small random writes, but without any parallelism or asynchrony, and also
-without calling fsync().
-
-With write buffering enabled, you can expect around 22000 T1Q1 random write
-iops in QEMU more or less regardless of the quality of your SSDs, and this
-number is in fact bound by QEMU itself rather than Vitastor (check it
-yourself by adding a "driver=null-co" disk in QEMU). Without write
-buffering, the current record is 9900 iops, but the number is usually
-even lower with non-ideal hardware, for example, it may be 5000 iops.
-
-Even when this parameter is enabled, write buffering isn't enabled until
-the client explicitly allows it, because enabling it without the client
-being aware of the fact that his writes may be buffered may lead to data
-loss. Because of this, older versions of clients don't support write
-buffering at all, newer versions of the QEMU driver allow write buffering
-only if it's enabled in disk settings with `-blockdev cache.direct=false`,
-and newer versions of FIO only allow write buffering if you don't specify
-`-direct=1`. NBD and NFS drivers allow write buffering by default.
-
-You can overcome this restriction too with the `client_writeback_allowed`
-parameter, but you shouldn't do that unless you **really** know what you
-are doing.
-
-## client_max_buffered_bytes
-
- Type: integer
- Default: 33554432
- Can be changed online: yes
-
-Maximum total size of buffered writes which triggers write-back when reached.
-
-## client_max_buffered_ops
-
- Type: integer
- Default: 1024
- Can be changed online: yes
-
-Maximum number of buffered writes which triggers write-back when reached.
-Multiple consecutive modified data regions are counted as 1 write here.
-
-## client_max_writeback_iodepth
-
- Type: integer
- Default: 256
- Can be changed online: yes
-
-Maximum number of parallel writes when flushing buffered data to the server.
-
-## nbd_timeout
-
- Type: seconds
- Default: 300
-
-Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
-executes for longer than this timeout, including when your cluster is just
-temporarily down for more than timeout, the NBD device will detach by itself
-(and possibly break the mounted file system).
-
-You can set timeout to 0 to never detach, but in that case you won't be
-able to remove the kernel device at all if the NBD process dies - you'll have
-to reboot the host.
-
-## nbd_max_devices
-
- Type: integer
- Default: 64
-
-Maximum number of NBD devices in the system. This value is passed as
-`nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-
-## nbd_max_part
-
- Type: integer
- Default: 3
-
-Maximum number of partitions per NBD device. This value is passed as
-`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-Note that (nbds_max)*(1+max_part) usually can't exceed 256.
-
-## osd_nearfull_ratio
-
- Type: number
- Default: 0.95
- Can be changed online: yes
-
-Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
-
-Remember that some client writes may hang or complete with an error if even
-just one OSD becomes 100 % full!
-
-However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
-unable to start at all), so you'll be able to recover from "out of space" errors
-without destroying and recreating OSDs.
--- a/docs/config/client.ru.md
+++ b/docs/config/client.ru.md
@ -1,188 +0,0 @@
-[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода
-
-----
-
-[Read in English](client.en.md)
-
-# Параметры клиентского кода
-
-Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
-затрагивают логику их работы с кластером.
-
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
-
-## client_retry_interval
-
- Тип: миллисекунды
- Значение по умолчанию: 50
- Минимальное значение: 10
- Можно менять на лету: да
-
-Время повтора запросов ввода-вывода, неудачных из-за неактивных PG или
-ошибок сети.
-
-## client_eio_retry_interval
-
- Тип: миллисекунды
- Значение по умолчанию: 1000
- Можно менять на лету: да
-
-Время повтора запросов ввода-вывода, неудачных из-за повреждения данных
-или незавершённых удалений EC-объектов (состояния PG has_incomplete).
-0 отключает повторы таких запросов и клиенты не блокируются, а вместо
-этого просто получают код ошибки EIO.
-
-## client_retry_enospc
-
- Тип: булево (да/нет)
- Значение по умолчанию: true
- Можно менять на лету: да
-
-Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
-ожидать, пока на OSD не освободится место.
-
-## client_max_dirty_bytes
-
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
-
-При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
-зафиксированных fsync-ом) данных, при достижении которого клиент будет
-принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-что в этом случае до момента fsync клиент хранит копию незафиксированных
-данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
-
-## client_max_dirty_ops
-
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
-
-Аналогично client_max_dirty_bytes, но ограничивает количество
-незафиксированных операций записи вместо их общего объёма.
-
-## client_enable_writeback
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
-
-Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
-означает, что операции записи отправляются на кластер Vitastor не сразу, а
-могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
-до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
-пока клиент не вызовет fsync.
-
-Буферизация значительно повышает производительность некоторых приложений,
-например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
-которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
-(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
-есть, например, отправляя 128 операций записи в разные места диска, но не
-все сразу с помощью асинхронного I/O, а по одной.
-
-В QEMU с буферизацией записи можно ожидать показателя примерно 22000
-операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
-без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
-цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
-в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
-в секунду.
-
-При этом, даже если данный параметр включён, буферизация не включается, если
-явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
-буферизуются, это может приводить к потере данных. Поэтому в старых версиях
-клиентских драйверов буферизация записи не включается вообще, в новых
-версиях QEMU-драйвера включается, только если разрешена опцией диска
-`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
-В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
-
-Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
-но делать так не надо, если только вы не уверены в том, что делаете, на все
-100%. :-)
-
-## client_max_buffered_bytes
-
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
-
-Максимальный общий размер буферизованных записей, при достижении которого
-начинается процесс сброса данных на сервер.
-
-## client_max_buffered_ops
-
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
-
-Максимальное количество буферизованных записей, при достижении которого
-начинается процесс сброса данных на сервер. При этом несколько
-последовательных изменённых областей здесь считаются 1 записью.
-
-## client_max_writeback_iodepth
-
- Тип: целое число
- Значение по умолчанию: 256
- Можно менять на лету: да
-
-Максимальное число параллельных операций записи при сбросе буферов на сервер.
-
-## nbd_timeout
-
- Тип: секунды
- Значение по умолчанию: 300
-
-Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
-операция выполняется дольше таймаута, включая временную недоступность
-кластера на время, большее таймаута, NBD-устройство отключится само собой
-(и, возможно, сломает примонтированную ФС).
-
-Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
-таймауту, но в этом случае вы вообще не сможете удалить устройство, если
-процесс NBD умрёт - вам придётся перезагружать сервер.
-
-## nbd_max_devices
-
- Тип: целое число
- Значение по умолчанию: 64
-
-Максимальное число NBD-устройств в системе. Данное значение передаётся
-модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
-
-## nbd_max_part
-
- Тип: целое число
- Значение по умолчанию: 3
-
-Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
-модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
-Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
-
-## osd_nearfull_ratio
-
- Тип: число
- Значение по умолчанию: 0.95
- Можно менять на лету: да
-
-Доля занятого места на OSD, начиная с которой он считается "почти заполненным" в
-выводе vitastor-cli status.
-
-Помните, что часть клиентских запросов может зависнуть или завершиться с ошибкой,
-если на 100 % заполнится хотя бы 1 OSD!
-
-Однако, в отличие от Ceph, заполненные на 100 % OSD Vitastor не падают (в Ceph
-заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
-восстановить работу кластера после ошибок отсутствия свободного места
-без уничтожения и пересоздания OSD.
--- a/docs/config/layout-cluster.en.md
+++ b/docs/config/layout-cluster.en.md
@ -96,9 +96,8 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
 it (they have internal SSD cache even though it's not stated in datasheets).

 Setting this parameter to "all" or "small" in OSD parameters requires enabling
-[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
-[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
-"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
+disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
+enabling disable_data_fsync.

 TLDR: For optimal performance, set immediate_commit to "all" if you only use
 SSDs with supercapacitor-based power loss protection (nonvolatile
--- a/docs/config/layout-cluster.ru.md
+++ b/docs/config/layout-cluster.ru.md
@ -103,9 +103,8 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
 указано в спецификациях).

 Указание "all" или "small" в настройках / командной строке OSD требует
-включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
-[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
-также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
+включения disable_journal_fsync и disable_meta_fsync, значение "all" также
+требует включения disable_data_fsync.

 Итого, вкратце: для оптимальной производительности установите
 immediate_commit в значение "all", если вы используете в кластере только SSD
--- a/docs/config/layout-osd.en.md
+++ b/docs/config/layout-osd.en.md
@ -197,22 +197,21 @@ Must be equal or a multiple of [bitmap_granularity](layout-cluster.en.md#bitmap_

 Checksums increase metadata size by 4 bytes per each csum_block_size of data.

-Checksums are always a tradeoff:
+Checksums are always a compromise:
 1. You either sacrifice +1 GB RAM per 1 TB of data
 2. Or you raise csum_block_size, for example, to 32k and sacrifice
   50% random write iops due to checksum read-modify-write
 3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
   sacrifice 50% random read iops due to checksum reads

-All-flash clusters usually have enough RAM to use default csum_block_size,
-which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.
+Option 1 (default) is recommended for all-flash setups because these usually
+have enough RAM.

-Thus, recommended setups are:
-1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
-2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
-3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
-4. HDD-only, faster random read: csum_block_size=32k
-5. HDD-only, faster random write: csum_block_size=4k +
-   inmemory_metadata=false + meta_io=cached
+Option 2 is recommended for HDD-only setups. HDD-only setups usually do NOT
+have enough RAM for the default 4 KB csum_block_size.

-See also [meta_io](osd.en.md#meta_io).
+Option 3 is recommended for SSD+HDD setups (because metadata SSDs will handle
+extra reads without any performance drop) and also *maybe* for NVMe all-flash
+setups when you don't have enough RAM (because NVMe drives have plenty
+of read iops to spare). You may also consider enabling
+[cached_read_meta](osd.en.md#cached_read_meta) in this case.
--- a/docs/config/layout-osd.ru.md
+++ b/docs/config/layout-osd.ru.md
@ -220,12 +220,17 @@ csum_block_size данных.
   жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
   с диска

-Таким образом, рекомендуются следующие варианты настроек:
-1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
-2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
-3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
-4. Только HDD, быстрее случайное чтение: csum_block_size=32k
-5. Только HDD, быстрее случайная запись: csum_block_size=4k +
-   inmemory_metadata=false + meta_io=cached
+Вариант 1 (при настройках по умолчанию) рекомендуется для SSD (All-Flash)
+кластеров, потому что памяти в них обычно хватает.

-Смотрите также [meta_io](osd.ru.md#meta_io).
+Вариант 2 рекомендуется для кластеров на одних жёстких дисках (без SSD
+под метаданные). На 4 кб блок контрольной суммы памяти в таких кластерах
+обычно НЕ хватает.
+
+Вариант 3 рекомендуется для гибридных кластеров (SSD+HDD), потому что
+скорости SSD под метаданными хватит, чтобы обработать дополнительные чтения
+без снижения производительности. Также вариант 3 *может* рекомендоваться
+для All-Flash кластеров на основе NVMe-дисков, когда памяти НЕ достаточно,
+потому что NVMe-диски имеют огромный запас производительности по чтению.
+В таких случаях, возможно, также имеет смысл включать параметр
+[cached_read_meta](osd.ru.md#cached_read_meta).
--- a/docs/config/monitor.en.md
+++ b/docs/config/monitor.en.md
@ -15,13 +15,12 @@ These parameters only apply to Monitors.
 - [mon_stats_timeout](#mon_stats_timeout)
 - [osd_out_time](#osd_out_time)
 - [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)

 ## etcd_mon_ttl

 - Type: seconds
- Default: 1
- Minimum: 5
+- Default: 30
+- Minimum: 10

 Monitor etcd lease refresh interval in seconds

@ -78,11 +77,3 @@ values.  Smaller priority means higher level in tree. For example,
 levels are always predefined and can't be removed. If one of them is not
 present in the configuration, then it is defined with the default priority
 (100 for "host", 101 for "osd").
-
-## use_old_pg_combinator
-
- Type: boolean
- Default: false
-
-Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
-and [raw_placement](pool.en.md#raw_placement) for pools which don't use this features.
--- a/docs/config/monitor.ru.md
+++ b/docs/config/monitor.ru.md
@ -15,13 +15,12 @@
 - [mon_stats_timeout](#mon_stats_timeout)
 - [osd_out_time](#osd_out_time)
 - [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)

 ## etcd_mon_ttl

 - Тип: секунды
- Значение по умолчанию: 1
- Минимальное значение: 5
+- Значение по умолчанию: 30
+- Минимальное значение: 10

 Интервал обновления etcd резервации (lease) монитором

@ -79,11 +78,3 @@ OSD перед обновлением агрегированной статис
 "host" и "osd" являются предопределёнными и не могут быть удалены. Если
 один из них отсутствует в конфигурации, он доопределяется с приоритетом по
 умолчанию (100 для уровня "host", 101 для "osd").
-
-## use_old_pg_combinator
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
-
-Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
-и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@ -20,16 +20,17 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
 - [osd_ping_timeout](#osd_ping_timeout)
+- [up_wait_retry_interval](#up_wait_retry_interval)
 - [max_etcd_attempts](#max_etcd_attempts)
 - [etcd_quick_timeout](#etcd_quick_timeout)
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
+- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
+- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@ -68,14 +69,11 @@ but they are not connected to the cluster.
 - Type: string

 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
-ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
-
-Versions up to Vitastor 1.2.0 required ODP which is only present in
-Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
-
-Run `ibv_devinfo -v` as root to list available RDMA devices and their
-features.
+"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
+Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
+to work. For example, Mellanox ConnectX-3 and older adapters don't have
+Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
+root to list available RDMA devices and their features.

 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@ -150,28 +148,6 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.

-## rdma_odp
-
- Type: boolean
- Default: false
-
-Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
-ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
-for RDMA adapter to be able to use it. This, in turn, allows to skip memory
-copying during sending. One would think this should improve performance, but
-**in reality** RDMA performance with ODP is **drastically** worse. Example
-3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
-without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
-
-This happens because Mellanox ODP implementation seems to be based on
-message retransmissions when the adapter doesn't know about the buffer yet -
-it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
-which is generally slow in RDMA/RoCE networks. Here's a presentation about
-it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-ODP support is retained in the code just in case a good ODP implementation
-appears one day.
-
 ## peer_connect_interval

 - Type: seconds
@ -211,6 +187,17 @@ Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
 within this time, the connection to it is dropped and a reconnection attempt
 is scheduled.

+## up_wait_retry_interval
+
+- Type: milliseconds
+- Default: 500
+- Minimum: 50
+- Can be changed online: yes
+
+OSDs respond to clients with a special error code when they receive I/O
+requests for a PG that's not synchronized and started. This parameter sets
+the time for the clients to wait before re-attempting such I/O requests.
+
 ## max_etcd_attempts

 - Type: integer
@ -245,11 +232,25 @@ Timeout for etcd requests which are allowed to wait for some time.
 Timeout for etcd connection HTTP Keep-Alive. Should be higher than
 etcd_report_interval to guarantee that keepalive actually works.

-## etcd_ws_keepalive_interval
+## etcd_ws_keepalive_timeout

 - Type: seconds
- Default: 5
+- Default: 30
 - Can be changed online: yes

 etcd websocket ping interval required to keep the connection alive and
 detect disconnections quickly.
+
+## client_dirty_limit
+
+- Type: integer
+- Default: 33554432
+- Can be changed online: yes
+
+Without immediate_commit=all this parameter sets the limit of "dirty"
+(not committed by fsync) data allowed by the client before forcing an
+additional fsync and committing the data. Also note that the client always
+holds a copy of uncommitted data in memory so this setting also affects
+RAM usage of clients.
+
+This parameter doesn't affect OSDs themselves.
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@ -20,16 +20,17 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)
 - [osd_ping_timeout](#osd_ping_timeout)
+- [up_wait_retry_interval](#up_wait_retry_interval)
 - [max_etcd_attempts](#max_etcd_attempts)
 - [etcd_quick_timeout](#etcd_quick_timeout)
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
+- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
+- [client_dirty_limit](#client_dirty_limit)

 ## tcp_header_buffer_size

@ -71,15 +72,12 @@ RDMA может быть нужно только если у клиентов е
 - Тип: строка

 Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
-нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
-картами производства не Mellanox.
-
-Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
-на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
-
-Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
-список доступных RDMA-устройств, их параметры и возможности.
+Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
+Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
+адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
+потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
+суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
+параметры и возможности.

 Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
 правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@ -158,29 +156,6 @@ OSD в любом случае согласовывают реальное зн
 Не влияет на потребление памяти - дополнительная память на операции отправки
 не выделяется.

-## rdma_odp
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
-
-Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
-исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
-не регистрировать память для её использования RDMA-картой. Благодаря этому
-можно не копировать данные при отправке их в сеть и, казалось бы, это должно
-улучшать производительность - но **по факту** получается так, что
-производительность только ухудшается, причём сильно. Пример - на 3-узловом
-кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
-удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
-
-Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
-основана на повторной передаче сообщений, когда карте не известен буфер -
-вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
-А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
-Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-Возможность использования ODP сохранена в коде на случай, если вдруг в один
-прекрасный день появится хорошая реализация ODP.
-
 ## peer_connect_interval

 - Тип: секунды
@ -220,6 +195,19 @@ OSD в любом случае согласовывают реальное зн
 Если OSD не отвечает за это время, соединение отключается и производится
 повторная попытка соединения.

+## up_wait_retry_interval
+
+- Тип: миллисекунды
+- Значение по умолчанию: 500
+- Минимальное значение: 50
+- Можно менять на лету: да
+
+Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
+поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
+они отвечают клиентам специальным кодом ошибки, означающим, что клиент
+должен некоторое время подождать перед повторением запроса. Именно это время
+ожидания задаёт данный параметр.
+
 ## max_etcd_attempts

 - Тип: целое число
@ -256,10 +244,24 @@ OSD в любом случае согласовывают реальное зн
 Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
 etcd_report_interval, чтобы keepalive гарантированно работал.

-## etcd_ws_keepalive_interval
+## etcd_ws_keepalive_timeout

 - Тип: секунды
- Значение по умолчанию: 5
+- Значение по умолчанию: 30
 - Можно менять на лету: да

 Интервал проверки живости вебсокет-подключений к etcd.
+
+## client_dirty_limit
+
+- Тип: целое число
+- Значение по умолчанию: 33554432
+- Можно менять на лету: да
+
+При работе без immediate_commit=all - это лимит объёма "грязных" (не
+зафиксированных fsync-ом) данных, при достижении которого клиент будет
+принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+что в этом случае до момента fsync клиент хранит копию незафиксированных
+данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+
+Параметр не влияет на сами OSD.
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@ -11,7 +11,6 @@ initialization and can be changed - either with an OSD restart or, for some of
 them, even without restarting by updating configuration in etcd.

 - [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@ -19,7 +18,6 @@ them, even without restarting by updating configuration in etcd.
 - [autosync_interval](#autosync_interval)
 - [autosync_writes](#autosync_writes)
 - [recovery_queue_depth](#recovery_queue_depth)
- [recovery_sleep_us](#recovery_sleep_us)
 - [recovery_pg_switch](#recovery_pg_switch)
 - [recovery_sync_batch](#recovery_sync_batch)
 - [readonly](#readonly)
@ -33,9 +31,9 @@ them, even without restarting by updating configuration in etcd.
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
- [data_io](#data_io)
- [meta_io](#meta_io)
- [journal_io](#journal_io)
+- [cached_read_data](#cached_read_data)
+- [cached_read_meta](#cached_read_meta)
+- [cached_read_journal](#cached_read_journal)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@ -52,35 +50,17 @@ them, even without restarting by updating configuration in etcd.
 - [scrub_list_limit](#scrub_list_limit)
 - [scrub_find_best](#scrub_find_best)
 - [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
- [recovery_tune_interval](#recovery_tune_interval)
- [recovery_tune_util_low](#recovery_tune_util_low)
- [recovery_tune_util_high](#recovery_tune_util_high)
- [recovery_tune_client_util_low](#recovery_tune_client_util_low)
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

 ## etcd_report_interval

 - Type: seconds
 - Default: 5

-Interval at which OSDs report their liveness to etcd. Affects OSD lease time
+Interval at which OSDs report their state to etcd. Affects OSD lease time
 and thus the failover speed. Lease time is equal to this parameter value
 plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
 that every OSD always refreshes its lease in time.

-## etcd_stats_interval
-
- Type: seconds
- Default: 30
-
-Interval at which OSDs report their statistics to etcd. Highly affects the
-imposed load on etcd, because statistics include a key for every OSD and
-for every PG. At the same time, low statistic intervals make `vitastor-cli`
-statistics more responsive.
-
 ## run_primary

 - Type: boolean
@ -144,24 +124,12 @@ operations before issuing an fsync operation internally.
 ## recovery_queue_depth

 - Type: integer
- Default: 1
+- Default: 4
 - Can be changed online: yes

-Maximum recovery and rebalance operations initiated by each OSD in parallel.
-Note that each OSD talks to a lot of other OSDs so actual number of parallel
-recovery operations per each OSD is greater than just recovery_queue_depth.
-Increasing this parameter can speedup recovery if [auto-tuning](#recovery_tune_interval)
-allows it or if it is disabled.
-
-## recovery_sleep_us
-
- Type: microseconds
- Default: 0
- Can be changed online: yes
-
-Delay for all recovery- and rebalance- related operations. If non-zero,
-such operations are artificially slowed down to reduce the impact on
-client I/O.
+Maximum recovery operations per one primary OSD at any given moment of time.
+Currently it's the only parameter available to tune the speed of recovery
+and rebalancing, but it's planned to implement more.

 ## recovery_pg_switch

@ -290,59 +258,45 @@ is typically very small because it's sufficient to have 16-32 MB journal
 for SSD OSDs. However, in theory it's possible that you'll want to turn it
 off for hybrid (HDD+SSD) OSDs with large journals on quick devices.

-## data_io
+## cached_read_data

- Type: string
- Default: direct
+- Type: boolean
+- Default: false

-I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
-to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
+Read data through Linux page cache, i.e. use a file descriptor opened without
+O_DIRECT for data reads. May improve read performance for frequently accessed
+data if it fits in RAM. Memory in page cache is shared by all processes and
+not accounted for in OSD memory consumption.

-Choose "cached" to use Linux page cache. This may improve read performance
-for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
-decrease write performance for fast disks because page cache is an overhead
-itself.
+## cached_read_meta

-Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
-(which requires disable_data_fsync) with drives having write-back cache
-which can't be turned off, for example, Intel Optane. Also note that *some*
-desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
-disable_data_fsync unsafe even with "directsync".
+- Type: boolean
+- Default: false

-## meta_io
+Read metadata through Linux page cache. May be beneficial when checksums
+are enabled and [inmemory_metadata](#inmemory_metadata) is disabled, because
+in this case metadata blocks are read from disk to verify checksums on every
+read request and caching them may reduce this extra read load.

- Type: string
- Default: direct
-
-I/O mode for *metadata*. One of "direct", "cached" or "directsync".
-
-"cached" may improve read performance, but only under the following conditions:
-1. your drives are relatively slow (HDD, SATA SSD), and
-2. checksums are enabled, and
-3. [inmemory_metadata](#inmemory_metadata) is disabled.
-Under all these conditions, metadata blocks are read from disk on every
-read request to verify checksums and caching them may reduce this extra
-read load. Without (3) metadata is never read from the disk after starting,
-and without (2) metadata blocks are read from disk only during journal
+Absolutely pointless to enable when inmemory_metadata is enabled because all
+metadata is kept in memory anyway, and likely pointless without checksums,
+because in that case, metadata blocks are read from disk only during journal
 flushing.

-"directsync" is the same as above.
+If the same device is used for data and metadata, enabling [cached_read_data](#cached_read_data)
+also enables this parameter, unless it is turned off explicitly.

-If the same device is used for data and metadata, meta_io by default is set
-to the same value as [data_io](#data_io).
+## cached_read_journal

-## journal_io
+- Type: boolean
+- Default: false

- Type: string
- Default: direct
+Read buffered data from journal through Linux page cache. Does not make sense
+without disabling [inmemory_journal](#inmemory_journal), which, again, is
+enabled by default.

-I/O mode for *journal*. One of "direct", "cached" or "directsync".
-
-Here, "cached" may only improve read performance for recent writes and
-only if [inmemory_journal](#inmemory_journal) is turned off.
-
-If the same device is used for metadata and journal, journal_io by default
-is set to the same value as [meta_io](#meta_io).
+If the same device is used for metadata and journal, enabling [cached_read_meta](#cached_read_meta)
+also enables this parameter, unless it is turned off explicitly.

 ## journal_sector_buffer_count

@ -529,90 +483,3 @@ the variant with most available equal copies is correct. For example, if
 you have 3 replicas and 1 of them differs, this one is considered to be
 corrupted. But if there is no "best" version with more copies than all
 others have then the object is also marked as inconsistent.
-
-## recovery_tune_interval
-
- Type: seconds
- Default: 1
- Can be changed online: yes
-
-Interval at which OSD re-considers client and recovery load and automatically
-adjusts [recovery_sleep_us](#recovery_sleep_us). Recovery auto-tuning is
-disabled if recovery_tune_interval is set to 0.
-
-Auto-tuning targets utilization. Utilization is a measure of load and is
-equal to the product of iops and average latency (so it may be greater
-than 1). You set "low" and "high" client utilization thresholds and two
-corresponding target recovery utilization levels. OSD calculates desired
-recovery utilization from client utilization using linear interpolation
-and auto-tunes recovery operation delay to make actual recovery utilization
-match desired.
-
-This allows to reduce recovery/rebalance impact on client operations. It is
-of course impossible to remove it completely, but it should become adequate.
-In some tests rebalance could earlier drop client write speed from 1.5 GB/s
-to 50-100 MB/s, with default auto-tuning settings it now only reduces
-to ~1 GB/s.
-
-## recovery_tune_util_low
-
- Type: number
- Default: 0.1
- Can be changed online: yes
-
-Desired recovery/rebalance utilization when client load is high, i.e. when
-it is at or above recovery_tune_client_util_high.
-
-## recovery_tune_util_high
-
- Type: number
- Default: 1
- Can be changed online: yes
-
-Desired recovery/rebalance utilization when client load is low, i.e. when
-it is at or below recovery_tune_client_util_low.
-
-## recovery_tune_client_util_low
-
- Type: number
- Default: 0
- Can be changed online: yes
-
-Client utilization considered "low".
-
-## recovery_tune_client_util_high
-
- Type: number
- Default: 0.5
- Can be changed online: yes
-
-Client utilization considered "high".
-
-## recovery_tune_agg_interval
-
- Type: integer
- Default: 10
- Can be changed online: yes
-
-The number of last auto-tuning iterations to use for calculating the
-delay as average. Lower values result in quicker response to client
-load change, higher values result in more stable delay. Default value of 10
-is usually fine.
-
-## recovery_tune_sleep_min_us
-
- Type: microseconds
- Default: 10
- Can be changed online: yes
-
-Minimum possible value for auto-tuned recovery_sleep_us. Lower values
-are changed to 0.
-
-## recovery_tune_sleep_cutoff_us
-
- Type: microseconds
- Default: 10000000
- Can be changed online: yes
-
-Maximum possible value for auto-tuned recovery_sleep_us. Higher values
-are treated as outliers and ignored in aggregation.
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@ -12,7 +12,6 @@
 изменения конфигурации в etcd.

 - [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
 - [run_primary](#run_primary)
 - [osd_network](#osd_network)
 - [bind_address](#bind_address)
@ -20,7 +19,6 @@
 - [autosync_interval](#autosync_interval)
 - [autosync_writes](#autosync_writes)
 - [recovery_queue_depth](#recovery_queue_depth)
- [recovery_sleep_us](#recovery_sleep_us)
 - [recovery_pg_switch](#recovery_pg_switch)
 - [recovery_sync_batch](#recovery_sync_batch)
 - [readonly](#readonly)
@ -34,9 +32,9 @@
 - [max_flusher_count](#max_flusher_count)
 - [inmemory_metadata](#inmemory_metadata)
 - [inmemory_journal](#inmemory_journal)
- [data_io](#data_io)
- [meta_io](#meta_io)
- [journal_io](#journal_io)
+- [cached_read_data](#cached_read_data)
+- [cached_read_meta](#cached_read_meta)
+- [cached_read_journal](#cached_read_journal)
 - [journal_sector_buffer_count](#journal_sector_buffer_count)
 - [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
 - [throttle_small_writes](#throttle_small_writes)
@ -53,35 +51,17 @@
 - [scrub_list_limit](#scrub_list_limit)
 - [scrub_find_best](#scrub_find_best)
 - [scrub_ec_max_bruteforce](#scrub_ec_max_bruteforce)
- [recovery_tune_interval](#recovery_tune_interval)
- [recovery_tune_util_low](#recovery_tune_util_low)
- [recovery_tune_util_high](#recovery_tune_util_high)
- [recovery_tune_client_util_low](#recovery_tune_client_util_low)
- [recovery_tune_client_util_high](#recovery_tune_client_util_high)
- [recovery_tune_agg_interval](#recovery_tune_agg_interval)
- [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us)
- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us)

 ## etcd_report_interval

 - Тип: секунды
 - Значение по умолчанию: 5

-Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
-влияет на время резервации (lease) OSD и поэтому - на скорость переключения
+Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
+влияет на время резервации (lease) OSD и поэтому на скорость переключения
 при падении OSD. Время lease равняется значению этого параметра плюс
 max_etcd_attempts * etcd_quick_timeout.

-## etcd_stats_interval
-
- Тип: секунды
- Значение по умолчанию: 30
-
-Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
-создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
-каждый OSD и на каждую PG. В то же время низкий интервал делает
-статистику, печатаемую `vitastor-cli`, отзывчивей.
-
 ## run_primary

 - Тип: булево (да/нет)
@ -147,25 +127,13 @@ OSD, чтобы успевать очищать журнал - без них OSD
 ## recovery_queue_depth

 - Тип: целое число
- Значение по умолчанию: 1
+- Значение по умолчанию: 4
 - Можно менять на лету: да

-Максимальное число параллельных операций восстановления, инициируемых одним
-OSD в любой момент времени. Имейте в виду, что каждый OSD обычно работает с
-многими другими OSD, так что на практике параллелизм восстановления больше,
-чем просто recovery_queue_depth. Увеличение значения этого параметра может
-ускорить восстановление если [автотюнинг скорости](#recovery_tune_interval)
-разрешает это или если он отключён.
-
-## recovery_sleep_us
-
- Тип: микросекунды
- Значение по умолчанию: 0
- Можно менять на лету: да
-
-Delay for all recovery- and rebalance- related operations. If non-zero,
-such operations are artificially slowed down to reduce the impact on
-client I/O.
+Максимальное число операций восстановления на одном первичном OSD в любой
+момент времени. На данный момент единственный параметр, который можно менять
+для ускорения или замедления восстановления и перебалансировки данных, но
+в планах реализация других параметров.

 ## recovery_pg_switch

@ -298,62 +266,50 @@ Flusher - это микро-поток (корутина), которая коп
 параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
 журналами, расположенными на быстром по сравнению с HDD устройстве.

-## data_io
+## cached_read_data

- Тип: строка
- Значение по умолчанию: direct
+- Тип: булево (да/нет)
+- Значение по умолчанию: false

-Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
-"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
+Читать данные через системный кэш Linux (page cache), то есть, использовать
+для чтения данных файловый дескриптор, открытый без флага O_DIRECT. Может
+улучшить производительность чтения для часто используемых данных, если они
+помещаются в память. Память кэша разделяется между всеми процессами в
+системе и не учитывается в потреблении памяти процессом OSD.

-Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
-чтении и записи. Это может улучшить скорость чтения горячих данных с
-относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
-снижает производительность записи для быстрых дисков, так как кэш сам по
-себе тоже добавляет накладные расходы.
+## cached_read_meta

-Выберите "directsync", если хотите задействовать
-[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
-включенияd disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
-дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
-настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
-fsync небезопасным даже с режимом "directsync".
+- Тип: булево (да/нет)
+- Значение по умолчанию: false

-## meta_io
+Читать метаданные через системный кэш Linux. Может быть полезно, когда
+включены контрольные суммы, а параметр [inmemory_metadata](#inmemory_metadata)
+отключён, так как в этом случае блоки метаданных читаются с диска при каждом
+запросе чтения для проверки контрольных сумм и их кэширование может снизить
+дополнительную нагрузку на диск.

- Тип: строка
- Значение по умолчанию: direct
+Абсолютно бессмысленно включать данный параметр, если параметр
+inmemory_metadata включён (по умолчанию это так), и также вероятно
+бессмысленно включать его, если не включены контрольные суммы, так как в
+этом случае блоки метаданных читаются с диска только во время сброса
+журнала.

-Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
-"directsync".
+Если одно и то же устройство используется для данных и метаданных, включение
+[cached_read_data](#cached_read_data) также включает данный параметр, при
+условии, что он не отключён явным образом.

-"cached" может улучшить скорость чтения, если:
-1. у вас медленные диски (HDD, SATA SSD)
-2. контрольные суммы включены
-3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
-При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
-для проверки контрольных сумм и их кэширование может снизить дополнительную
-нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
-запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
+## cached_read_journal

-Если одно и то же устройство используется для данных и метаданных, режим
-ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
+- Тип: булево (да/нет)
+- Значение по умолчанию: false

-## journal_io
-
- Тип: строка
- Значение по умолчанию: direct
-
-Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
-"directsync".
-
-Здесь "cached" может улучшить скорость чтения только недавно записанных
-данных и только если параметр [inmemory_journal](#inmemory_journal)
-отключён.
+Читать буферизованные в журнале данные через системный кэш Linux. Не имеет
+смысла без отключения параметра [inmemory_journal](#inmemory_journal),
+который, опять же, по умолчанию включён.

 Если одно и то же устройство используется для метаданных и журнала,
-режим ввода-вывода журнала по умолчанию устанавливается равным
-[meta_io](#meta_io).
+включение [cached_read_meta](#cached_read_meta) также включает данный
+параметр, при условии, что он не отключён явным образом.

 ## journal_sector_buffer_count

@ -556,93 +512,3 @@ EC (кодов коррекции ошибок) с более, чем 1 диск
 считается некорректной. Однако, если "лучшую" версию с числом доступных
 копий большим, чем у всех других версий, найти невозможно, то объект тоже
 маркируется неконсистентным.
-
-## recovery_tune_interval
-
- Тип: секунды
- Значение по умолчанию: 1
- Можно менять на лету: да
-
-Интервал, с которым OSD пересматривает клиентскую нагрузку и нагрузку
-восстановления и автоматически подстраивает [recovery_sleep_us](#recovery_sleep_us).
-Автотюнинг (автоподстройка) отключается, если recovery_tune_interval
-устанавливается в значение 0.
-
-Автотюнинг регулирует утилизацию. Утилизация является мерой нагрузки
-и равна произведению числа операций в секунду и средней задержки
-(то есть, она может быть выше 1). Вы задаёте два уровня клиентской
-утилизации - "низкий" и "высокий" (low и high) и два соответствующих
-целевых уровня утилизации операциями восстановления. OSD рассчитывает
-желаемый уровень утилизации восстановления линейной интерполяцией от
-клиентской утилизации и подстраивает задержку операций восстановления
-так, чтобы фактическая утилизация восстановления совпадала с желаемой.
-
-Это позволяет снизить влияние восстановления и ребаланса на клиентские
-операции. Конечно, невозможно исключить такое влияние полностью, но оно
-должно становиться адекватнее. В некоторых тестах перебалансировка могла
-снижать клиентскую скорость записи с 1.5 ГБ/с до 50-100 МБ/с, а теперь, с
-настройками автотюнинга по умолчанию, она снижается только до ~1 ГБ/с.
-
-## recovery_tune_util_low
-
- Тип: число
- Значение по умолчанию: 0.1
- Можно менять на лету: да
-
-Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
-высокая, то есть, находится на уровне или выше recovery_tune_client_util_high.
-
-## recovery_tune_util_high
-
- Тип: число
- Значение по умолчанию: 1
- Можно менять на лету: да
-
-Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
-низкая, то есть, находится на уровне или ниже recovery_tune_client_util_low.
-
-## recovery_tune_client_util_low
-
- Тип: число
- Значение по умолчанию: 0
- Можно менять на лету: да
-
-Клиентская утилизация, которая считается "низкой".
-
-## recovery_tune_client_util_high
-
- Тип: число
- Значение по умолчанию: 0.5
- Можно менять на лету: да
-
-Клиентская утилизация, которая считается "высокой".
-
-## recovery_tune_agg_interval
-
- Тип: целое число
- Значение по умолчанию: 10
- Можно менять на лету: да
-
-Число последних итераций автоподстройки для расчёта задержки как среднего
-значения. Меньшие значения параметра ускоряют отклик на изменение нагрузки,
-большие значения делают задержку стабильнее. Значение по умолчанию 10
-обычно нормальное и не требует изменений.
-
-## recovery_tune_sleep_min_us
-
- Тип: микросекунды
- Значение по умолчанию: 10
- Можно менять на лету: да
-
-Минимальное возможное значение авто-подстроенного recovery_sleep_us.
-Меньшие значения заменяются на 0.
-
-## recovery_tune_sleep_cutoff_us
-
- Тип: микросекунды
- Значение по умолчанию: 10000000
- Можно менять на лету: да
-
-Максимальное возможное значение авто-подстроенного recovery_sleep_us.
-Большие значения считаются случайными выбросами и игнорируются в
-усреднении.
--- a/docs/config/pool.en.md
+++ b/docs/config/pool.en.md
@ -32,8 +32,6 @@ Parameters:
 - [pg_minsize](#pg_minsize)
 - [pg_count](#pg_count)
 - [failure_domain](#failure_domain)
- [level_placement](#level_placement)
- [raw_placement](#raw_placement)
 - [max_osd_combinations](#max_osd_combinations)
 - [block_size](#block_size)
 - [bitmap_granularity](#bitmap_granularity)
@ -43,7 +41,6 @@ Parameters:
 - [osd_tags](#osd_tags)
 - [primary_affinity_tags](#primary_affinity_tags)
 - [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)

 Examples:

@ -86,11 +83,7 @@ Parent node reference is required for intermediate tree nodes.
 Separate OSD settings are set in etc keys `/vitastor/config/osd/<number>`
 in JSON format `{"<key>":<value>}`.

-As of now, the following settings are supported:
-
- [reweight](#reweight)
- [tags](#tags)
- [noout](#noout)
+As of now, two settings are supported:

 ## reweight

@ -113,14 +106,6 @@ subsets and then use a specific subset for pool instead of all OSDs.
 For example you can mark SSD OSDs with tag "ssd" and HDD OSDs with "hdd" and
 such tags will work as device classes.

-## noout
-
- Type: boolean
- Default: false
-
-If set to true, [osd_out_time](monitor.en.md#osd_out_time) is ignored for this
-OSD and it's never removed from data distribution by the monitor.
-
 # Pool parameters

 ## name
@ -169,26 +154,6 @@ That is, if it becomes impossible to place PG data on at least (pg_minsize)
 OSDs, PG is deactivated for both read and write. So you know that a fresh
 write always goes to at least (pg_minsize) OSDs (disks).

-For example, the difference between pg_minsize 2 and 1 in a 3-way replicated
-pool (pg_size=3) is:
- If 2 hosts go down with pg_minsize=2, the pool becomes inactive and remains
-  inactive for [osd_out_time](monitor.en.md#osd_out_time) (10 minutes). After
-  this timeout, the monitor selects replacement hosts/OSDs and the pool comes
-  up and starts to heal. Therefore, if you don't have replacement OSDs, i.e.
-  if you only have 3 hosts with OSDs and 2 of them are down, the pool remains
-  inactive until you add or return at least 1 host (or change failure_domain
-  to "osd").
- If 2 hosts go down with pg_minsize=1, the pool only experiences a short
-  I/O pause until the monitor notices that OSDs are down (5-10 seconds with
-  the default [etcd_report_interval](osd.en.md#etcd_report_interval)). After
-  this pause, I/O resumes, but new data is temporarily written in only 1 copy.
-  Then, after osd_out_time, the monitor also selects replacement OSDs and the
-  pool starts to heal.
-
-So, pg_minsize regulates the number of failures that a pool can tolerate
-without temporary downtime for [osd_out_time](monitor.en.md#osd_out_time),
-but at a cost of slightly reduced storage reliability.
-
 FIXME: pg_minsize behaviour may be changed in the future to only make PGs
 read-only instead of deactivating them.

@ -200,8 +165,8 @@ read-only instead of deactivating them.
 Number of PGs for this pool. The value should be big enough for the monitor /
 LP solver to be able to optimize data placement.

-"Enough" is usually around 10-100 PGs per OSD, i.e. you set pg_count for pool
-to (total OSD count * 10 / pg_size). You can round it to the closest power of 2,
+"Enough" is usually around 64-128 PGs per OSD, i.e. you set pg_count for pool
+to (total OSD count * 100 / pg_size). You can round it to the closest power of 2,
 because it makes it easier to reduce or increase PG count later by dividing or
 multiplying it by 2.

@ -223,69 +188,6 @@ never put on OSDs in the same failure domain (for example, on the same host).
 So failure domain specifies the unit which failure you are protecting yourself
 from.

-## level_placement
-
- Type: string
-
-Additional failure domain rules, applied in conjuction with failure_domain.
-Must be specified in the following form:
-
-`<placement level>=<sequence of characters>, <level2>=<sequence2>, ...`
-
-Sequence should be exactly [pg_size](#pg_size) character long. Each character
-corresponds to an OSD in the PG of this pool. Equal characters mean that
-corresponding items of the PG should be placed into the same placement tree
-item at this level. Different characters mean that items should be placed into
-different items.
-
-For example, if you want a EC 4+2 pool and you want every 2 chunks to be stored
-in its own datacenter and you also want each chunk to be stored on a different
-host, you should set `level_placement` to `dc=112233 host=123456`.
-
-Or you can set `level_placement` to `dc=112233` and leave `failure_domain` empty,
-because `host` is the default `failure_domain` and it will be applied anyway.
-
-Without this rule, it may happen that 3 chunks will be stored on OSDs in the
-same datacenter, and the data will become inaccessibly if that datacenter goes
-down in this case.
-
-Of course, you should group your hosts into datacenters before applying the rule
-by setting [placement_levels](monitor.en.md#placement_levels) to something like
-`{"dc":90,"host":100,"osd":110}` and add DCs to [node_placement](#placement-tree),
-like `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
-
-## raw_placement
-
- Type: string
-
-Raw PG placement rules, specified in the form of a DSL (domain-specific language).
-Use only if you really know what you're doing :)
-
-DSL specification:
-
-```
-dsl := item | item ("\n" | ",") items
-item := "any" | rules
-rules := rule | rule rules
-rule := level operator arg
-level := /\w+/
-operator := "!=" | "=" | ">" | "?="
-arg := value | "(" values ")"
-values := value | value "," values
-value := item_ref | constant_id
-item_ref := /\d+/
-constant_id := /"([^"]+)"/
-```
-
-"?=" operator means "preferred". I.e. `dc ?= "meow"` means "prefer datacenter meow
-for this chunk, but put into another dc if it's unavailable".
-
-Examples:
-
- Simple 3 replicas with failure_domain=host: `any, host!=1, host!=(1,2)`
- EC 4+2 in 3 DC: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
- 1 replica in fixed DC + 2 in random DCs: `dc?=meow, dc!=1, dc!=(1,2)`
-
 ## max_osd_combinations

 - Type: integer
@ -303,8 +205,9 @@ This parameter usually doesn't require to be changed.
 - Default: 131072

 Block size for this pool. The value from /vitastor/config/global is used when
-unspecified. Only OSDs with matching block_size are used for each pool. If you
-want to further restrict OSDs for the pool, use [osd_tags](#osd_tags).
+unspecified. If your cluster has OSDs with different block sizes then pool must
+be restricted by [osd_tags](#osd_tags) to only include OSDs with matching block
+size.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).

@ -313,9 +216,10 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Type: integer
 - Default: 4096

-"Sector" size of virtual disks in this pool. The value from /vitastor/config/global
-is used when unspecified. Similarly to block_size, only OSDs with matching
-bitmap_granularity are used for each pool.
+"Sector" size of virtual disks in this pool. The value from
+/vitastor/config/global is used when unspecified. Similar to block_size, the
+pool must be restricted by [osd_tags](#osd_tags) to only include OSDs with
+matching bitmap_granularity.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#bitmap_granularity).

@ -325,11 +229,10 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
 - Default: none

 Immediate commit setting for this pool. The value from /vitastor/config/global
-is used when unspecified. Similarly to block_size, only OSDs with compatible
-bitmap_granularity are used for each pool. "Compatible" means that a pool with
-non-immediate commit will use OSDs with immediate commit enabled, but not vice
-versa. I.e., pools with "none" use all OSDs, pools with "small" only use OSDs
-with "all" or "small", and pools with "all" only use OSDs with "all".
+is used when unspecified. Similar to block_size, the pool must be restricted by
+[osd_tags](#osd_tags) to only include OSDs with compatible immediate_commit.
+Compatible means that a pool with non-immediate commit will work with OSDs with
+immediate commit enabled, but not vice versa.

 Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#immediate_commit).

@ -377,25 +280,6 @@ of the OSDs containing a data chunk for a PG.
 Automatic scrubbing interval for this pool. Overrides
 [global scrub_interval setting](osd.en.md#scrub_interval).

-## used_for_fs
-
- Type: string
-
-If non-empty, the pool is marked as used for VitastorFS with metadata stored
-in block image (regular Vitastor volume) named as the value of this pool parameter.
-
-When a pool is marked as used for VitastorFS, regular block volume creation in it
-is disabled (vitastor-cli refuses to create images without --force) to protect
-the user from block volume and FS file ID collisions and data loss.
-
-[vitastor-nfs](../usage/nfs.ru.md), in its turn, refuses to use pools not marked
-for the corresponding FS when starting. This also implies that you can use one
-pool only for one VitastorFS.
-
-The second thing that is disabled for VitastorFS pools is reporting per-inode space
-usage statistics in etcd because a FS pool may store a very large number of files
-and statistics for them all would take a lot of space in etcd.
-
 # Examples

 ## Replicated pool
--- a/docs/config/pool.ru.md
+++ b/docs/config/pool.ru.md
@ -31,8 +31,6 @@
 - [pg_minsize](#pg_minsize)
 - [pg_count](#pg_count)
 - [failure_domain](#failure_domain)
- [level_placement](#level_placement)
- [raw_placement](#raw_placement)
 - [max_osd_combinations](#max_osd_combinations)
 - [block_size](#block_size)
 - [bitmap_granularity](#bitmap_granularity)
@ -42,7 +40,6 @@
 - [osd_tags](#osd_tags)
 - [primary_affinity_tags](#primary_affinity_tags)
 - [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)

 Примеры:

@ -85,11 +82,10 @@
 Настройки отдельных OSD задаются в ключах etcd `/vitastor/config/osd/<number>`
 в JSON-формате `{"<key>":<value>}`.

-На данный момент поддерживаются следующие настройки:
+На данный момент поддерживаются две настройки:

 - [reweight](#reweight)
 - [tags](#tags)
- [noout](#noout)

 ## reweight

@ -113,14 +109,6 @@
 всех. Можно, например, пометить SSD OSD тегом "ssd", а HDD тегом "hdd", в
 этом смысле теги работают аналогично классам устройств.

-## noout
-
- Тип: булево (да/нет)
- Значение по умолчанию: false
-
-Если установлено в true, то [osd_out_time](monitor.ru.md#osd_out_time) для этого
-OSD игнорируется и OSD не удаляется из распределения данных монитором.
-
 # Параметры

 ## name
@ -169,26 +157,6 @@ OSD игнорируется и OSD не удаляется из распред
 OSD, PG деактивируется на чтение и запись. Иными словами, всегда известно,
 что новые блоки данных всегда записываются как минимум на pg_minsize дисков.

-Для примера, разница между pg_minsize 2 и 1 в реплицированном пуле с 3 копиями
-данных (pg_size=3), проявляется следующим образом:
- Если 2 сервера отключаются при pg_minsize=2, пул становится неактивным и
-  остаётся неактивным в течение [osd_out_time](monitor.ru.md#osd_out_time)
-  (10 минут), после чего монитор назначает другие OSD/серверы на замену, пул
-  поднимается и начинает восстанавливать недостающие копии данных. Соответственно,
-  если OSD на замену нет - то есть, если у вас всего 3 сервера с OSD и 2 из них
-  недоступны - пул так и остаётся недоступным до тех пор, пока вы не вернёте
-  или не добавите хотя бы 1 сервер (или не переключите failure_domain на "osd").
- Если 2 сервера отключаются при pg_minsize=1, ввод-вывод лишь приостанавливается
-  на короткое время, до тех пор, пока монитор не поймёт, что OSD отключены
-  (что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.ru.md#etcd_report_interval)).
-  После этого ввод-вывод восстанавливается, но новые данные временно пишутся
-  всего в 1 копии. Когда же проходит osd_out_time, монитор точно так же назначает
-  другие OSD на замену выбывшим и пул начинает восстанавливать копии данных.
-
-То есть, pg_minsize регулирует число отказов, которые пул может пережить без
-временной остановки обслуживания на [osd_out_time](monitor.ru.md#osd_out_time),
-но ценой немного пониженных гарантий надёжности.
-
 FIXME: Поведение pg_minsize может быть изменено в будущем с полной деактивации
 PG на перевод их в режим только для чтения.

@ -200,8 +168,8 @@ PG на перевод их в режим только для чтения.
 Число PG для данного пула. Число должно быть достаточно большим, чтобы монитор
 мог равномерно распределить по ним данные.

-Обычно это означает примерно 10-100 PG на 1 OSD, т.е. pg_count можно устанавливать
-равным (общему числу OSD * 10 / pg_size). Значение можно округлить до ближайшей
+Обычно это означает примерно 64-128 PG на 1 OSD, т.е. pg_count можно устанавливать
+равным (общему числу OSD * 100 / pg_size). Значение можно округлить до ближайшей
 степени 2, чтобы потом было легче уменьшать или увеличивать число PG, умножая
 или деля его на 2.

@ -222,71 +190,6 @@ PG в Vitastor эферемерны, то есть вы можете менят
 Иными словами, домен отказа - это то, от отказа чего вы защищаете себя избыточным
 хранением.

-## level_placement
-
- Тип: строка
-
-Правила дополнительных доменов отказа, применяемые вместе с failure_domain.
-Должны задаваться в следующем виде:
-
-`<уровень>=<последовательность символов>, <уровень2>=<последовательность2>, ...`
-
-Каждая `<последовательность>` должна состоять ровно из [pg_size](#pg_size) символов.
-Каждый символ соответствует одному OSD (размещению одной части PG) этого пула.
-Одинаковые символы означают, что соответствующие части размещаются в один и тот же
-узел дерева OSD на заданном `<уровне>`. Разные символы означают, что части
-размещаются в разные узлы.
-
-Например, если вы хотите сделать пул EC 4+2 и хотите поместить каждые 2 части
-данных в свой датацентр, и также вы хотите, чтобы каждая часть размещалась на
-другом хосте, то вы должны задать `level_placement` равным `dc=112233 host=123456`.
-
-Либо вы просто можете задать `level_placement` равным `dc=112233` и оставить
-`failure_domain` пустым, т.к. `host` это его значение по умолчанию и оно также
-применится автоматически.
-
-Без этого правила может получиться так, что в одном из датацентров окажется
-3 части данных одной PG и данные окажутся недоступными при временном отключении
-этого датацентра.
-
-Естественно, перед установкой правила вам нужно сгруппировать ваши хосты в
-датацентры, установив [placement_levels](monitor.ru.md#placement_levels) во что-то
-типа `{"dc":90,"host":100,"osd":110}` и добавив датацентры в [node_placement](#дерево-размещения),
-примерно так: `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
-
-## raw_placement
-
- Type: string
-
-Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
-Используйте, только если действительно знаете, зачем вам это надо :)
-
-Спецификация DSL:
-
-```
-dsl := item | item ("\n" | ",") items
-item := "any" | rules
-rules := rule | rule rules
-rule := level operator arg
-level := /\w+/
-operator := "!=" | "=" | ">" | "?="
-arg := value | "(" values ")"
-values := value | value "," values
-value := item_ref | constant_id
-item_ref := /\d+/
-constant_id := /"([^"]+)"/
-```
-
-Оператор "?=" означает "предпочитаемый". Т.е. `dc ?= "meow"` означает "предпочитать
-датацентр meow для этой части данных, но разместить её в другом датацентре, если
-meow недоступен".
-
-Примеры:
-
- Простые 3 реплики с failure_domain=host: `any, host!=1, host!=(1,2)`
- EC 4+2 в 3 датацентрах: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
- 1 копия в фиксированном ДЦ + 2 в других ДЦ: `dc?=meow, dc!=1, dc!=(1,2)`
-
 ## max_osd_combinations

 - Тип: целое число
@ -305,9 +208,8 @@ meow недоступен".

 Размер блока для данного пула. Если не задан, используется значение из
 /vitastor/config/global. Если в вашем кластере есть OSD с разными размерами
-блока, пул будет использовать только OSD с размером блока, равным размеру блока
-пула. Если вы хотите сильнее ограничить набор используемых для пула OSD -
-используйте [osd_tags](#osd_tags).
+блока, пул должен быть ограничен только OSD, блок которых равен блоку пула,
+с помощью [osd_tags](#osd_tags).

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#block_size).

@ -317,8 +219,9 @@ meow недоступен".
 - По умолчанию: 4096

 Размер "сектора" виртуальных дисков в данном пуле. Если не задан, используется
-значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
-использовать только OSD с совпадающей с пулом настройкой bitmap_granularity.
+значение из /vitastor/config/global. Аналогично block_size, пул должен быть
+ограничен OSD со значением bitmap_granularity, равным значению пула, с помощью
+[osd_tags](#osd_tags).

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#bitmap_granularity).

@ -328,13 +231,11 @@ meow недоступен".
 - По умолчанию: none

 Настройка мгновенного коммита для данного пула. Если не задана, используется
-значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
-использовать только OSD с *совместимыми* настройками immediate_commit.
-"Совместимыми" означает, что пул с отключенным мгновенным коммитом будет
-использовать OSD с включённым мгновенным коммитом, но не наоборот. То есть,
-пул со значением "none" будет использовать все OSD, пул со "small" будет
-использовать OSD с "all" или "small", а пул с "all" будет использовать только
-OSD с "all".
+значение из /vitastor/config/global. Аналогично block_size, пул должен быть
+ограничен OSD со значением immediate_commit, совместимым со значением пула, с
+помощью [osd_tags](#osd_tags). Совместимость означает, что пул с отключенным
+мгновенным коммитом может работать на OSD с включённым мгновенным коммитом, но
+не наоборот.

 О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#immediate_commit).

@ -383,27 +284,6 @@ OSD с "all".
 Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
 Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).

-## used_for_fs
-
- Тип: строка
-
-Если непусто, пул помечается как используемый для файловой системы VitastorFS с
-метаданными, хранимыми в блочном образе Vitastor с именем, равным значению
-этого параметра.
-
-Когда пул помечается как используемый для VitastorFS, создание обычных блочных
-образов в нём отключается (vitastor-cli отказывается создавать образы без --force),
-чтобы защитить пользователя от коллизий ID файлов и блочных образов и, таким
-образом, от потери данных.
-
-[vitastor-nfs](../usage/nfs.ru.md), в свою очередь, при запуске отказывается
-использовать для ФС пулы, не выделенные для неё. Это также означает, что один
-пул может использоваться только для одной VitastorFS.
-
-Также для ФС-пулов отключается передача статистики в etcd по отдельным инодам,
-так как ФС-пул может содержать очень много файлов и статистика по ним всем
-заняла бы очень много места в etcd.
-
 # Примеры

 ## Реплицированный пул
--- a/docs/config/src/client.en.md
+++ b/docs/config/src/client.en.md
@ -1,4 +0,0 @@
-# Client Parameters
-
-These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
-affect their interaction with the cluster.
--- a/docs/config/src/client.ru.md
+++ b/docs/config/src/client.ru.md
@ -1,4 +0,0 @@
-# Параметры клиентского кода
-
-Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD и т.п.) и
-затрагивают логику их работы с кластером.
--- a/docs/config/src/client.yml
+++ b/docs/config/src/client.yml
@ -1,226 +0,0 @@
- name: client_retry_interval
-  type: ms
-  min: 10
-  default: 50
-  online: true
-  info: |
-    Retry time for I/O requests failed due to inactive PGs or network
-    connectivity errors.
-  info_ru: |
-    Время повтора запросов ввода-вывода, неудачных из-за неактивных PG или
-    ошибок сети.
- name: client_eio_retry_interval
-  type: ms
-  default: 1000
-  online: true
-  info: |
-    Retry time for I/O requests failed due to data corruption or unfinished
-    EC object deletions (has_incomplete PG state). 0 disables such retries
-    and clients are not blocked and just get EIO error code instead.
-  info_ru: |
-    Время повтора запросов ввода-вывода, неудачных из-за повреждения данных
-    или незавершённых удалений EC-объектов (состояния PG has_incomplete).
-    0 отключает повторы таких запросов и клиенты не блокируются, а вместо
-    этого просто получают код ошибки EIO.
- name: client_retry_enospc
-  type: bool
-  default: true
-  online: true
-  info: |
-    Retry writes on out of space errors to wait until some space is freed on
-    OSDs.
-  info_ru: |
-    Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
-    ожидать, пока на OSD не освободится место.
- name: client_max_dirty_bytes
-  type: int
-  default: 33554432
-  online: true
-  info: |
-    Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
-    (not committed by fsync) data allowed by the client before forcing an
-    additional fsync and committing the data. Also note that the client always
-    holds a copy of uncommitted data in memory so this setting also affects
-    RAM usage of clients.
-  info_ru: |
-    При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
-    зафиксированных fsync-ом) данных, при достижении которого клиент будет
-    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
-    что в этом случае до момента fsync клиент хранит копию незафиксированных
-    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
- name: client_max_dirty_ops
-  type: int
-  default: 1024
-  online: true
-  info: |
-    Same as client_max_dirty_bytes, but instead of total size, limits the number
-    of uncommitted write operations.
-  info_ru: |
-    Аналогично client_max_dirty_bytes, но ограничивает количество
-    незафиксированных операций записи вместо их общего объёма.
- name: client_enable_writeback
-  type: bool
-  default: false
-  online: true
-  info: |
-    This parameter enables client-side write buffering. This means that write
-    requests are accumulated in memory for a short time before being sent to
-    a Vitastor cluster, which allows sending them in parallel and increases
-    performance of some applications. Writes are buffered until client forces
-    a flush with fsync() or until the amount of buffered writes exceeds the
-    limit.
-
-    Write buffering significantly increases performance of some applications,
-    for example, CrystalDiskMark under Windows (LOL :-D), but also any other
-    applications if they do writes in one of two non-optimal ways: either if
-    they do a lot of small (4 kb or so) sequential writes, or if they do a lot
-    of small random writes, but without any parallelism or asynchrony, and also
-    without calling fsync().
-
-    With write buffering enabled, you can expect around 22000 T1Q1 random write
-    iops in QEMU more or less regardless of the quality of your SSDs, and this
-    number is in fact bound by QEMU itself rather than Vitastor (check it
-    yourself by adding a "driver=null-co" disk in QEMU). Without write
-    buffering, the current record is 9900 iops, but the number is usually
-    even lower with non-ideal hardware, for example, it may be 5000 iops.
-
-    Even when this parameter is enabled, write buffering isn't enabled until
-    the client explicitly allows it, because enabling it without the client
-    being aware of the fact that his writes may be buffered may lead to data
-    loss. Because of this, older versions of clients don't support write
-    buffering at all, newer versions of the QEMU driver allow write buffering
-    only if it's enabled in disk settings with `-blockdev cache.direct=false`,
-    and newer versions of FIO only allow write buffering if you don't specify
-    `-direct=1`. NBD and NFS drivers allow write buffering by default.
-
-    You can overcome this restriction too with the `client_writeback_allowed`
-    parameter, but you shouldn't do that unless you **really** know what you
-    are doing.
-  info_ru: |
-    Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
-    означает, что операции записи отправляются на кластер Vitastor не сразу, а
-    могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
-    до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
-    пока клиент не вызовет fsync.
-
-    Буферизация значительно повышает производительность некоторых приложений,
-    например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
-    которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
-    (например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
-    есть, например, отправляя 128 операций записи в разные места диска, но не
-    все сразу с помощью асинхронного I/O, а по одной.
-
-    В QEMU с буферизацией записи можно ожидать показателя примерно 22000
-    операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
-    без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
-    цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
-    в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
-    в секунду.
-
-    При этом, даже если данный параметр включён, буферизация не включается, если
-    явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
-    буферизуются, это может приводить к потере данных. Поэтому в старых версиях
-    клиентских драйверов буферизация записи не включается вообще, в новых
-    версиях QEMU-драйвера включается, только если разрешена опцией диска
-    `-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
-    В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
-
-    Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
-    но делать так не надо, если только вы не уверены в том, что делаете, на все
-    100%. :-)
- name: client_max_buffered_bytes
-  type: int
-  default: 33554432
-  online: true
-  info: |
-    Maximum total size of buffered writes which triggers write-back when reached.
-  info_ru: |
-    Максимальный общий размер буферизованных записей, при достижении которого
-    начинается процесс сброса данных на сервер.
- name: client_max_buffered_ops
-  type: int
-  default: 1024
-  online: true
-  info: |
-    Maximum number of buffered writes which triggers write-back when reached.
-    Multiple consecutive modified data regions are counted as 1 write here.
-  info_ru: |
-    Максимальное количество буферизованных записей, при достижении которого
-    начинается процесс сброса данных на сервер. При этом несколько
-    последовательных изменённых областей здесь считаются 1 записью.
- name: client_max_writeback_iodepth
-  type: int
-  default: 256
-  online: true
-  info: |
-    Maximum number of parallel writes when flushing buffered data to the server.
-  info_ru: |
-    Максимальное число параллельных операций записи при сбросе буферов на сервер.
- name: nbd_timeout
-  type: sec
-  default: 300
-  online: false
-  info: |
-    Timeout for I/O operations for [NBD](../usage/nbd.en.md). If an operation
-    executes for longer than this timeout, including when your cluster is just
-    temporarily down for more than timeout, the NBD device will detach by itself
-    (and possibly break the mounted file system).
-
-    You can set timeout to 0 to never detach, but in that case you won't be
-    able to remove the kernel device at all if the NBD process dies - you'll have
-    to reboot the host.
-  info_ru: |
-    Таймаут для операций чтения/записи через [NBD](../usage/nbd.ru.md). Если
-    операция выполняется дольше таймаута, включая временную недоступность
-    кластера на время, большее таймаута, NBD-устройство отключится само собой
-    (и, возможно, сломает примонтированную ФС).
-
-    Вы можете установить таймаут в 0, чтобы никогда не отключать устройство по
-    таймауту, но в этом случае вы вообще не сможете удалить устройство, если
-    процесс NBD умрёт - вам придётся перезагружать сервер.
- name: nbd_max_devices
-  type: int
-  default: 64
-  online: false
-  info: |
-    Maximum number of NBD devices in the system. This value is passed as
-    `nbds_max` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-  info_ru: |
-    Максимальное число NBD-устройств в системе. Данное значение передаётся
-    модулю ядра nbd как параметр `nbds_max`, когда его загружает vitastor-nbd.
- name: nbd_max_part
-  type: int
-  default: 3
-  online: false
-  info: |
-    Maximum number of partitions per NBD device. This value is passed as
-    `max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
-    Note that (nbds_max)*(1+max_part) usually can't exceed 256.
-  info_ru: |
-    Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
-    модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
-    Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
- name: osd_nearfull_ratio
-  type: float
-  default: 0.95
-  online: true
-  info: |
-    Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
-
-    Remember that some client writes may hang or complete with an error if even
-    just one OSD becomes 100 % full!
-
-    However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
-    unable to start at all), so you'll be able to recover from "out of space" errors
-    without destroying and recreating OSDs.
-  info_ru: |
-    Доля занятого места на OSD, начиная с которой он считается "почти заполненным" в
-    выводе vitastor-cli status.
-
-    Помните, что часть клиентских запросов может зависнуть или завершиться с ошибкой,
-    если на 100 % заполнится хотя бы 1 OSD!
-
-    Однако, в отличие от Ceph, заполненные на 100 % OSD Vitastor не падают (в Ceph
-    заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
-    восстановить работу кластера после ошибок отсутствия свободного места
-    без уничтожения и пересоздания OSD.
--- a/docs/config/src/included.en.md
+++ b/docs/config/src/included.en.md
@ -28,8 +28,6 @@

 {{../../config/network.en.md|indent=2}}

-{{../../config/client.en.md|indent=2}}
-
 {{../../config/layout-cluster.en.md|indent=2}}

 {{../../config/layout-osd.en.md|indent=2}}
@ -56,8 +54,6 @@

 {{../../usage/nfs.en.md}}

-{{../../usage/admin.en.md}}
-
 ## Performance

 {{../../performance/understanding.en.md}}
@ -66,6 +62,4 @@

 {{../../performance/comparison1.en.md}}

-{{../../performance/bench2.en.md}}
-
 {{../../intro/author.en.md|indent=1}}
--- a/docs/config/src/included.ru.md
+++ b/docs/config/src/included.ru.md
@ -28,8 +28,6 @@

 {{../../config/network.ru.md|indent=2}}

-{{../../config/client.ru.md|indent=2}}
-
 {{../../config/layout-cluster.ru.md|indent=2}}

 {{../../config/layout-osd.ru.md|indent=2}}
@ -56,8 +54,6 @@

 {{../../usage/nfs.ru.md}}

-{{../../usage/admin.ru.md}}
-
 ## Производительность

 {{../../performance/understanding.ru.md}}
@ -66,6 +62,4 @@

 {{../../performance/comparison1.ru.md}}

-{{../../performance/bench2.ru.md}}
-
 {{../../intro/author.ru.md|indent=1}}
--- a/docs/config/src/layout-cluster.yml
+++ b/docs/config/src/layout-cluster.yml
@ -87,9 +87,8 @@
    it (they have internal SSD cache even though it's not stated in datasheets).

    Setting this parameter to "all" or "small" in OSD parameters requires enabling
-    [disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
-    [disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
-    "all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
+    disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
+    enabling disable_data_fsync.

    TLDR: For optimal performance, set immediate_commit to "all" if you only use
    SSDs with supercapacitor-based power loss protection (nonvolatile
@ -141,9 +140,8 @@
    указано в спецификациях).

    Указание "all" или "small" в настройках / командной строке OSD требует
-    включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
-    [disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
-    также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).
+    включения disable_journal_fsync и disable_meta_fsync, значение "all" также
+    требует включения disable_data_fsync.

    Итого, вкратце: для оптимальной производительности установите
    immediate_commit в значение "all", если вы используете в кластере только SSD
--- a/docs/config/src/layout-osd.yml
+++ b/docs/config/src/layout-osd.yml
@ -228,25 +228,24 @@

    Checksums increase metadata size by 4 bytes per each csum_block_size of data.

-    Checksums are always a tradeoff:
+    Checksums are always a compromise:
    1. You either sacrifice +1 GB RAM per 1 TB of data
    2. Or you raise csum_block_size, for example, to 32k and sacrifice
       50% random write iops due to checksum read-modify-write
    3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
       sacrifice 50% random read iops due to checksum reads

-    All-flash clusters usually have enough RAM to use default csum_block_size,
-    which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.
+    Option 1 (default) is recommended for all-flash setups because these usually
+    have enough RAM.

-    Thus, recommended setups are:
-    1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
-    2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
-    3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
-    4. HDD-only, faster random read: csum_block_size=32k
-    5. HDD-only, faster random write: csum_block_size=4k +
-       inmemory_metadata=false + meta_io=cached
+    Option 2 is recommended for HDD-only setups. HDD-only setups usually do NOT
+    have enough RAM for the default 4 KB csum_block_size.

-    See also [meta_io](osd.en.md#meta_io).
+    Option 3 is recommended for SSD+HDD setups (because metadata SSDs will handle
+    extra reads without any performance drop) and also *maybe* for NVMe all-flash
+    setups when you don't have enough RAM (because NVMe drives have plenty
+    of read iops to spare). You may also consider enabling
+    [cached_read_meta](osd.en.md#cached_read_meta) in this case.
  info_ru: |
    Размер блока расчёта контрольных сумм.

@ -265,12 +264,17 @@
       жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
       с диска

-    Таким образом, рекомендуются следующие варианты настроек:
-    1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
-    2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
-    3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
-    4. Только HDD, быстрее случайное чтение: csum_block_size=32k
-    5. Только HDD, быстрее случайная запись: csum_block_size=4k +
-       inmemory_metadata=false + meta_io=cached
+    Вариант 1 (при настройках по умолчанию) рекомендуется для SSD (All-Flash)
+    кластеров, потому что памяти в них обычно хватает.

-    Смотрите также [meta_io](osd.ru.md#meta_io).
+    Вариант 2 рекомендуется для кластеров на одних жёстких дисках (без SSD
+    под метаданные). На 4 кб блок контрольной суммы памяти в таких кластерах
+    обычно НЕ хватает.
+
+    Вариант 3 рекомендуется для гибридных кластеров (SSD+HDD), потому что
+    скорости SSD под метаданными хватит, чтобы обработать дополнительные чтения
+    без снижения производительности. Также вариант 3 *может* рекомендоваться
+    для All-Flash кластеров на основе NVMe-дисков, когда памяти НЕ достаточно,
+    потому что NVMe-диски имеют огромный запас производительности по чтению.
+    В таких случаях, возможно, также имеет смысл включать параметр
+    [cached_read_meta](osd.ru.md#cached_read_meta).
--- a/docs/config/src/make.js
+++ b/docs/config/src/make.js
@ -38,7 +38,6 @@ const types = {
        bool: 'boolean',
        int: 'integer',
        sec: 'seconds',
-        float: 'number',
        ms: 'milliseconds',
        us: 'microseconds',
    },
@ -47,7 +46,6 @@ const types = {
        bool: 'булево (да/нет)',
        int: 'целое число',
        sec: 'секунды',
-        float: 'число',
        ms: 'миллисекунды',
        us: 'микросекунды',
    },
--- a/docs/config/src/monitor.yml
+++ b/docs/config/src/monitor.yml
@ -1,7 +1,7 @@
 - name: etcd_mon_ttl
  type: sec
-  min: 5
-  default: 1
+  min: 10
+  default: 30
  info: Monitor etcd lease refresh interval in seconds
  info_ru: Интервал обновления etcd резервации (lease) монитором
 - name: etcd_mon_timeout
@ -63,12 +63,3 @@
    "host" и "osd" являются предопределёнными и не могут быть удалены. Если
    один из них отсутствует в конфигурации, он доопределяется с приоритетом по
    умолчанию (100 для уровня "host", 101 для "osd").
- name: use_old_pg_combinator
-  type: bool
-  default: false
-  info: |
-    Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
-    and [raw_placement](pool.en.md#raw_placement) for pools which don't use these features.
-  info_ru: |
-    Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
-    и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@ -48,14 +48,11 @@
  type: string
  info: |
    RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Now Vitastor supports all adapters, even ones without
-    ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
-
-    Versions up to Vitastor 1.2.0 required ODP which is only present in
-    Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
-
-    Run `ibv_devinfo -v` as root to list available RDMA devices and their
-    features.
+    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
+    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
+    to work. For example, Mellanox ConnectX-3 and older adapters don't have
+    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
+    root to list available RDMA devices and their features.

    Remember that you also have to configure your network switches if you use
    RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@ -64,15 +61,12 @@
    PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
  info_ru: |
    Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
-    нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
-    картами производства не Mellanox.
-
-    Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
-    на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
-
-    Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
-    список доступных RDMA-устройств, их параметры и возможности.
+    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
+    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
+    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
+    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
+    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
+    параметры и возможности.

    Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
    правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@ -166,45 +160,6 @@
    у принимающей стороны в процессе работы не заканчивались буферы на приём.
    Не влияет на потребление памяти - дополнительная память на операции отправки
    не выделяется.
- name: rdma_odp
-  type: bool
-  default: false
-  online: false
-  info: |
-    Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
-    ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
-    for RDMA adapter to be able to use it. This, in turn, allows to skip memory
-    copying during sending. One would think this should improve performance, but
-    **in reality** RDMA performance with ODP is **drastically** worse. Example
-    3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
-    without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
-
-    This happens because Mellanox ODP implementation seems to be based on
-    message retransmissions when the adapter doesn't know about the buffer yet -
-    it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
-    which is generally slow in RDMA/RoCE networks. Here's a presentation about
-    it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-    ODP support is retained in the code just in case a good ODP implementation
-    appears one day.
-  info_ru: |
-    Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
-    исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
-    не регистрировать память для её использования RDMA-картой. Благодаря этому
-    можно не копировать данные при отправке их в сеть и, казалось бы, это должно
-    улучшать производительность - но **по факту** получается так, что
-    производительность только ухудшается, причём сильно. Пример - на 3-узловом
-    кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
-    удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
-
-    Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
-    основана на повторной передаче сообщений, когда карте не известен буфер -
-    вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
-    А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
-    Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-    Возможность использования ODP сохранена в коде на случай, если вдруг в один
-    прекрасный день появится хорошая реализация ODP.
 - name: peer_connect_interval
  type: sec
  min: 1
@ -243,6 +198,21 @@
    Максимальное время ожидания ответа на запрос проверки состояния соединения.
    Если OSD не отвечает за это время, соединение отключается и производится
    повторная попытка соединения.
+- name: up_wait_retry_interval
+  type: ms
+  min: 50
+  default: 500
+  online: true
+  info: |
+    OSDs respond to clients with a special error code when they receive I/O
+    requests for a PG that's not synchronized and started. This parameter sets
+    the time for the clients to wait before re-attempting such I/O requests.
+  info_ru: |
+    Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
+    поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
+    они отвечают клиентам специальным кодом ошибки, означающим, что клиент
+    должен некоторое время подождать перед повторением запроса. Именно это время
+    ожидания задаёт данный параметр.
 - name: max_etcd_attempts
  type: int
  default: 5
@ -280,12 +250,32 @@
  info_ru: |
    Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
    etcd_report_interval, чтобы keepalive гарантированно работал.
- name: etcd_ws_keepalive_interval
+- name: etcd_ws_keepalive_timeout
  type: sec
-  default: 5
+  default: 30
  online: true
  info: |
    etcd websocket ping interval required to keep the connection alive and
    detect disconnections quickly.
  info_ru: |
    Интервал проверки живости вебсокет-подключений к etcd.
+- name: client_dirty_limit
+  type: int
+  default: 33554432
+  online: true
+  info: |
+    Without immediate_commit=all this parameter sets the limit of "dirty"
+    (not committed by fsync) data allowed by the client before forcing an
+    additional fsync and committing the data. Also note that the client always
+    holds a copy of uncommitted data in memory so this setting also affects
+    RAM usage of clients.
+
+    This parameter doesn't affect OSDs themselves.
+  info_ru: |
+    При работе без immediate_commit=all - это лимит объёма "грязных" (не
+    зафиксированных fsync-ом) данных, при достижении которого клиент будет
+    принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
+    что в этом случае до момента fsync клиент хранит копию незафиксированных
+    данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
+
+    Параметр не влияет на сами OSD.
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@ -2,28 +2,15 @@
  type: sec
  default: 5
  info: |
-    Interval at which OSDs report their liveness to etcd. Affects OSD lease time
+    Interval at which OSDs report their state to etcd. Affects OSD lease time
    and thus the failover speed. Lease time is equal to this parameter value
    plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
    that every OSD always refreshes its lease in time.
  info_ru: |
-    Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
-    влияет на время резервации (lease) OSD и поэтому - на скорость переключения
+    Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
+    влияет на время резервации (lease) OSD и поэтому на скорость переключения
    при падении OSD. Время lease равняется значению этого параметра плюс
    max_etcd_attempts * etcd_quick_timeout.
- name: etcd_stats_interval
-  type: sec
-  default: 30
-  info: |
-    Interval at which OSDs report their statistics to etcd. Highly affects the
-    imposed load on etcd, because statistics include a key for every OSD and
-    for every PG. At the same time, low statistic intervals make `vitastor-cli`
-    statistics more responsive.
-  info_ru: |
-    Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
-    создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
-    каждый OSD и на каждую PG. В то же время низкий интервал делает
-    статистику, печатаемую `vitastor-cli`, отзывчивей.
 - name: run_primary
  type: bool
  default: true
@ -107,29 +94,17 @@
    принудительной отправкой fsync-а.
 - name: recovery_queue_depth
  type: int
-  default: 1
+  default: 4
  online: true
  info: |
-    Maximum recovery and rebalance operations initiated by each OSD in parallel.
-    Note that each OSD talks to a lot of other OSDs so actual number of parallel
-    recovery operations per each OSD is greater than just recovery_queue_depth.
-    Increasing this parameter can speedup recovery if [auto-tuning](#recovery_tune_interval)
-    allows it or if it is disabled.
+    Maximum recovery operations per one primary OSD at any given moment of time.
+    Currently it's the only parameter available to tune the speed of recovery
+    and rebalancing, but it's planned to implement more.
  info_ru: |
-    Максимальное число параллельных операций восстановления, инициируемых одним
-    OSD в любой момент времени. Имейте в виду, что каждый OSD обычно работает с
-    многими другими OSD, так что на практике параллелизм восстановления больше,
-    чем просто recovery_queue_depth. Увеличение значения этого параметра может
-    ускорить восстановление если [автотюнинг скорости](#recovery_tune_interval)
-    разрешает это или если он отключён.
- name: recovery_sleep_us
-  type: us
-  default: 0
-  online: true
-  info: |
-    Delay for all recovery- and rebalance- related operations. If non-zero,
-    such operations are artificially slowed down to reduce the impact on
-    client I/O.
+    Максимальное число операций восстановления на одном первичном OSD в любой
+    момент времени. На данный момент единственный параметр, который можно менять
+    для ускорения или замедления восстановления и перебалансировки данных, но
+    в планах реализация других параметров.
 - name: recovery_pg_switch
  type: int
  default: 128
@ -285,96 +260,70 @@
    достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
    параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
    журналами, расположенными на быстром по сравнению с HDD устройстве.
- name: data_io
-  type: string
-  default: direct
+- name: cached_read_data
+  type: bool
+  default: false
  info: |
-    I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
-    to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
-
-    Choose "cached" to use Linux page cache. This may improve read performance
-    for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
-    decrease write performance for fast disks because page cache is an overhead
-    itself.
-
-    Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
-    (which requires disable_data_fsync) with drives having write-back cache
-    which can't be turned off, for example, Intel Optane. Also note that *some*
-    desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
-    disable_data_fsync unsafe even with "directsync".
+    Read data through Linux page cache, i.e. use a file descriptor opened without
+    O_DIRECT for data reads. May improve read performance for frequently accessed
+    data if it fits in RAM. Memory in page cache is shared by all processes and
+    not accounted in OSD memory consumption.
  info_ru: |
-    Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
-    "directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
-
-    Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
-    чтении и записи. Это может улучшить скорость чтения горячих данных с
-    относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
-    снижает производительность записи для быстрых дисков, так как кэш сам по
-    себе тоже добавляет накладные расходы.
-
-    Выберите "directsync", если хотите задействовать
-    [immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
-    включения disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
-    дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
-    настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
-    fsync небезопасным даже с режимом "directsync".
- name: meta_io
-  type: string
-  default: direct
+    Читать данные через системный кэш Linux (page cache), то есть, использовать
+    для чтения данных файловый дескриптор, открытый без флага O_DIRECT. Может
+    улучшить производительность чтения для часто используемых данных, если они
+    помещаются в память. Память кэша разделяется между всеми процессами в
+    системе и не учитывается в потреблении памяти процессом OSD.
+- name: cached_read_meta
+  type: bool
+  default: false
  info: |
-    I/O mode for *metadata*. One of "direct", "cached" or "directsync".
+    Read metadata through Linux page cache. May be beneficial when checksums
+    are enabled and [inmemory_metadata](#inmemory_metadata) is disabled, because
+    in this case metadata blocks are read from disk to verify checksums on every
+    read request and caching them may reduce this extra read load.

-    "cached" may improve read performance, but only under the following conditions:
-    1. your drives are relatively slow (HDD, SATA SSD), and
-    2. checksums are enabled, and
-    3. [inmemory_metadata](#inmemory_metadata) is disabled.
-    Under all these conditions, metadata blocks are read from disk on every
-    read request to verify checksums and caching them may reduce this extra
-    read load. Without (3) metadata is never read from the disk after starting,
-    and without (2) metadata blocks are read from disk only during journal
+    Absolutely pointless to enable with enabled inmemory_metadata because all
+    metadata is kept in memory anyway, and likely pointless without checksums,
+    because in that case, metadata blocks are read from disk only during journal
    flushing.

-    "directsync" is the same as above.
-
-    If the same device is used for data and metadata, meta_io by default is set
-    to the same value as [data_io](#data_io).
+    If the same device is used for data and metadata, enabling [cached_read_data](#cached_read_data)
+    also enables this parameter, given that it isn't turned off explicitly.
  info_ru: |
-    Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
-    "directsync".
+    Читать метаданные через системный кэш Linux. Может быть полезно, когда
+    включены контрольные суммы, а параметр [inmemory_metadata](#inmemory_metadata)
+    отключён, так как в этом случае блоки метаданных читаются с диска при каждом
+    запросе чтения для проверки контрольных сумм и их кэширование может снизить
+    дополнительную нагрузку на диск.

-    "cached" может улучшить скорость чтения, если:
-    1. у вас медленные диски (HDD, SATA SSD)
-    2. контрольные суммы включены
-    3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
-    При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
-    для проверки контрольных сумм и их кэширование может снизить дополнительную
-    нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
-    запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
+    Абсолютно бессмысленно включать данный параметр, если параметр
+    inmemory_metadata включён (по умолчанию это так), и также вероятно
+    бессмысленно включать его, если не включены контрольные суммы, так как в
+    этом случае блоки метаданных читаются с диска только во время сброса
+    журнала.

-    Если одно и то же устройство используется для данных и метаданных, режим
-    ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
- name: journal_io
-  type: string
-  default: direct
+    Если одно и то же устройство используется для данных и метаданных, включение
+    [cached_read_data](#cached_read_data) также включает данный параметр, при
+    условии, что он не отключён явным образом.
+- name: cached_read_journal
+  type: bool
+  default: false
  info: |
-    I/O mode for *journal*. One of "direct", "cached" or "directsync".
+    Read buffered data from journal through Linux page cache. Does not make sense
+    without disabling [inmemory_journal](#inmemory_journal), which, again, is
+    enabled by default.

-    Here, "cached" may only improve read performance for recent writes and
-    only if [inmemory_journal](#inmemory_journal) is turned off.
-
-    If the same device is used for metadata and journal, journal_io by default
-    is set to the same value as [meta_io](#meta_io).
+    If the same device is used for metadata and journal, enabling [cached_read_meta](#cached_read_meta)
+    also enables this parameter, given that it isn't turned off explicitly.
  info_ru: |
-    Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
-    "directsync".
-
-    Здесь "cached" может улучшить скорость чтения только недавно записанных
-    данных и только если параметр [inmemory_journal](#inmemory_journal)
-    отключён.
+    Читать буферизованные в журнале данные через системный кэш Linux. Не имеет
+    смысла без отключения параметра [inmemory_journal](#inmemory_journal),
+    который, опять же, по умолчанию включён.

    Если одно и то же устройство используется для метаданных и журнала,
-    режим ввода-вывода журнала по умолчанию устанавливается равным
-    [meta_io](#meta_io).
+    включение [cached_read_meta](#cached_read_meta) также включает данный
+    параметр, при условии, что он не отключён явным образом.
 - name: journal_sector_buffer_count
  type: int
  default: 32
@ -638,112 +587,3 @@
    считается некорректной. Однако, если "лучшую" версию с числом доступных
    копий большим, чем у всех других версий, найти невозможно, то объект тоже
    маркируется неконсистентным.
- name: recovery_tune_interval
-  type: sec
-  default: 1
-  online: true
-  info: |
-    Interval at which OSD re-considers client and recovery load and automatically
-    adjusts [recovery_sleep_us](#recovery_sleep_us). Recovery auto-tuning is
-    disabled if recovery_tune_interval is set to 0.
-
-    Auto-tuning targets utilization. Utilization is a measure of load and is
-    equal to the product of iops and average latency (so it may be greater
-    than 1). You set "low" and "high" client utilization thresholds and two
-    corresponding target recovery utilization levels. OSD calculates desired
-    recovery utilization from client utilization using linear interpolation
-    and auto-tunes recovery operation delay to make actual recovery utilization
-    match desired.
-
-    This allows to reduce recovery/rebalance impact on client operations. It is
-    of course impossible to remove it completely, but it should become adequate.
-    In some tests rebalance could earlier drop client write speed from 1.5 GB/s
-    to 50-100 MB/s, with default auto-tuning settings it now only reduces
-    to ~1 GB/s.
-  info_ru: |
-    Интервал, с которым OSD пересматривает клиентскую нагрузку и нагрузку
-    восстановления и автоматически подстраивает [recovery_sleep_us](#recovery_sleep_us).
-    Автотюнинг (автоподстройка) отключается, если recovery_tune_interval
-    устанавливается в значение 0.
-
-    Автотюнинг регулирует утилизацию. Утилизация является мерой нагрузки
-    и равна произведению числа операций в секунду и средней задержки
-    (то есть, она может быть выше 1). Вы задаёте два уровня клиентской
-    утилизации - "низкий" и "высокий" (low и high) и два соответствующих
-    целевых уровня утилизации операциями восстановления. OSD рассчитывает
-    желаемый уровень утилизации восстановления линейной интерполяцией от
-    клиентской утилизации и подстраивает задержку операций восстановления
-    так, чтобы фактическая утилизация восстановления совпадала с желаемой.
-
-    Это позволяет снизить влияние восстановления и ребаланса на клиентские
-    операции. Конечно, невозможно исключить такое влияние полностью, но оно
-    должно становиться адекватнее. В некоторых тестах перебалансировка могла
-    снижать клиентскую скорость записи с 1.5 ГБ/с до 50-100 МБ/с, а теперь, с
-    настройками автотюнинга по умолчанию, она снижается только до ~1 ГБ/с.
- name: recovery_tune_util_low
-  type: float
-  default: 0.1
-  online: true
-  info: |
-    Desired recovery/rebalance utilization when client load is high, i.e. when
-    it is at or above recovery_tune_client_util_high.
-  info_ru: |
-    Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
-    высокая, то есть, находится на уровне или выше recovery_tune_client_util_high.
- name: recovery_tune_util_high
-  type: float
-  default: 1
-  online: true
-  info: |
-    Desired recovery/rebalance utilization when client load is low, i.e. when
-    it is at or below recovery_tune_client_util_low.
-  info_ru: |
-    Желаемая утилизация восстановления в моменты, когда клиентская нагрузка
-    низкая, то есть, находится на уровне или ниже recovery_tune_client_util_low.
- name: recovery_tune_client_util_low
-  type: float
-  default: 0
-  online: true
-  info: Client utilization considered "low".
-  info_ru: Клиентская утилизация, которая считается "низкой".
- name: recovery_tune_client_util_high
-  type: float
-  default: 0.5
-  online: true
-  info: Client utilization considered "high".
-  info_ru: Клиентская утилизация, которая считается "высокой".
- name: recovery_tune_agg_interval
-  type: int
-  default: 10
-  online: true
-  info: |
-    The number of last auto-tuning iterations to use for calculating the
-    delay as average. Lower values result in quicker response to client
-    load change, higher values result in more stable delay. Default value of 10
-    is usually fine.
-  info_ru: |
-    Число последних итераций автоподстройки для расчёта задержки как среднего
-    значения. Меньшие значения параметра ускоряют отклик на изменение нагрузки,
-    большие значения делают задержку стабильнее. Значение по умолчанию 10
-    обычно нормальное и не требует изменений.
- name: recovery_tune_sleep_min_us
-  type: us
-  default: 10
-  online: true
-  info: |
-    Minimum possible value for auto-tuned recovery_sleep_us. Lower values
-    are changed to 0.
-  info_ru: |
-    Минимальное возможное значение авто-подстроенного recovery_sleep_us.
-    Меньшие значения заменяются на 0.
- name: recovery_tune_sleep_cutoff_us
-  type: us
-  default: 10000000
-  online: true
-  info: |
-    Maximum possible value for auto-tuned recovery_sleep_us. Higher values
-    are treated as outliers and ignored in aggregation.
-  info_ru: |
-    Максимальное возможное значение авто-подстроенного recovery_sleep_us.
-    Большие значения считаются случайными выбросами и игнорируются в
-    усреднении.
--- a/docs/installation/kubernetes.en.md
+++ b/docs/installation/kubernetes.en.md
@ -17,27 +17,4 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-After that you'll be able to create PersistentVolumes.
-
-**Important:** For best experience, use Linux kernel at least 5.15 with [VDUSE](../usage/qemu.en.md#vduse)
-kernel modules enabled (vdpa, vduse, virtio-vdpa). If your distribution doesn't
-have them pre-built - build them yourself ([instructions](../usage/qemu.en.md#vduse)),
-I promise it's worth it :-). When VDUSE is unavailable, CSI driver uses [NBD](../usage/nbd.en.md)
-to map Vitastor devices. NBD is slower and prone to timeout issues: if Vitastor
-cluster becomes unresponsive for more than [nbd_timeout](../config/client.en.md#nbd_timeout),
-the NBD device detaches and breaks pods using it.
-
-## Features
-
-Vitastor CSI supports:
- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
- Volume expansion
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
- [VDUSE](../usage/qemu.en.md#vduse) (preferred) and [NBD](../usage/nbd.en.md) device mapping methods
- Upgrades with VDUSE - new handler processes are restarted when CSI pods are restarted themselves
- VDUSE daemon auto-restart - handler processes are automatically restarted if they crash due to a bug in Vitastor client code
- Multiple clusters by using multiple configuration files in ConfigMap.
-
-Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
+After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
--- a/docs/installation/kubernetes.ru.md
+++ b/docs/installation/kubernetes.ru.md
@ -17,27 +17,4 @@
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```

-После этого вы сможете создавать PersistentVolume.
-
-**Важно:** Лучше всего использовать ядро Linux версии не менее 5.15 с включёнными модулями
-[VDUSE](../usage/qemu.ru.md#vduse) (vdpa, vduse, virtio-vdpa). Если в вашем дистрибутиве
-они не собраны из коробки - соберите их сами, обещаю, что это стоит того ([инструкция](../usage/qemu.ru.md#vduse)) :-).
-Когда VDUSE недоступно, CSI-плагин использует [NBD](../usage/nbd.ru.md) для подключения
-дисков, а NBD медленнее и имеет проблему таймаута - если кластер остаётся недоступным
-дольше, чем [nbd_timeout](../config/client.ru.md#nbd_timeout), NBD-устройство отключается
-и ломает поды, использующие его.
-
-## Возможности
-
-CSI-плагин Vitastor поддерживает:
- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
- Расширение размера томов
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
- Способы подключения устройств [VDUSE](../usage/qemu.ru.md#vduse) (предпочитаемый) и [NBD](../usage/nbd.ru.md)
- Обновление при использовании VDUSE - новые процессы-обработчики устройств успешно перезапускаются вместе с самими подами CSI
- Автоперезапуск демонов VDUSE - процесс-обработчик автоматически перезапустится, если он внезапно упадёт из-за бага в коде клиента Vitastor
- Несколько кластеров через задание нескольких файлов конфигурации в ConfigMap.
-
-Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
+После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
--- a/docs/installation/openstack.en.md
+++ b/docs/installation/openstack.en.md
@ -11,8 +11,7 @@ To enable Vitastor support in an OpenStack installation:
 - Install vitastor-client, patched QEMU and libvirt packages from Vitastor DEB or RPM repository
 - Use `patches/nova-21.diff` or `patches/nova-23.diff` to patch your Nova installation.
  Patch 21 fits Nova 21-22, patch 23 fits Nova 23-24.
- Install `patches/cinder-vitastor-21.py` or `patches/cinder-vitastor-22.py` as `..../cinder/volume/drivers/vitastor.py`
-  Patch 21 fits Cinder up to 21 (zed), Patch 22 fits Cinder after 22 (2023.1)
+- Install `patches/cinder-vitastor.py` as `..../cinder/volume/drivers/vitastor.py`
 - Define a volume type in cinder.conf (see below)
 - Block network access from VMs to Vitastor network (to OSDs and etcd),
  because Vitastor doesn't support authentication
--- a/docs/installation/openstack.ru.md
+++ b/docs/installation/openstack.ru.md
@ -11,8 +11,7 @@
 - Установите пакеты vitastor-client, libvirt и QEMU из DEB или RPM репозитория Vitastor
 - Примените патч `patches/nova-21.diff` или `patches/nova-23.diff` к вашей инсталляции Nova.
  nova-21.diff подходит для Nova 21-22, nova-23.diff подходит для Nova 23-24.
- Скопируйте `patches/cinder-vitastor-21.py` или `patches/cinder-vitastor-22.py` в инсталляцию Cinder как `cinder/volume/drivers/vitastor.py`.
-  `cinder-vitastor-21.py` подходит для Cinder 21 (zed) и младше, `cinder-vitastor-22.py` подходит для Cinder 22 (2023.1) и старше.
+- Скопируйте `patches/cinder-vitastor.py` в инсталляцию Cinder как `cinder/volume/drivers/vitastor.py`
 - Создайте тип томов в cinder.conf (см. ниже)
 - Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
 - Перезапустите Cinder и Nova
--- a/docs/installation/packages.en.md
+++ b/docs/installation/packages.en.md
@ -14,11 +14,9 @@
  - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
  - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
-  - Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
-    stable version from 0.9.x branch instead of 1.x
 - For Debian 10 (Buster) also enable backports repository:
  `deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
+- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`

 ## CentOS

--- a/docs/installation/packages.ru.md
+++ b/docs/installation/packages.ru.md
@ -14,11 +14,9 @@
  - Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
  - Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
  - Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
-  - Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
-    установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
 - Для Debian 10 (Buster) также включите репозиторий backports:
  `deb http://deb.debian.org/debian buster-backports main`
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
+- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`

 ## CentOS

--- a/docs/installation/proxmox.en.md
+++ b/docs/installation/proxmox.en.md
@ -6,10 +6,10 @@

 # Proxmox VE

-To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):
+To enable Vitastor support in Proxmox Virtual Environment (6.4-8.0 are supported):

 - Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
-  bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
+  bookworm for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
 - Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
 - Define storage in `/etc/pve/storage.cfg` (see below)
 - Block network access from VMs to Vitastor network (to OSDs and etcd),
@ -25,7 +25,7 @@ vitastor: vitastor
    vitastor_pool testpool
    # path to the configuration file
    vitastor_config_path /etc/vitastor/vitastor.conf
-    # etcd address(es), OPTIONAL, required only if missing in the configuration file
+    # etcd address(es), required only if missing in the configuration file
    vitastor_etcd_address 192.168.7.2:2379/v3
    # prefix for keys in etcd
    vitastor_etcd_prefix /vitastor
--- a/docs/installation/proxmox.ru.md
+++ b/docs/installation/proxmox.ru.md
@ -6,10 +6,10 @@

 # Proxmox VE

-Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):
+Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.0):

 - Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
-  bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
+  bookworm для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
 - Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
 - Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
 - Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
@ -24,7 +24,7 @@ vitastor: vitastor
    vitastor_pool testpool
    # Путь к файлу конфигурации
    vitastor_config_path /etc/vitastor/vitastor.conf
-    # Адрес(а) etcd, ОПЦИОНАЛЬНЫ, нужны, только если не указаны в vitastor.conf
+    # Адрес(а) etcd, нужны, только если не указаны в vitastor.conf
    vitastor_etcd_address 192.168.7.2:2379/v3
    # Префикс ключей метаданных в etcd
    vitastor_etcd_prefix /vitastor
--- a/docs/installation/source.en.md
+++ b/docs/installation/source.en.md
@ -21,7 +21,7 @@

 ## Basic instructions

-Download source, for example using git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
+Download source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`

 Get `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build fio engine,
 you can disable it by passing `-DWITH_FIO=no` to cmake.
@ -41,7 +41,7 @@ It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
 QEMU build process. To do that:
 - Install vitastor client library headers (from source or from vitastor-client-dev package)
 - Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source
- Copy `src/client/qemu_driver.c` to QEMU source directory as `block/vitastor.c`
+- Copy `src/qemu_driver.c` to QEMU source directory as `block/block-vitastor.c`
 - Build QEMU as usual

 But it is also possible to build it out-of-tree. To do that:
--- a/docs/installation/source.ru.md
+++ b/docs/installation/source.ru.md
@ -21,7 +21,7 @@

 ## Базовая инструкция

-Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
+Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`

 Скачайте исходные коды пакета `fio`, распакуйте их и создайте символическую ссылку на них
 в директории исходников Vitastor: `<vitastor>/fio`. Либо, если вы не хотите собирать плагин fio,
@ -41,7 +41,7 @@ cmake .. && make -j8 install
 Драйвер QEMU (qemu_driver.c) рекомендуется собирать вместе с самим QEMU. Для этого:
 - Установите заголовки клиентской библиотеки Vitastor (из исходников или из пакета vitastor-client-dev)
 - Возьмите соответствующий патч из `patches/qemu-*-vitastor.patch` и примените его к исходникам QEMU
- Скопируйте [src/client/qemu_driver.c](../../src/client/qemu_driver.c) в директорию исходников QEMU как `block/vitastor.c`
+- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/block-vitastor.c`
 - Соберите QEMU как обычно

 Однако в целях отладки драйвер также можно собирать отдельно от QEMU. Для этого:
@ -60,7 +60,7 @@ cmake .. && make -j8 install
      * Для QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
   - `config-host.h` и `qapi` нужны, т.к. в них содержатся автогенерируемые заголовки
 - Сконфигурируйте cmake Vitastor с `WITH_QEMU=yes` (`cmake .. -DWITH_QEMU=yes`) и, если вы
-  используете RHEL-подобный дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
+  используете RHEL-подобный дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
 - После этого в процессе сборки Vitastor также будет собираться подходящий для вашей
  версии QEMU `block-vitastor.so`.
 - Таким образом можно использовать драйвер даже с немодифицированным QEMU, но в этом случае
--- a/docs/intro/architecture.ru.md
+++ b/docs/intro/architecture.ru.md
@ -54,8 +54,7 @@
  виртуальные диски, их снимки и клоны.
 - **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
  с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
-  библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
-  позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
+  библиотеки, без необходимости отображения дисков в виде блочных устройств.
 - **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
  с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
  (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
--- a/docs/intro/features.en.md
+++ b/docs/intro/features.en.md
@ -13,7 +13,7 @@
 ## Server-side features

 - Basic part: highly-available block storage with symmetric clustering and no SPOF
- [Performance](../performance/bench2.en.md) ;-D
+- [Performance](../performance/comparison1.en.md) ;-D
 - [Multiple redundancy schemes](../config/pool.en.md#scheme): Replication, XOR n+1, Reed-Solomon erasure codes
  based on jerasure and ISA-L libraries with any number of data and parity drives in a group
 - Configuration via simple JSON data structures in etcd (parameters, pools and images)
@ -29,11 +29,8 @@
 - Snapshots and copy-on-write image clones
 - [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
 - [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
+- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)
 - [Checksums](../config/layout-osd.en.md#data_csum_type)
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
- [Intelligent recovery auto-tuning](../config/osd.en.md#recovery_tune_interval)
- [Clustered file system](../usage/nfs.en.md#vitastorfs)

 ## Plugins and tools

@ -47,20 +44,19 @@
 - [CSI plugin for Kubernetes](../installation/kubernetes.en.md)
 - [OpenStack support: Cinder driver, Nova and libvirt patches](../installation/openstack.en.md)
 - [Proxmox storage plugin and packages](../installation/proxmox.en.md)
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md#pseudo-fs)
+- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md)

 ## Roadmap

 The following features are planned for the future:

- Control plane optimisation
 - Other administrative tools
 - Web GUI
 - OpenNebula plugin
- iSCSI and NVMeoF gateways
+- iSCSI proxy
 - Multi-threaded client
 - Faster failover
- S3
 - Tiered storage (SSD caching)
 - NVDIMM support
 - Compression (possibly)
+- Read caching using system page cache (possibly)
--- a/docs/intro/features.ru.md
+++ b/docs/intro/features.ru.md
@ -13,7 +13,7 @@
 ## Серверные функции

 - Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
- [Производительность](../performance/bench2.ru.md) ;-D
+- [Производительность](../performance/comparison1.ru.md) ;-D
 - [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
  Рида-Соломона на основе библиотек jerasure и ISA-L с любым числом дисков данных и чётности в группе
 - Конфигурация через простые человекочитаемые JSON-структуры в etcd
@ -31,11 +31,8 @@
 - Снапшоты и copy-on-write клоны
 - [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
 - [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
+- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)
 - [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
- [Интеллектуальная автоподстройка скорости восстановления](../config/osd.ru.md#recovery_tune_interval)
- [Кластерная файловая система](../usage/nfs.ru.md#vitastorfs)

 ## Драйверы и инструменты

@ -49,18 +46,16 @@
 - [CSI-плагин для Kubernetes](../installation/kubernetes.ru.md)
 - [Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt](../installation/openstack.ru.md)
 - [Плагин для Proxmox](../installation/proxmox.ru.md)
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md#псевдо-фс)
+- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md)

 ## Планы развития

- Оптимизация слоя управления
 - Другие инструменты администрирования
 - Web-интерфейс
 - Плагин для OpenNebula
- iSCSI и NVMeoF прокси
+- iSCSI-прокси
 - Многопоточный клиент
 - Более быстрое переключение при отказах
- S3
 - Поддержка SSD-кэширования (tiered storage)
 - Поддержка NVDIMM
 - Возможно, сжатие
--- a/docs/intro/quickstart.en.md
+++ b/docs/intro/quickstart.en.md
@ -14,7 +14,6 @@
 - [Check cluster status](#check-cluster-status)
 - [Create an image](#create-an-image)
 - [Install plugins](#install-plugins)
- [Create VitastorFS](#create-vitastorfs)

 ## Preparation

@ -22,7 +21,7 @@
  with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
  [here](../config/layout-cluster.en.md#immediate_commit).
 - If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
-  Toshiba MG, Seagate EXOS or something similar. If your drives don't have such cache then
+  Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
  you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
 - Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
 - Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
@ -33,7 +32,7 @@
 - SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
 - NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
  Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
+- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS

 ## Configure monitors

@ -76,16 +75,18 @@ On the monitor hosts:

 ## Create a pool

-Create a pool using vitastor-cli:
+Create pool configuration in etcd:

 ```
-vitastor-cli create-pool testpool --pg_size 2 --pg_count 256
+etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
+  "scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
 ```

 For EC pools the configuration should look like the following:

 ```
-vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
+etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
+  "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
 ```

 After you do this, one of the monitors will configure PGs and OSDs will start them.
@ -115,9 +116,3 @@ After that, you can [run benchmarks](../usage/fio.en.md) or [start QEMU manually
 - [Proxmox](../installation/proxmox.en.md)
 - [OpenStack](../installation/openstack.en.md)
 - [Kubernetes CSI](../installation/kubernetes.en.md)
-
-## Create VitastorFS
-
-If you want to use clustered file system in addition to VM or container images:
-
- [Follow the instructions here](../usage/nfs.en.md#vitastorfs)
--- a/docs/intro/quickstart.ru.md
+++ b/docs/intro/quickstart.ru.md
@ -14,7 +14,6 @@
 - [Проверьте состояние кластера](#проверьте-состояние-кластера)
 - [Создайте образ](#создайте-образ)
 - [Установите плагины](#установите-плагины)
- [Создайте VitastorFS](#создайте-vitastorfs)

 ## Подготовка

@ -78,16 +77,18 @@

 ## Создайте пул

-Создайте пул с помощью vitastor-cli:
+Создайте конфигурацию пула с помощью etcdctl:

 ```
-vitastor-cli create-pool testpool --pg_size 2 --pg_count 256
+etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
+  "scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
 ```

 Для пулов с кодами коррекции ошибок конфигурация должна выглядеть примерно так:

 ```
-vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
+etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
+  "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
 ```

 После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
@ -117,10 +118,3 @@ vitastor-cli create -s 10G testimg
 - [Proxmox](../installation/proxmox.ru.md)
 - [OpenStack](../installation/openstack.ru.md)
 - [Kubernetes CSI](../installation/kubernetes.ru.md)
-
-## Создайте VitastorFS
-
-Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
-а также кластерную файловую систему, то:
-
- [Следуйте инструкциям](../usage/nfs.ru.md#vitastorfs)
--- a/docs/performance/bench2.en.md
+++ b/docs/performance/bench2.en.md
@ -1,154 +0,0 @@
-[Documentation](../../README.md#documentation) → Performance → Newer benchmark of Vitastor 1.3.1
-
-----
-
-[Читать на русском](bench2.ru.md)
-
-# Newer benchmark of Vitastor 1.3.1
-
- [Test environment](#test-environment)
- [Notes](#notes)
- [Raw drive performance](#raw-drive-performance)
- [2 replicas](#2-replicas)
- [3 replicas](#3-replicas)
- [EC 2+1](#ec-2-1)
-
-## Test environment
-
-Hardware configuration: 3 nodes, each with:
- 8x NVMe Samsung PM9A3 1.92 TB
- 2x Xeon Gold 6342 (24 cores @ 2.8 GHz)
- 256 GB RAM
- Dual-port 25 GbE Mellanox ConnectX-4 LX network card with RoCEv2
- Connected to 2 Mellanox SN2010 switches with MLAG
-
-## Notes
-
-Vitastor version was 1.3.1.
-
-Tests were run from the storage nodes - 4 fio clients per each of 3 nodes.
-
-The same large 3 TB image was tested from all hosts because Vitastor has no
-performance penalties related to running multiple clients over a single inode.
-
-CPU power saving was disabled. 4 OSDs were created per each NVMe.
-Checksums were not enabled. Tests with checksums will be conducted later,
-along with the newer version of Vitastor, and results will be updated.
-
-CPU configuration was not optimal because of NUMA. It's better to avoid 2-socket
-platforms. It was especially noticeable in RDMA tests - in the form of ksoftirqd
-processes (usually 1 per server) eating 100 % of one CPU core and actual bandwidth
-of one network port reduced to 3-5 Gbit/s instead of 25 Gbit/s - probably because
-of RFS (Receive Flow Steering) misses. Many network configurations were tried during
-tests, but nothing helped to solve this problem, so final tests were conducted with
-the default settings.
-
-## Raw drive performance
-
- Linear write ~1000-2000 MB/s, depending on current state of the drive's garbage collector
- Linear read ~3300 MB/s
- T1Q1 random write ~60000 iops (latency ~0.015ms)
- T1Q1 random read ~14700 iops (latency ~0.066ms)
- T1Q16 random write ~180000 iops
- T1Q16 random read ~120000 iops
- T1Q32 random write ~180000 iops
- T1Q32 random read ~195000 iops
- T1Q128 random write ~180000 iops
- T1Q128 random read ~195000 iops
- T4Q128 random write ~525000 iops
- T4Q128 random read ~750000 iops
-
-These numbers make it obvious that results could be much better if a faster network
-was available, because NVMe drives obviously weren't a bottleneck. For example,
-theoretical maximum linear read performance for 24 drives could be 79.2 GByte/s,
-which is 633 Gbit/s. Real Vitastor read speed (both linear and random) was around
-16 Gbyte/s, which is 130 Gbit/s. It's important to note that it was still much
-larger than the network bandwidth of one server (50 Gbit/s). This is also correct
-because tests were conducted from all 3 nodes.
-
-## 2 replicas
-
-|                              | TCP          | RDMA         |
-|------------------------------|--------------|--------------|
-| Linear read (4M T6 Q16)      | 13.13 GB/s   | 16.25 GB/s   |
-| Linear write (4M T6 Q16)     | 8.16 GB/s    | 7.88 GB/s    |
-| Read 4k T1 Q1                | 8745 iops    | 10252 iops   |
-| Write 4k T1 Q1               | 8097 iops    | 11488 iops   |
-| Read 4k T12 Q128             | 1305936 iops | 4265861 iops |
-| Write 4k T12 Q128            | 660490 iops  | 1384033 iops |
-
-CPU consumption OSD per 1 disk:
-
-|                              | TCP     | RDMA    |
-|------------------------------|---------|---------|
-| Linear read (4M T6 Q16)      | 29.7 %  | 29.8 %  |
-| Linear write (4M T6 Q16)     | 84.4 %  | 33.2 %  |
-| Read 4k T12 Q128             | 98.4 %  | 119.1 % |
-| Write 4k T12 Q128            | 173.4 % | 175.9 % |
-
-CPU consumption per 1 client (fio):
-
-|                              | TCP    | RDMA   |
-|------------------------------|--------|--------|
-| Linear read (4M T6 Q16)      | 100 %  | 85.2 % |
-| Linear write (4M T6 Q16)     | 55.8 % | 48.8 % |
-| Read 4k T12 Q128             | 99.9 % | 96 %   |
-| Write 4k T12 Q128            | 71.6 % | 48.5 % |
-
-## 3 replicas
-
-|                              | TCP          | RDMA         |
-|------------------------------|--------------|--------------|
-| Linear read (4M T6 Q16)      | 13.98 GB/s   | 16.54 GB/s   |
-| Linear write (4M T6 Q16)     | 5.38 GB/s    | 5.7 GB/s     |
-| Read 4k T1 Q1                | 8969 iops    | 9980 iops    |
-| Write 4k T1 Q1               | 8126 iops    | 11672 iops   |
-| Read 4k T12 Q128             | 1358818 iops | 4279088 iops |
-| Write 4k T12 Q128            | 433890 iops  | 993506 iops  |
-
-CPU consumption OSD per 1 disk:
-
-|                              | TCP    | RDMA    |
-|------------------------------|--------|---------|
-| Linear read (4M T6 Q16)      | 24.9 % | 25.4 %  |
-| Linear write (4M T6 Q16)     | 99.3 % | 38.4 %  |
-| Read 4k T12 Q128             | 95.3 % | 111.7 % |
-| Write 4k T12 Q128            | 173 %  | 194 %   |
-
-CPU consumption per 1 client (fio):
-
-|                              | TCP    | RDMA   |
-|------------------------------|--------|--------|
-| Linear read (4M T6 Q16)      | 99.9 % | 85.8 % |
-| Linear write (4M T6 Q16)     | 38.9 % | 38.1 % |
-| Read 4k T12 Q128             | 100 %  | 96.1 % |
-| Write 4k T12 Q128            | 51.6 % | 41.9 % |
-
-## EC 2+1
-
-|                              | TCP          | RDMA         |
-|------------------------------|--------------|--------------|
-| Linear read (4M T6 Q16)      | 10.07 GB/s   | 11.43 GB/s   |
-| Linear write (4M T6 Q16)     | 7.74 GB/s    | 8.32 GB/s    |
-| Read 4k T1 Q1                | 7408 iops    | 8891 iops    |
-| Write 4k T1 Q1               | 3525 iops    | 4903 iops    |
-| Read 4k T12 Q128             | 1216496 iops | 2552765 iops |
-| Write 4k T12 Q128            | 278110 iops  | 821261 iops  |
-
-CPU consumption OSD per 1 disk:
-
-|                              | TCP     | RDMA    |
-|------------------------------|---------|---------|
-| Linear read (4M T6 Q16)      | 68.6 %  | 33.6 %  |
-| Linear write (4M T6 Q16)     | 108.3 % | 50.2 %  |
-| Read 4k T12 Q128             | 138.1 % | 97.9 %  |
-| Write 4k T12 Q128            | 168.7 % | 188.5 % |
-
-CPU consumption per 1 client (fio):
-
-|                              | TCP    | RDMA   |
-|------------------------------|--------|--------|
-| Linear read (4M T6 Q16)      | 88.2 % | 52.4 % |
-| Linear write (4M T6 Q16)     | 51.8 % | 46.8 % |
-| Read 4k T12 Q128             | 99.7 % | 61.3 % |
-| Write 4k T12 Q128            | 35.1 % | 31.3 % |
--- a/docs/performance/bench2.ru.md
+++ b/docs/performance/bench2.ru.md
@ -1,157 +0,0 @@
-[Документация](../../README-ru.md#документация) → Производительность → Более новый тест Vitastor 1.3.1
-
-----
-
-[Read in English](bench2.en.md)
-
-# Более новый тест Vitastor 1.3.1
-
- [Описание стенда](#описание-стенда)
- [Примечания](#примечания)
- [Производительность голых дисков](#производительность-голых-дисков)
- [2 реплики](#2-реплики)
- [3 реплики](#3-реплики)
- [EC 2+1](#ec-2-1)
-
-## Описание стенда
-
-Железо: 3 сервера, в каждом:
- 8x NVMe Samsung PM9A3 1.92 TB
- 2x Xeon Gold 6342 (24 ядра @ 2.8 GHz)
- 256 GB RAM
- Двухпортовая 25 GbE сетевая карта Mellanox ConnectX-4 LX с поддержкой RoCEv2
- Подключение к 2 коммутаторам Mellanox SN2010 в MLAG
-
-## Примечания
-
-Версия Vitastor 1.3.1.
-
-Тесты проводились с самих серверов хранения - по 4 клиента fio с каждого из 3 серверов.
-
-Тестировался один большой образ размером 3 ТБ со всех хостов - создавать отдельные образы
-для тестов в Vitastor необязательно, т.к. в Vitastor нет замедления при записи в один
-узел несколькими клиентами.
-
-Экономия энергии CPU отключена. На каждый NVMe создавалось 4 OSD.
-Контрольные суммы не включались. Тесты с контрольными суммами будут проведены
-позднее. Тогда же будет протестирована более новая версия Vitastor, и результаты
-будут обновлены.
-
-Конфигурация CPU стенда неоптимальна из-за NUMA - двухпроцессорных серверов лучше
-избегать. Особенно это проявлялось во время тестов с RDMA - выражалось это в потреблении
-100% одного ядра CPU одним процессом ksoftirqd и работой одного из двух сетевых портов
-на скорости 3-5 ГБит/с вместо 25 ГБит/с - предположительно, из-за "непопаданий" RFS
-(Receive Flow Steering) на нужные ядра. Решить эту проблему во время проведения тестов
-не получилось. Было перепробовано множество различных настроек, но в итоге тесты проведены
-с настройками по умолчанию, т.к. улучшения добиться не удалось.
-
-## Производительность голых дисков
-
- Линейная запись ~1000-2000 МБ/с, в зависимости от состояния сборщика мусора диска
- Линейное чтение ~3300 МБ/с
- T1Q1 запись ~60000 iops (задержка ~0.015ms)
- T1Q1 чтение ~14700 iops (задержка ~0.066ms)
- T1Q16 запись ~180000 iops
- T1Q16 чтение ~120000 iops
- T1Q32 запись ~180000 iops
- T1Q32 чтение ~195000 iops
- T1Q128 запись ~180000 iops
- T1Q128 чтение ~195000 iops
- T4Q128 запись ~525000 iops
- T4Q128 чтение ~750000 iops
-
-Из данных цифр очевидно, что при наличии более быстрой сети результаты были бы
-значительно лучше, так как в диски тест, очевидно, не упирался. Например, теоретический предел по
-линейному чтению для 24 таких дисков был бы около 79.2 ГБайт/с, то есть,
-633 гигабита в секунду. Реальная скорость чтения (и случайного, и линейного)
-Vitastor составила примерно 16 ГБайт/с, то есть 130 гигабит в секунду. При этом
-следует заметить, что этот результат всё равно значительно лучше пропускной способности
-сети отдельно взятого узла - что тоже вполне логично, так как тест выполнялся со
-всех трёх узлов.
-
-## 2 реплики
-
-|                              | TCP          | RDMA         |
-|------------------------------|--------------|--------------|
-| Линейное чтение (4M T6 Q16)  | 13.13 ГБ/с   | 16.25 ГБ/с   |
-| Линейная запись (4M T6 Q16)  | 8.16 ГБ/с    | 7.88 ГБ/с    |
-| Чтение 4k T1 Q1              | 8745 iops    | 10252 iops   |
-| Запись 4k T1 Q1              | 8097 iops    | 11488 iops   |
-| Чтение 4k T12 Q128           | 1305936 iops | 4265861 iops |
-| Запись 4k T12 Q128           | 660490 iops  | 1384033 iops |
-
-Потребление CPU OSD на 1 диск:
-
-|                              | TCP     | RDMA    |
-|------------------------------|---------|---------|
-| Линейное чтение (4M T6 Q16)  | 29.7 %  | 29.8 %  |
-| Линейная запись (4M T6 Q16)  | 84.4 %  | 33.2 %  |
-| Чтение 4k T12 Q128           | 98.4 %  | 119.1 % |
-| Запись 4k T12 Q128           | 173.4 % | 175.9 % |
-
-Потребление CPU на 1 клиента (fio):
-
-|                              | TCP    | RDMA   |
-|------------------------------|--------|--------|
-| Линейное чтение (4M T6 Q16)  | 100 %  | 85.2 % |
-| Линейная запись (4M T6 Q16)  | 55.8 % | 48.8 % |
-| Чтение 4k T12 Q128           | 99.9 % | 96 %   |
-| Запись 4k T12 Q128           | 71.6 % | 48.5 % |
-
-## 3 реплики
-
-|                              | TCP          | RDMA         |
-|------------------------------|--------------|--------------|
-| Линейное чтение (4M T6 Q16)  | 13.98 ГБ/с   | 16.54 ГБ/с   |
-| Линейная запись (4M T6 Q16)  | 5.38 ГБ/с    | 5.7 ГБ/с     |
-| Чтение 4k T1 Q1              | 8969 iops    | 9980 iops    |
-| Запись 4k T1 Q1              | 8126 iops    | 11672 iops   |
-| Чтение 4k T12 Q128           | 1358818 iops | 4279088 iops |
-| Запись 4k T12 Q128           | 433890 iops  | 993506 iops  |
-
-Потребление CPU OSD на 1 диск:
-
-|                              | TCP    | RDMA    |
-|------------------------------|--------|---------|
-| Линейное чтение (4M T6 Q16)  | 24.9 % | 25.4 %  |
-| Линейная запись (4M T6 Q16)  | 99.3 % | 38.4 %  |
-| Чтение 4k T12 Q128           | 95.3 % | 111.7 % |
-| Запись 4k T12 Q128           | 173 %  | 194 %   |
-
-Потребление CPU на 1 клиента (fio):
-
-|                              | TCP    | RDMA   |
-|------------------------------|--------|--------|
-| Линейное чтение (4M T6 Q16)  | 99.9 % | 85.8 % |
-| Линейная запись (4M T6 Q16)  | 38.9 % | 38.1 % |
-| Чтение 4k T12 Q128           | 100 %  | 96.1 % |
-| Запись 4k T12 Q128           | 51.6 % | 41.9 % |
-
-## EC 2+1
-
-|                              | TCP          | RDMA         |
-|------------------------------|--------------|--------------|
-| Линейное чтение (4M T6 Q16)  | 10.07 ГБ/с   | 11.43 ГБ/с   |
-| Линейная запись (4M T6 Q16)  | 7.74 ГБ/с    | 8.32 ГБ/с    |
-| Чтение 4k T1 Q1              | 7408 iops    | 8891 iops    |
-| Запись 4k T1 Q1              | 3525 iops    | 4903 iops    |
-| Чтение 4k T12 Q128           | 1216496 iops | 2552765 iops |
-| Запись 4k T12 Q128           | 278110 iops  | 821261 iops  |
-
-Потребление CPU OSD на 1 диск:
-
-|                              | TCP     | RDMA    |
-|------------------------------|---------|---------|
-| Линейное чтение (4M T6 Q16)  | 68.6 %  | 33.6 %  |
-| Линейная запись (4M T6 Q16)  | 108.3 % | 50.2 %  |
-| Чтение 4k T12 Q128           | 138.1 % | 97.9 %  |
-| Запись 4k T12 Q128           | 168.7 % | 188.5 % |
-
-Потребление CPU на 1 клиента (fio):
-
-|                              | TCP    | RDMA   |
-|------------------------------|--------|--------|
-| Линейное чтение (4M T6 Q16)  | 88.2 % | 52.4 % |
-| Линейная запись (4M T6 Q16)  | 51.8 % | 46.8 % |
-| Чтение 4k T12 Q128           | 99.7 % | 61.3 % |
-| Запись 4k T12 Q128           | 35.1 % | 31.3 % |
--- a/docs/performance/theoretical.en.md
+++ b/docs/performance/theoretical.en.md
@ -11,26 +11,19 @@ Replicated setups:
 - Single-threaded write+fsync latency:
  - With immediate commit: 2 network roundtrips + 1 disk write.
  - With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush.
- Linear read: `min(total network bandwidth, sum(disk read MB/s))`.
- Linear write: `min(total network bandwidth, sum(disk write MB/s / number of replicas))`.
- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`.
- Saturated parallel write iops: `min(total network bandwidth / number of replicas, sum(disk write iops / number of replicas / (write amplification = 4)))`.
+- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
+- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)).

-EC/XOR setups (EC N+K):
+EC/XOR setups:
 - Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read.
 - Single-threaded write+fsync latency:
  - With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes.
  - With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs.
-  - 0.5 is actually `(N-1)/N` which means that an additional roundtrip doesn't happen when
+  - 0.5 is actually (k-1)/k which means that an additional roundtrip doesn't happen when
    the read sub-operation can be served locally.
- Linear read: `min(total network bandwidth, sum(disk read MB/s))`.
- Linear write: `min(total network bandwidth, sum(disk write MB/s * N/(N+K)))`.
- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`.
- Saturated parallel write iops: roughly `total iops / (N+K) / WA`. More exactly,
-  `min(total network bandwidth * N/(N+K), sum(disk randrw iops / (N*4 + K*5 + 1)))` with
-  random read/write mix corresponding to `(N-1)/(N*4 + K*5 + 1)*100 % reads`.
-  - For example, with EC 2+1 it is: `(7% randrw iops) / 14`.
-  - With EC 6+3 it is: `(12.5% randrw iops) / 40`.
+- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
+- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)).
+  In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula.

 Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
 1. Journal block write
--- a/docs/performance/theoretical.ru.md
+++ b/docs/performance/theoretical.ru.md
@ -11,27 +11,20 @@
 - Запись+fsync в 1 поток:
  - С мгновенным сбросом: 2 RTT + 1 запись.
  - С отложенным ("ленивым") сбросом: 4 RTT + 1 запись + 1 fsync.
- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети (сумма пропускной способности сети всех нод), если в сеть упрётся раньше.
- Линейная запись: сумма МБ/с записи всех дисков / число реплик, либо производительность сети / число реплик, если в сеть упрётся раньше.
- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков, либо производительность сети, если в сеть упрётся раньше.
- Параллельная случайная мелкая запись: сумма IOPS записи всех дисков / число реплик / WA, либо производительность сети / число реплик, если в сеть упрётся раньше.
+- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
+- Параллельная запись: сумма IOPS всех дисков / число реплик / WA либо производительность сети, если в сеть упрётся раньше.

-При использовании кодов коррекции ошибок (EC N+K):
+При использовании кодов коррекции ошибок (EC):
 - Задержка чтения в 1 поток (T1Q1): 1.5 RTT + 1 чтение.
 - Запись+fsync в 1 поток:
  - С мгновенным сбросом: 3.5 RTT + 1 чтение + 2 записи.
  - С отложенным ("ленивым") сбросом: 5.5 RTT + 1 чтение + 2 записи + 2 fsync.
- Под 0.5 на самом деле подразумевается (N-1)/N, где N - число дисков данных,
+- Под 0.5 на самом деле подразумевается (k-1)/k, где k - число дисков данных,
  что означает, что дополнительное обращение по сети не нужно, когда операция
  чтения обслуживается локально.
- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети, если в сеть упрётся раньше.
- Линейная запись: сумма МБ/с записи всех дисков * N/(N+K), либо производительность сети * N / (N+K), если в сеть упрётся раньше.
- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков либо производительность сети, если в сеть упрётся раньше.
- Параллельная случайная мелкая запись: грубо `(сумма IOPS / (N+K) / WA)`. Если точнее, то:
-  сумма смешанного IOPS всех дисков при `(N-1)/(N*4 + K*5 + 1)*100 %` чтения, делённая на `(N*4 + K*5 + 1)`.
-  Либо, производительность сети * N/(N+K), если в сеть упрётся раньше.
-  - Например, при EC 2+1 это: `(сумма IOPS при 7% чтения) / 14`.
-  - При EC 6+3 это: `(сумма IOPS при 12.5% чтения) / 40`.
+- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
+- Параллельная запись: сумма IOPS всех дисков / общее число дисков данных и чётности / WA либо производительность сети, если в сеть упрётся раньше.
+  Примечание: IOPS дисков в данном случае надо брать в смешанном режиме чтения/записи в пропорции, аналогичной формулам выше.

 WA (мультипликатор записи) для 4 КБ блоков в Vitastor обычно составляет 3-5:
 1. Запись метаданных в журнал
--- a/docs/usage/admin.en.md
+++ b/docs/usage/admin.en.md
@ -1,215 +0,0 @@
-[Documentation](../../README.md#documentation) → Usage → Administration
-
-----
-
-[Читать на русском](admin.ru.md)
-
-# Administration
-
- [Pool states](#pool-states)
- [PG states](#pg-states)
-  - [Base PG states](#base-pg-states)
-  - [Additional PG states](#additional-pg-states)
- [Removing a healthy disk](#removing-a-healthy-disk)
- [Removing a failed disk](#removing-a-failed-disk)
- [Adding a disk](#adding-a-disk)
- [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration)
- [Upgrading Vitastor](#upgrading-vitastor)
- [OSD memory usage](#osd-memory-usage)
-
-## Pool states
-
-Pool is active — that is, fully available for client input/output — when all its PGs are
-'active' (maybe with some additional state flags).
-
-If at least 1 PG is inactive, pool is also inactive and all clients suspend their I/O and
-wait until you fix the cluster. :-)
-
-## PG states
-
-PG states may be seen in [vitastor-cli status](cli.en.md#status) output.
-
-PG state consists of exactly 1 base state and an arbitrary number of additional states.
-
-### Base PG states
-
-PG state always includes exactly 1 of the following base states:
- **active** — PG is active and handles user I/O.
- **incomplete** — Not enough OSDs are available to activate this PG. That is, more disks
-  are lost than is allowed by the pool's redundancy scheme. For example, if the pool has
-  pg_size=3 and pg_minsize=1, part of the data may be written only to 1 OSD. If that exact
-  OSD is lost, PG will become **incomplete**.
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
-  this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
-  or the primary OSD refuses to start this PG (for example, because of wrong block_size),
-  or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/config/pgs` in etcd.
- **starting** — primary OSD has acquired PG lock in etcd, PG is starting.
- **peering** — primary OSD requests PG object listings from secondary OSDs and calculates
-  the PG state.
- **repeering** — PG is waiting for current I/O operations to complete and will
-  then transition to **peering**.
- **stopping** — PG is waiting for current I/O operations to complete and will
-  then transition to **offline** or be activated by another OSD.
-
-All states except **active** mean that PG is inactive and client I/O is suspended.
-
-**peering** state is normally visible only for a short period of time during OSD restarts
-and during switching primary OSD of PGs.
-
-**starting**, **repeering** and **stopping** states are normally almost never visible.
-If you see them for any noticeable time, chances are that some operations on some OSDs are hung.
-Search for "slow op" in OSD logs to find them — operations hung for more than
-[slow_log_interval](../config/osd.en.md#slow_log_interval) are logged as "slow ops".
-
-State transition diagram:
-
-![PG state transitions](pg_states.svg "PG state transitions")
-
-### Additional PG states
-
-If a PG is active it can also have any number of the following additional states:
-
- **degraded** — PG is running on reduced number of drives (OSDs), redundancy of all
-  objects in this PG is reduced.
- **has_incomplete** — some objects in this PG are incomplete (unrecoverable), that is,
-  they have too many lost EC parts (more than pool's [parity_chunks](../config/pool.en.md#parity_chunks)).
- **has_degraded** — some objects in this PG have reduced redundancy
-  compared to the rest of the PG (so PG can be degraded+has_degraded at the same time).
-  These objects should be healed automatically by recovery process, unless
-  it's disabled by [no_recovery](../config/osd.en.md#no_recovery).
- **has_misplaced** — some objects in this PG are stored on an OSD set different from
-  the target set of the PG. These objects should be moved automatically, unless
-  rebalance is disabled by [no_rebalance](../config/osd.en.md#no_rebalance). Objects
-  that are degraded and misplaced at the same time are treated as just degraded.
- **has_unclean** — one more state normally noticeable only for very short time during
-  PG activation. It's used only with EC pools and means that some objects of this PG
-  have started but not finished modifications. All such objects are either quickly
-  committed or rolled back by the primary OSD when starting the PG, that is why the
-  state shouldn't be noticeable. If you notice it, it probably means that commit or
-  rollback operations are hung.
- **has_invalid** — PG contains objects with incorrect part ID. Never occurs normally.
-  It can only occur if you delete a non-empty EC pool and then recreate it as a replica
-  pool or with smaller data part count.
- **has_corrupted** — PG has corrupted objects, discovered by checking checksums during
-  read or during scrub. When possible, such objects should be recovered automatically.
-  If objects remain corrupted, use [vitastor-cli describe](cli.en.md#describe) to find
-  out details and/or look into the log of the primary OSD of the PG.
- **has_inconsistent** — PG has objects with non-matching parts or copies on different OSDs,
-  and it's impossible to determine which copy is correct automatically. It may happen
-  if you use a pool with 2 replicas and you don't enable checksums, and if data on one
-  of replicas becomes corrupted. You should also use vitastor-cli [describe](cli.en.md#describe)
-  and [fix](cli.en.md#fix) commands to remove the incorrect version in this case.
- **left_on_dead** — part of the data of this PG is left on unavailable OSD that isn't
-  fully removed from the cluster. You should either start the corresponding OSD back and
-  let it remove the unneeded data or remove it from cluster using vitastor-cli
-  [rm-osd](cli.en.md#rm-osd) if you know that it's gone forever (for example, if the disk died).
- **scrubbing** — data [scrub](../config/osd.en.md#auto_scrub) is running for this PG.
-
-## Removing a healthy disk
-
-Before removing a healthy disk from the cluster, set its OSD weight(s) to 0 to
-move data away. To do that, add `"reweight":0` to etcd key `/vitastor/config/osd/<OSD_NUMBER>`.
-For example:
-
-```
-etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
-```
-
-Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
-
-## Removing a failed disk
-
-If a disk is already dead, its OSD(s) are likely already stopped.
-
-In this case just remove OSD(s) from the cluster by running `vitastor-cli rm-osd OSD_NUMBER`.
-
-## Adding a disk
-
-If you're adding a server, first install Vitastor packages and copy the
-`/etc/vitastor/vitastor.conf` configuration file to it.
-
-After that you can just run `vitastor-disk prepare /dev/nvmeXXX`, of course with
-the same parameters which you used for other OSDs in your cluster before.
-
-## Restoring from lost pool configuration
-
-If you remove or corrupt `/vitastor/config/pools` key in etcd all pools will
-be deleted. Don't worry, the data won't be lost, but you'll need to perform
-a specific recovery procedure.
-
-First you need to restore previous configuration of the pool with the same ID
-and EC/replica parameters and wait until pool PGs appear in `vitastor-cli status`.
-
-Then add all OSDs into the history records of all PGs. You can do it by running
-the following script (just don't forget to use your own PG_COUNT and POOL_ID):
-
-```
-PG_COUNT=32
-POOL_ID=1
-ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
-    perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
-for i in $(seq 1 $PG_COUNT); do
-    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
-done
-```
-
-After that all PGs should peer and find all previous data.
-
-## Upgrading Vitastor
-
-Every new Vitastor version is usually compatible with previous ones, both forward
-and backward regarding the network protocol and etcd data structures.
-
-So, by default, if this page doesn't contain explicit different instructions, you
-can upgrade your Vitastor cluster by simply upgrading packages and restarting all
-OSDs and monitors in any order.
-
-Upgrading is performed without stopping clients (VMs/containers), you just need to
-upgrade and restart servers one by one. However, ideally you should restart VMs too
-to make them use the new version of the client library.
-
-Exceptions (specific upgrade instructions):
- Upgrading <= 1.1.x to 1.2.0 or later, if you use EC n+k with k>=2, is recommended
-  to be performed with full downtime: first you should stop all clients, then all OSDs,
-  then upgrade and start everything back — because versions before 1.2.0 have several
-  bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
- Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
-  upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
-  without this intermediate step, client I/O will hang until the end of upgrade process.
- Upgrading from <= 0.5.x to >= 0.6.x is not supported.
-
-Rollback:
- Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 can't be rolled
-  back to 0.9.x or previous versions.
- Versions before 0.8.0 don't have vitastor-disk, so OSDs, initialized by it, won't
-  start with 0.7.x or 0.6.x. :-)
-
-## OSD memory usage
-
-OSD uses RAM mainly for:
-
- Metadata index: `data_size`/[`block_size`](../config/layout-cluster.en.md#block_size) * `approximately 1.1` * `32` bytes.
-  Consumed always.
- Copy of the on-disk metadata area: `data_size`/[`block_size`](../config/layout-cluster.en.md#block_size) * `28` bytes.
-  Consumed if [inmemory_metadata](../config/osd.en.md#inmemory_metadata) isn't disabled.
- Bitmaps: `data_size`/[`bitmap_granularity`](../config/layout-cluster.en.md#bitmap_granularity)/`8` * `2` bytes.
-  Consumed always.
- Journal index: between 0 and, approximately, journal size. Consumed always.
- Copy of the on-disk journal area: exactly journal size. Consumed if
-  [inmemory_journal](../config/osd.en.md#inmemory_journal) isn't disabled.
- Checksums: `data_size`/[`csum_block_size`](../config/osd.en.md#csum_block_size) * 4 bytes.
-  Consumed if checksums are enabled and [inmemory_metadata](../config/osd.en.md#inmemory_metadata) isn't disabled.
-
-bitmap_granularity is almost always 4 KB.
-
-So with default SSD settings (block_size=128k, journal_size=32M, csum_block_size=4k) memory usage is:
-
- Metadata and bitmaps: ~600 MB per 1 TB of data.
- Journal: up to 64 MB per 1 OSD.
- Checksums: 1 GB per 1 TB of data.
-
-With default HDD settings (block_size=1M, journal_size=128M, csum_block_size=32k):
-
- Metadata and bitmaps: ~128 MB per 1 TB of data.
- Journal: up to 256 MB per 1 OSD.
- Checksums: 128 MB per 1 TB of data.
--- a/docs/usage/admin.ru.md
+++ b/docs/usage/admin.ru.md
@ -1,211 +0,0 @@
-[Документация](../../README-ru.md#документация) → Использование → Администрирование
-
-----
-
-[Read in English](admin.en.md)
-
-# Администрирование
-
- [Состояния пулов](#состояния-пулов)
- [Состояния PG](#состояния-pg)
-  - [Базовые состояния PG](#базовые-состояния-pg)
-  - [Дополнительные состояния PG](#дополнительные-состояния-pg)
- [Удаление исправного диска](#удаление-исправного-диска)
- [Удаление неисправного диска](#удаление-неисправного-диска)
- [Добавление диска](#добавление-диска)
- [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов)
- [Обновление Vitastor](#обновление-vitastor)
- [Потребление памяти OSD](#потребление-памяти-osd)
-
-## Состояния пулов
-
-Пул активен — то есть, полностью доступен для клиентского ввода-вывода — когда все его PG
-активны, то есть, имеют статус active, возможно, с любым набором дополнительных флагов.
-
-Если хотя бы 1 PG неактивна, пул неактивен и все клиенты зависают и ждут, пока вы почините
-кластер. :-)
-
-## Состояния PG
-
-Вы можете видеть состояния PG в выводе команды [vitastor-cli status](cli.ru.md#status).
-
-Состояние PG состоит из ровно 1 базового флага состояния, плюс любого числа дополнительных.
-
-### Базовые состояния PG
-
-Состояние PG включает в себя ровно 1 флаг из следующих:
- **active** — PG активна и обрабатывает запросы ввода-вывода от пользователей.
- **incomplete** — Недостаточно живых OSD, чтобы включить эту PG.
-  То есть, дисков потеряно больше, чем разрешено схемой отказоустойчивости пула и pg_minsize.
-  Например, если у пула pg_size=3 и pg_minsize=1, то часть данных может записаться всего на 1 OSD.
-  Если потом конкретно этот OSD упадёт, PG окажется **incomplete**.
- **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
-  (если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
-  назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
-  либо PG остановлена монитором через флаг `pause: true` в `/vitastor/config/pgs` в etcd.
- **starting** — первичный OSD захватил блокировку PG в etcd, PG запускается.
- **peering** — первичный OSD опрашивает вторичные OSD на предмет списков объектов данной PG и рассчитывает её состояние.
- **repeering** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **peering**.
- **stopping** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **offline** или поднимется на другом OSD.
-
-Все состояния, кроме **active**, означают, что PG неактивна и ввод-вывод приостановлен.
-
-Состояние **peering** в норме заметно только при перезапуске OSD или переключении первичных
-OSD, на протяжении небольшого периода времени.
-
-Состояния **starting**, **repeering**, **stopping** в норме практически не заметны вообще,
-PG должны очень быстро переходить из них в другие. Если эти состояния заметны
-хоть сколько-то значительное время — вероятно, какие-то операции на каких-то OSD зависли.
-Чтобы найти их, ищите "slow op" в журналах OSD — операции, зависшие дольше,
-чем на [slow_log_interval](../config/osd.ru.md#slow_log_interval), записываются в
-журналы OSD как "slow op".
-
-Диаграмма переходов:
-
-![Диаграмма переходов](pg_states.svg "Диаграмма переходов")
-
-### Дополнительные состояния PG
-
-Если PG активна, она также может иметь любое число дополнительных флагов состояний:
-
- **degraded** — PG поднята на неполном числе дисков (OSD), избыточность хранения всех объектов снижена.
- **has_incomplete** — часть объектов в PG неполные (невосстановимые), то есть, у них потеряно
-  слишком много EC-частей (больше, чем [parity_chunks](../config/pool.ru.md#parity_chunks) пула).
- **has_degraded** — часть объектов в PG деградированы, избыточность их хранения снижена по сравнению
-  с остальным содержимым данной PG (то есть, PG может одновременно быть degraded+has_degraded).
-  Данные объекты должны восстановиться автоматически, если только восстановление не отключено
-  через [no_recovery](../config/osd.ru.md#no_recovery).
- **has_misplaced** — часть объектов в PG сейчас расположена не на целевом наборе OSD этой PG.
-  Данные объекты должны переместиться автоматически, если только перебалансировка не отключена
-  через [no_rebalance](../config/osd.ru.md#no_rebalance). Объекты, являющиеся одновременно
-  degraded и misplaced, считаются просто degraded.
- **has_unclean** — ещё одно состояние, в норме заметное только очень короткое время при поднятии PG.
-  Применяется только к EC и означает, что на каких-то OSD этой PG есть EC-части объектов, для которых
-  был начат, но не завершён процесс записи. Все такие объекты первичный OSD либо завершает, либо
-  откатывает при поднятии PG первым делом, поэтому состояние и не должно быть заметно. Опять-таки,
-  если оно заметно — значит, скорее всего, операции отката или завершения записи на каких-то OSD зависли.
- **has_invalid** — в PG найдены объекты с некорректными ID части. В норме не проявляется вообще
-  никогда, проявляется только если, не удалив данные, создать на месте EC-пула либо реплика-пул,
-  либо EC-пул с меньшим числом частей данных.
- **has_corrupted** — в PG есть повреждённые объекты, обнаруженные с помощью контрольных сумм или
-  скраба (сверки копий). Если объекты можно восстановить, они восстановятся автоматически. Если
-  не восстанавливаются, используйте команду [vitastor-cli describe](cli.ru.md#describe) для
-  выяснения деталей и/или смотрите в журнал первичного OSD данной PG.
- **has_inconsistent** — в PG есть объекты, у которых не совпадают копии/части данных на разных OSD,
-  и при этом автоматически определить, какая копия верная, а какая нет, невозможно. Такое может
-  произойти, если вы используете 2 реплики, не включали контрольные суммы, и на одной из реплик
-  данные повредились. В этом случае тоже надо использовать команды vitastor-cli [describe](cli.ru.md#describe)
-  и [fix](cli.ru.md#fix) для удаления некорректной версии.
- **left_on_dead** — часть данных PG осталась на отключённом, но не удалённом из кластера окончательно,
-  OSD. Вам нужно либо вернуть соответствующий OSD в строй и дать ему очистить лишние данные, либо
-  удалить его из кластера окончательно с помощью vitastor-cli [rm-osd](cli.ru.md#rm-osd), если
-  известно, что он уже не вернётся (например, если умер диск).
- **scrubbing** — идёт фоновая проверка данных PG ([скраб](../config/osd.ru.md#auto_scrub)).
-
-## Удаление исправного диска
-
-Перед удалением исправного диска из кластера установите его OSD вес в 0, чтобы убрать с него данные.
-Для этого добавьте в ключ `/vitastor/config/osd/<НОМЕР_OSD>` в etcd значение `"reweight":0`, например:
-
-```
-etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
-```
-
-Дождитесь завершения ребаланса, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
-
-## Удаление неисправного диска
-
-Если диск уже умер, его OSD, скорее всего, уже будет/будут остановлен(ы).
-
-В этом случае просто удалите OSD из etcd командой `vitastor-cli rm-osd НОМЕР_OSD`.
-
-## Добавление диска
-
-Если сервер новый, установите на него пакеты Vitastor и скопируйте файл конфигурации
-`/etc/vitastor/vitastor.conf`.
-
-После этого достаточно выполнить команду `vitastor-disk prepare /dev/nvmeXXX`, разумеется,
-с параметрами, аналогичными другим OSD в вашем кластере.
-
-## Восстановление потерянной конфигурации пулов
-
-Если удалить или повредить ключ `/vitastor/config/pools` в etcd, все пулы будут удалены.
-Не волнуйтесь, данные потеряны не будут, но вам нужно будет провести специальную
-процедуру восстановления.
-
-Сначала нужно будет восстановить конфигурацию пулов, создав пул с таким же ID и
-с такими же параметрами EC/реплик, и подождать, пока PG пула появятся в `vitastor-cli status`.
-
-Далее нужно будет добавить все OSD в исторические записи всех PG. Примерно так
-(только подставьте свои PG_COUNT и POOL_ID):
-
-```
-PG_COUNT=32
-POOL_ID=1
-ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
-    perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
-for i in $(seq 1 $PG_COUNT); do
-    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
-done
-```
-
-После этого все PG должны пройти peering и найти все предыдущие данные.
-
-## Обновление Vitastor
-
-Обычно каждая следующая версия Vitastor совместима с предыдущими и "вперёд", и "назад"
-с точки зрения сетевого протокола и структур данных в etcd.
-
-Так что по умолчанию, если на данной странице не указано обратное, считается, что для
-обновления достаточно обновить пакеты и перезапустить все OSD и мониторы Vitastor в
-произвольном порядке.
-
-Обновление производится без остановки клиентов (виртуальных машин/контейнеров), для этого
-достаточно обновлять серверы по одному. Однако, конечно, чтобы запущенные виртуальные машины
-начали использовать новую версию клиентской библиотеки, их тоже нужно перезапустить.
-
-Исключения (особые указания при обновлении):
- Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
-  рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
-  потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
-  могли приводить к некорректному чтению данных в деградированных EC-пулах.
- Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
-  нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
-  Иначе клиентский ввод-вывод зависнет до завершения обновления.
- Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
-
-Откат:
- В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
-  нельзя откатить до версии 0.9.x и более ранних.
- В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD нельзя откатить
-  до 0.7.x или 0.6.x. :-)
-
-## Потребление памяти OSD
-
-Основное потребление памяти складывается из:
-
- Индекс метаданных: `размер_данных`/[`block_size`](../config/layout-cluster.ru.md#block_size) * `примерно 1.1` * `32` байт.
-  Потребляется всегда.
- Копия дисковой области метаданных: `размер_данных`/[`block_size`](../config/layout-cluster.ru.md#block_size) * `28` байт.
-  Потребляется, если не отключена настройка [inmemory_metadata](../config/osd.ru.md#inmemory_metadata).
- Битмапы: `размер_данных`/[`bitmap_granularity`](../config/layout-cluster.ru.md#bitmap_granularity)/`8` * `2` байт.
-  Потребляется всегда.
- Индекс журнала: от 0 до, приблизительно, размера журнала. Потребляется всегда.
- Копия дисковой области журнала: в точности размер журнала. Потребляется,
-  если не отключена настройка [inmemory_journal](../config/osd.ru.md#inmemory_journal).
- Контрольные суммы: `размер_данных`/[`csum_block_size`](../config/osd.ru.md#csum_block_size) * `4` байт.
-  Потребляется, если включены контрольные суммы и не отключена настройка [inmemory_metadata](../config/osd.ru.md#inmemory_metadata).
-
-bitmap_granularity, как правило, никогда не меняется и равен 4 килобайтам.
-
-Таким образом, при SSD-настройках по умолчанию (block_size=128k, journal_size=32M, csum_block_size=4k) потребляется:
-
- Метаданные и битмапы: ~600 МБ на 1 ТБ данных
- Журнал: до 64 МБ на 1 OSD
- Контрольные суммы: 1 ГБ на 1 ТБ данных
-
-При HDD-настройках по умолчанию (block_size=1M, journal_size=128M, csum_block_size=32k):
-
- Метаданные и битмапы: ~128 МБ на 1 ТБ данных
- Журнал: до 256 МБ на 1 OSD
- Контрольные суммы: 128 МБ на 1 ТБ данных
--- a/docs/usage/cli.en.md
+++ b/docs/usage/cli.en.md
@ -24,16 +24,11 @@ It supports the following commands:
 - [fix](#fix)
 - [alloc-osd](#alloc-osd)
 - [rm-osd](#rm-osd)
- [create-pool](#create-pool)
- [modify-pool](#modify-pool)
- [ls-pools](#ls-pools)
- [rm-pool](#rm-pool)

 Global options:

 ```
--config_file FILE   Path to Vitastor configuration file
--etcd_address URL   Etcd connection address
+--etcd_address ADDR  Etcd connection address
 --iodepth N          Send N operations in parallel to each OSD when possible (default 32)
 --parallel_osds M    Work with M osds in parallel when possible (default 4)
 --progress 1|0       Report progress (default 1)
@ -135,18 +130,19 @@ See also about [how to export snapshots](qemu.en.md#exporting-snapshots).

 ## modify

-`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]`
+`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`

 Rename, resize image or change its readonly status. Images with children can't be made read-write.
 If the new size is smaller than the old size, extra data will be purged.
 You should resize file system in the image, if present, before shrinking it.

-* `-f|--force` - Proceed with shrinking or setting readwrite flag even if the image has children.
-* `--down-ok` - Proceed with shrinking even if some data will be left on unavailable OSDs.
+```
+-f|--force  Proceed with shrinking or setting readwrite flag even if the image has children.
+```

 ## rm

-`vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`
+`vitastor-cli rm <from> [<to>] [--writers-stopped]`

 Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
 rebasing all their children accordingly. --writers-stopped allows merging to be a bit
@ -154,10 +150,6 @@ more effective in case of a single 'slim' read-write child and 'fat' removed par
 the child is merged into parent and parent is renamed to child in that case.
 In other cases parent layers are always merged into children.

-Other options:
-
-* `--down-ok` - Continue deletion/merging even if some data will be left on unavailable OSDs.
-
 ## flatten

 `vitastor-cli flatten <layer>`
@ -186,9 +178,11 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`

 ## describe

-`vitastor-cli describe [OPTIONS]`
+`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
+    [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
+    [--min-offset <offset>] [--max-offset <offset>]`

-Describe unclean object locations in the cluster. Options:
+Describe unclean object locations in the cluster.

 ```
 --osds <osds>
@ -198,8 +192,6 @@ Describe unclean object locations in the cluster. Options:
    degraded, misplaced, incomplete, corrupted, inconsistent.
 --pool <pool name or number>
    Only list objects in the given pool.
--pg <pg number>
-    Only list objects in the given PG of the pool.
 --inode, --min-inode, --max-inode
    Restrict listing to specific inode numbers.
 --min-offset, --max-offset
@ -245,93 +237,3 @@ Refuses to remove OSDs with data without `--force` and `--allow-data-loss`.

 With `--dry-run` only checks if deletion is possible without data loss and
 redundancy degradation.
-
-## create-pool
-
-`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
-
-Create a pool. Required parameters:
-
-| <!-- -->                 | <!-- -->                                                                              |
-|--------------------------|---------------------------------------------------------------------------------------|
-| `-s R` or `--pg_size R`  | Number of replicas for replicated pools                                               |
-| `--ec N+K`               | Number of data (N) and parity (K) chunks for erasure-coded pools                      |
-| `-n N` or `--pg_count N` | PG count for the new pool (start with 10*<OSD count>/pg_size rounded to a power of 2) |
-
-Optional parameters:
-
-| <!-- -->                       | <!-- -->                                                                   |
-|--------------------------------|----------------------------------------------------------------------------|
-| `--pg_minsize <number>`        | R or N+K minus number of failures to tolerate without downtime ([details](../config/pool.en.md#pg_minsize)) |
-| `--failure_domain host`        | Failure domain: host, osd or a level from placement_levels. Default: host  |
-| `--root_node <node>`           | Put pool only on child OSDs of this placement tree node                    |
-| `--osd_tags <tag>[,<tag>]...`  | Put pool only on OSDs tagged with all specified tags                       |
-| `--block_size 128k`            | Put pool only on OSDs with this data block size                            |
-| `--bitmap_granularity 4k`      | Put pool only on OSDs with this logical sector size                        |
-| `--immediate_commit none`      | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
-| `--level_placement <rules>`    | Use additional failure domain rules (example: "dc=112233")                 |
-| `--raw_placement <rules>`      | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
-| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags               |
-| `--scrub_interval <time>`      | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y  |
-| `--used_for_fs <name>`         | Mark pool as used for VitastorFS with metadata in image <name>             |
-| `--pg_stripe_size <number>`    | Increase object grouping stripe                                            |
-| `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input                  |
-| `--wait`                       | Wait for the new pool to come online                                       |
-| `-f` or `--force`              | Do not check that cluster has enough OSDs to create the pool               |
-
-See also [Pool configuration](../config/pool.en.md) for detailed parameter descriptions.
-
-Examples:
-
-`vitastor-cli create-pool test_x4 -s 4 -n 32`
-
-`vitastor-cli create-pool test_ec42 --ec 4+2 -n 32`
-
-## modify-pool
-
-`vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]`
-
-Modify an existing pool. Modifiable parameters:
-
-```
-[-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]
-[--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--no_inode_stats 0|1]
-[--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]
-```
-
-Non-modifiable parameters (changing them WILL lead to data loss):
-
-```
-[--block_size <size>] [--bitmap_granularity <size>]
-[--immediate_commit <all|small|none>] [--pg_stripe_size <size>]
-```
-
-These, however, can still be modified with -f|--force.
-
-See [create-pool](#create-pool) for parameter descriptions.
-
-Examples:
-
-`vitastor-cli modify-pool pool_A --name pool_B`
-
-`vitastor-cli modify-pool 2 --pg_size 4 -n 128`
-
-## rm-pool
-
-`vitastor-cli rm-pool|pool-rm [--force] <id|name>`
-
-Remove a pool. Refuses to remove pools with images without `--force`.
-
-## ls-pools
-
-`vitastor-cli ls-pools|pool-ls|ls-pool|pools [-l] [--detail] [--sort FIELD] [-r] [-n N] [--stats] [<glob> ...]`
-
-List pools (only matching `<glob>` patterns if passed).
-
-| <!-- -->             | <!-- -->                                              |
-|----------------------|-------------------------------------------------------|
-| `-l` or `--long`     | Also report I/O statistics                            |
-| `--detail`           | Use list format (not table), show all details         |
-| `--sort FIELD`       | Sort by specified field (see fields in --json output) |
-| `-r` or `--reverse`  | Sort in descending order                              |
-| `-n` or `--count N`  | Only list first N items                               |
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@ -23,16 +23,11 @@ vitastor-cli - интерфейс командной строки для адм
 - [merge-data](#merge-data)
 - [alloc-osd](#alloc-osd)
 - [rm-osd](#rm-osd)
- [create-pool](#create-pool)
- [modify-pool](#modify-pool)
- [ls-pools](#ls-pools)
- [rm-pool](#rm-pool)

 Глобальные опции:

 ```
--config_file FILE   Путь к файлу конфигурации Vitastor
--etcd_address URL   Адрес соединения с etcd
+--etcd_address ADDR  Адрес соединения с etcd
 --iodepth N          Отправлять параллельно N операций на каждый OSD (по умолчанию 32)
 --parallel_osds M    Работать параллельно с M OSD (по умолчанию 4)
 --progress 1|0       Печатать прогресс выполнения (по умолчанию 1)
@ -89,8 +84,8 @@ kaveri    2/1     32   0 B      10 G    0 B        100%    0%

 `vitastor-cli ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<glob> ...]`

-Показать список образов, если передан(ы) шаблон(ы) `<glob>`, то только с именами,
-соответствующими одному из шаблонов (стандартные ФС-шаблоны с * и ?).
+Показать список образов, если переданы шаблоны `<glob>`, то только с именами,
+соответствующими этим шаблонам (стандартные ФС-шаблоны с * и ?).

 Опции:

@ -136,7 +131,7 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>

 ## modify

-`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]`
+`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`

 Изменить размер, имя образа или флаг "только для чтения". Снимать флаг "только для чтения"
 и уменьшать размер образов, у которых есть дочерние клоны, без `--force` нельзя.
@ -144,12 +139,13 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
 Если новый размер меньше старого, "лишние" данные будут удалены, поэтому перед уменьшением
 образа сначала уменьшите файловую систему в нём.

-* `-f|--force` - Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны.
-* `--down-ok` - Разрешить уменьшение, даже если часть данных останется неудалённой на недоступных OSD.
+```
+-f|--force  Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны.
+```

 ## rm

-`vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`
+`vitastor-cli rm <from> [<to>] [--writers-stopped]`

 Удалить образ `<from>` или все слои от `<from>` до `<to>` (`<to>` должен быть дочерним
 образом `<from>`), одновременно меняя родительские образы их клонов (если таковые есть).
@ -161,10 +157,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>

 В других случаях родительские слои вливаются в дочерние.

-Другие опции:
-
-* `--down-ok` - Продолжать удаление/слияние, даже если часть данных останется неудалённой на недоступных OSD.
-
 ## flatten

 `vitastor-cli flatten <layer>`
@ -194,10 +186,12 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>

 ## describe

-`vitastor-cli describe [ОПЦИИ]`
+`vitastor-cli describe [--osds <osds>] [--object-state <состояния>] [--pool <пул>]
+    [--inode <номер>] [--min-inode <номер>] [--max-inode <номер>]
+    [--min-offset <смещение>] [--max-offset <смещение>]`

 Описать состояние "грязных" объектов в кластере, то есть таких объектов, копии
-или части которых хранятся на наборе OSD, не равном целевому. Опции:
+или части которых хранятся на наборе OSD, не равном целевому.

 ```
 --osds <osds>
@ -212,8 +206,6 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
    - inconsistent - неконсистентный, с неоднозначным расхождением копий/частей
 --pool <имя или ID пула>
    Перечислять только объекты из заданного пула.
--pg <номер PG>
-    Перечислять только объекты из заданной PG пула.
 --inode, --min-inode, --max-inode
    Перечислять только объекты из указанных номеров инодов (образов).
 --min-offset, --max-offset
@ -262,93 +254,3 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>

 С опцией `--dry-run` только проверяет, возможно ли удаление без потери данных и деградации
 избыточности.
-
-## create-pool
-
-`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
-
-Создать пул. Обязательные параметры:
-
-| <!-- -->                  | <!-- -->                                                                                    |
-|---------------------------|---------------------------------------------------------------------------------------------|
-| `-s R` или `--pg_size R`  | Число копий данных для реплицированных пулов                                                |
-| `--ec N+K`                | Число частей данных (N) и чётности (K) для пулов с кодами коррекции ошибок                  |
-| `-n N` или `--pg_count N` | Число PG для нового пула (начните с 10*<число OSD>/pg_size, округлённого до степени двойки) |
-
-Необязательные параметры:
-
-| <!-- -->                       | <!-- -->                                                                   |
-|--------------------------------|----------------------------------------------------------------------------|
-| `--pg_minsize <number>`        | (R или N+K) минус число разрешённых отказов без остановки пула ([подробнее](../config/pool.ru.md#pg_minsize)) |
-| `--failure_domain host`        | Домен отказа: host, osd или другой из placement_levels. По умолчанию: host |
-| `--root_node <node>`           | Использовать для пула только дочерние OSD этого узла дерева размещения     |
-| `--osd_tags <tag>[,<tag>]...`  | ...только OSD со всеми заданными тегами                                    |
-| `--block_size 128k`            | ...только OSD с данным размером блока                                      |
-| `--bitmap_granularity 4k`      | ...только OSD с данным размером логического сектора                        |
-| `--immediate_commit none`      | ...только OSD с этим или большим immediate_commit (none < small < all)     |
-| `--level_placement <rules>`    | Задать правила дополнительных доменов отказа (пример: "dc=112233")         |
-| `--raw_placement <rules>`      | Задать низкоуровневые правила генерации PG ([детали](../config/pool.ru.md#raw_placement)) |
-| `--primary_affinity_tags tags` | Предпочитать OSD со всеми данными тегами для роли первичных                |
-| `--scrub_interval <time>`      | Включить скрабы с заданным интервалом времени (число + единица s/m/h/d/M/y) |
-| `--pg_stripe_size <number>`    | Увеличить блок группировки объектов по PG                                  |
-| `--max_osd_combinations 10000` | Максимальное число случайных комбинаций OSD для ЛП-солвера                 |
-| `--wait`                       | Подождать, пока новый пул будет активирован                                |
-| `-f` или `--force`             | Не проверять, что в кластере достаточно доменов отказа для создания пула   |
-
-Подробно о параметрах см. [Конфигурация пулов](../config/pool.ru.md).
-
-Примеры:
-
-`vitastor-cli create-pool test_x4 -s 4 -n 32`
-
-`vitastor-cli create-pool test_ec42 --ec 4+2 -n 32`
-
-## modify-pool
-
-`vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]`
-
-Изменить настройки существующего пула. Изменяемые параметры:
-
-```
-[-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]
-[--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>]
-[--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]
-```
-
-Неизменяемые параметры (их изменение ПРИВЕДЁТ к потере данных):
-
-```
-[--block_size <size>] [--bitmap_granularity <size>]
-[--immediate_commit <all|small|none>] [--pg_stripe_size <size>]
-```
-
-Эти параметры можно изменить, только если явно передать опцию -f или --force.
-
-Описания параметров смотрите в [create-pool](#create-pool).
-
-Примеры:
-
-`vitastor-cli modify-pool pool_A --name pool_B`
-
-`vitastor-cli modify-pool 2 --pg_size 4 -n 128`
-
-## rm-pool
-
-`vitastor-cli rm-pool|pool-rm [--force] <id|name>`
-
-Удалить пул. Отказывается удалять пул, в котором ещё есть образы, без `--force`.
-
-## ls-pools
-
-`vitastor-cli ls-pools|pool-ls|ls-pool|pools [-l] [--detail] [--sort FIELD] [-r] [-n N] [--stats] [<glob> ...]`
-
-Показать список пулов. Если передан(ы) шаблон(ы) `<glob>`, то только с именами,
-соответствующими одному из шаблонов (стандартные ФС-шаблоны с * и ?).
-
-| <!-- -->              | <!-- -->                                                   |
-|-----------------------|------------------------------------------------------------|
-| `-l` или `--long`     | Вывести также статистику ввода-вывода                      |
-| `--detail`            | Максимально подробный вывод в виде списка (а не таблицы)   |
-| `--sort FIELD`        | Сортировать по заданному полю (поля см. в выводе с --json) |
-| `-r` или `--reverse`  | Сортировать в обратном порядке                             |
-| `-n` или `--count N`  | Выводить только первые N записей                           |
--- a/docs/usage/disk.en.md
+++ b/docs/usage/disk.en.md
@ -17,7 +17,6 @@ It supports the following commands:
 - [purge](#purge)
 - [read-sb](#read-sb)
 - [write-sb](#write-sb)
- [update-sb](#update-sb)
 - [udev](#udev)
 - [exec-osd](#exec-osd)
 - [pre-exec](#pre-exec)
@ -88,7 +87,7 @@ Options (both modes):
 --block_size 1M/128k       Set blockstore object size
 --bitmap_granularity 4k    Set bitmap granularity
 --data_csum_type none      Set data checksum type (crc32c or none)
--csum_block_size 4k/32k   Set data checksum block size (SSD/HDD default)
+--csum_block_size 4k       Set data checksum block size
 --data_device_block 4k     Override data device block size
 --meta_device_block 4k     Override metadata device block size
 --journal_device_block 4k  Override journal device block size
@ -103,7 +102,7 @@ checks the device cache status on start and tries to disable cache for SATA/SAS
 If it doesn't succeed it issues a warning in the system log.

 You can also pass other OSD options here as arguments and they'll be persisted
-in the superblock: cached_io_data, cached_io_meta, cached_io_journal,
+in the superblock: cached_read_data, cached_read_meta, cached_read_journal,
 inmemory_metadata, inmemory_journal, max_write_iodepth,
 min_flusher_count, max_flusher_count, journal_sector_buffer_count,
 journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
@ -183,14 +182,6 @@ Try to read Vitastor OSD superblock from `<device>` and print it in JSON format.

 Read JSON from STDIN and write it into Vitastor OSD superblock on `<device>`.

-## update-sb
-
-`vitastor-disk update-sb <device> [--force] [--<parameter> <value>] [...]`
-
-Read Vitastor OSD superblock from `<device>`, update parameters in it and write it back.
-
-`--force` allows ignoring validation errors.
-
 ## udev

 `vitastor-disk udev <device>`
@ -261,7 +252,7 @@ Options (see also [Cluster-Wide Disk Layout Parameters](../config/layout-cluster
 ```
 --object_size 128k       Set blockstore block size
 --bitmap_granularity 4k  Set bitmap granularity
--journal_size 32M       Set journal size
+--journal_size 16M       Set journal size
 --data_csum_type none    Set data checksum type (crc32c or none)
 --csum_block_size 4k     Set data checksum block size
 --device_block_size 4k   Set device block size
--- a/docs/usage/disk.ru.md
+++ b/docs/usage/disk.ru.md
@ -17,7 +17,6 @@ vitastor-disk - инструмент командной строки для уп
 - [purge](#purge)
 - [read-sb](#read-sb)
 - [write-sb](#write-sb)
- [update-sb](#update-sb)
 - [udev](#udev)
 - [exec-osd](#exec-osd)
 - [pre-exec](#pre-exec)
@ -89,7 +88,7 @@ vitastor-disk - инструмент командной строки для уп
 --block_size 1M/128k       Задать размер объекта хранилища
 --bitmap_granularity 4k    Задать гранулярность битовых карт
 --data_csum_type none      Задать тип контрольных сумм (crc32c или none)
--csum_block_size 4k/32k   Задать размер блока расчёта контрольных сумм (дефолт SSD/HDD)
+--csum_block_size 4k       Задать размер блока расчёта контрольных сумм
 --data_device_block 4k     Задать размер блока устройства данных
 --meta_device_block 4k     Задать размер блока метаданных
 --journal_device_block 4k  Задать размер блока журнала
@ -104,8 +103,8 @@ vitastor-disk - инструмент командной строки для уп
 это не удаётся, в системный журнал выводится предупреждение.

 Вы можете передать данной команде и некоторые другие опции OSD в качестве аргументов
-и они тоже будут сохранены в суперблок: cached_io_data, cached_io_meta,
-cached_io_journal, inmemory_metadata, inmemory_journal, max_write_iodepth,
+и они тоже будут сохранены в суперблок: cached_read_data, cached_read_meta,
+cached_read_journal, inmemory_metadata, inmemory_journal, max_write_iodepth,
 min_flusher_count, max_flusher_count, journal_sector_buffer_count,
 journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
 throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
@ -188,15 +187,6 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.

 Прочитать JSON со стандартного ввода и записать его в суперблок OSD на диск `<device>`.

-## update-sb
-
-`vitastor-disk update-sb <device> [--force] [--<параметр> <значение>] [...]`
-
-Прочитать суперблок OSD с диска `<device>`, изменить в нём заданные параметры и записать обратно.
-
-Опция `--force` позволяет читать суперблок, даже если он считается некорректным
-из-за ошибок валидации.
-
 ## udev

 `vitastor-disk udev <device>`
@ -267,7 +257,7 @@ OSD отключены fsync-и.
 ```
 --object_size 128k       Размер блока хранилища
 --bitmap_granularity 4k  Гранулярность битовых карт
--journal_size 32M       Размер журнала
+--journal_size 16M       Размер журнала
 --data_csum_type none    Задать тип контрольных сумм (crc32c или none)
 --csum_block_size 4k     Задать размер блока расчёта контрольных сумм
 --device_block_size 4k   Размер блока устройства
--- a/docs/usage/fio.en.md
+++ b/docs/usage/fio.en.md
@ -14,13 +14,10 @@ Vitastor has a fio driver which can be installed from the package vitastor-fio.
 Use the following command as an example to run tests with fio against a Vitastor cluster:

 ```
-fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -image=testimg
+fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
 ```

 If you don't want to access your image by name, you can specify pool number, inode number and size
 (`-pool=1 -inode=1 -size=400G`) instead of the image name (`-image=testimg`).

-You can also specify etcd address(es) explicitly by adding `-etcd=10.115.0.10:2379/v3`, or you
-can override configuration file path by adding `-conf=/etc/vitastor/vitastor.conf`.
-
-See exact fio commands to use for benchmarking [here](../performance/understanding.en.md#fio-commands).
+See exact fio commands to use for benchmarking [here](../performance/understanding.en.md#fio-commands).
--- a/docs/usage/fio.ru.md
+++ b/docs/usage/fio.ru.md
@ -14,13 +14,10 @@
 Используйте следующую команду как пример для запуска тестов кластера Vitastor через fio:

 ```
-fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -image=testimg
+fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
 ```

 Вместо обращения к образу по имени (`-image=testimg`) можно указать номер пула, номер инода и размер:
 `-pool=1 -inode=1 -size=400G`.

-Вы также можете задать адрес(а) подключения к etcd явно, добавив `-etcd=10.115.0.10:2379/v3`,
-или переопределить путь к файлу конфигурации, добавив `-conf=/etc/vitastor/vitastor.conf`.
-
 Конкретные команды fio для тестирования производительности можно посмотреть [здесь](../performance/understanding.ru.md#команды-fio).
--- a/docs/usage/nbd.en.md
+++ b/docs/usage/nbd.en.md
@ -11,52 +11,40 @@ NBD stands for "Network Block Device", but in fact it also functions as "BUSE"
 NBD slightly lowers the performance due to additional overhead, but performance still
 remains decent (see an example [here](../performance/comparison1.en.md#vitastor-0-4-0-nbd)).

-See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.
+Vitastor Kubernetes CSI driver is based on NBD.

-Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.
+See also [VDUSE](qemu.en.md#vduse).

-Supports the following commands:
-
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
- [netlink-map](#netlink-map)
- [netlink-unmap](#netlink-unmap)
- [netlink-revive](#netlink-revive)
-
-## map
+## Map image

 To create a local block device for a Vitastor image run:

 ```
-vitastor-nbd map [/dev/nbdN] --image testimg
+vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 ```

 It will output a block device name like /dev/nbd0 which you can then use as a normal disk.

 You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.

-vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
+Additional options for map command:

-* `--nbd_timeout 0` \
-  Timeout for I/O operations in seconds after exceeding which the kernel stops the device.
-  Before Linux 5.19, if nbd_timeout is 0, a dead NBD device can't be removed from
-  the system at all without rebooting.
+* `--nbd_timeout 30` \
+  Timeout for I/O operations in seconds after exceeding which the kernel stops
+  the device. You can set it to 0 to disable the timeout, but beware that you
+  won't be able to stop the device at all if vitastor-nbd process dies.
 * `--nbd_max_devices 64 --nbd_max_part 3` \
  Options for the `nbd` kernel module when modprobing it (`nbds_max` and `max_part`).
+  Note that the maximum allowed value of (nbds_max)*(1+max_part) is 256.
 * `--logfile /path/to/log/file.txt` \
  Write log messages to the specified file instead of dropping them (in background mode)
  or printing them to the standard output (in foreground mode).
 * `--dev_num N` \
-  Use the specified device /dev/nbdN instead of automatic selection (alternative syntax
-  to /dev/nbdN positional parameter).
+  Use the specified device /dev/nbdN instead of automatic selection.
 * `--foreground 1` \
  Stay in foreground, do not daemonize.

-Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
-in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
-
-## unmap
+## Unmap image

 To unmap the device run:

@ -64,14 +52,12 @@ To unmap the device run:
 vitastor-nbd unmap /dev/nbd0
 ```

-## ls
+## List mapped images

 ```
 vitastor-nbd ls [--json]
 ```

-List mapped images.
-
 Example output (normal format):

 ```
@ -89,45 +75,3 @@ Example output (JSON format):
 ```
 {"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
 ```
-
-## netlink-map
-
-```
-vitastor-nbd netlink-map [/dev/nbdN] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
-```
-
-On recent kernel versions it's also possible to map NBD devices using netlink interface.
-
-This is an experimental feature because it doesn't solve all issues of NBD. Differences from regular ioctl-based 'map':
-
-1. netlink-map can create new `/dev/nbdN` devices (those not present in /dev/).
-2. netlink-mapped devices can be unmapped only using `netlink-unmap` command.
-3. netlink-mapped devices don't show up in `ls` output (yet).
-4. Dead netlink-mapped devices can be 'revived' using `netlink-revive`.
-   However, old I/O requests will hang forever if `nbd_timeout` is not specified.
-5. netlink-map supports additional options:
-
-* `--nbd_conn_timeout 0` \
-  Disconnect a dead device automatically after this number of seconds.
-* `--nbd_destroy_on_disconnect 1` \
-  Delete the nbd device on disconnect.
-* `--nbd_disconnect_on_close 1` \
-  Disconnect the nbd device on close by last opener.
-* `--nbd_ro 1` \
-  Set device into read only mode.
-
-## netlink-unmap
-
-```
-vitastor-nbd netlink-unmap /dev/nbdN
-```
-
-Unmap a device using netlink interface. Works with both netlink and ioctl mapped devices.
-
-## netlink-revive
-
-```
-vitastor-nbd netlink-revive /dev/nbdX (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
-```
-
-Restart a dead NBD netlink-mapped device without removing it. Supports the same options as `netlink-map`.
--- a/docs/usage/nbd.ru.md
+++ b/docs/usage/nbd.ru.md
@ -14,25 +14,16 @@ NBD на данный момент необходимо, чтобы монтир
 NBD немного снижает производительность из-за дополнительных копирований памяти,
 но она всё равно остаётся на неплохом уровне (см. для примера [тест](../performance/comparison1.ru.md#vitastor-0-4-0-nbd)).

-Смотрите также [VDUSE](qemu.ru.md#vduse), как лучшую альтернативу NBD.
+CSI-драйвер Kubernetes Vitastor основан на NBD.

-CSI-драйвер Kubernetes Vitastor использует NBD, когда VDUSE недоступен.
+Смотрите также [VDUSE](qemu.ru.md#vduse).

-Поддерживаются следующие команды:
-
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
- [netlink-map](#netlink-map)
- [netlink-unmap](#netlink-unmap)
- [netlink-revive](#netlink-revive)
-
-## map
+## Подключить устройство

 Чтобы создать локальное блочное устройство для образа, выполните команду:

 ```
-vitastor-nbd map [/dev/nbdN] --image testimg
+vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
 ```

 Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
@ -41,16 +32,18 @@ vitastor-nbd map [/dev/nbdN] --image testimg
 Для обращения по номеру инода, аналогично другим командам, можно использовать опции
 `--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.

-vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
-плюс специфичные для NBD:
+Дополнительные опции для команды подключения NBD-устройства:

-* `--nbd_timeout 0` \
+* `--nbd_timeout 30` \
  Максимальное время выполнения любой операции чтения/записи в секундах, при
-  превышении которого ядро остановит NBD-устройство. На ядрах Linux старее 5.19,
-  если таймаут установлен в 0, NBD-устройство вообще невозможно отключить из системы
-  при нештатном завершении процесса.
+  превышении которого ядро остановит NBD-устройство. Вы можете установить опцию
+  в 0, чтобы отключить ограничение времени, но имейте в виду, что в этом случае
+  вы вообще не сможете отключить NBD-устройство при нештатном завершении процесса
+  vitastor-nbd.
 * `--nbd_max_devices 64 --nbd_max_part 3` \
-  Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd (`nbds_max` и `max_part`).
+  Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd
+  (`nbds_max` и `max_part`). Имейте в виду, что (nbds_max)*(1+max_part)
+  обычно не должно превышать 256.
 * `--logfile /path/to/log/file.txt` \
  Писать сообщения о процессе работы в заданный файл, вместо пропуска их
  при фоновом режиме запуска или печати на стандартный вывод при запуске
@ -60,11 +53,7 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
 * `--foreground 1` \
  Не уводить процесс в фоновый режим.

-Обратите внимание, что опции `nbd_timeout`, `nbd_max_devices` и `nbd_max_part` можно
-также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
-заданном опцией `--config_file`.
-
-## unmap
+## Отключить устройство

 Для отключения устройства выполните:

@ -72,14 +61,12 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
 vitastor-nbd unmap /dev/nbd0
 ```

-## ls
+## Вывести подключённые устройства

 ```
 vitastor-nbd ls [--json]
 ```

-Вывести подключённые устройства.
-
 Пример вывода в обычном формате:

 ```
@ -97,46 +84,3 @@ pid: 584546
 ```
 {"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
 ```
-
-## netlink-map
-
-```
-vitastor-nbd netlink-map [/dev/nbdN] (--image <image> | --pool <POOL> --inode <INODE> --size <SIZE>)
-```
-
-На свежих версиях ядра Linux также возможно подключать NBD-устройства через интерфейс netlink.
-
-Это экспериментальная функция, так как она не решает всех проблем NBD. Отличия от обычного 'map':
-
-1. Можно создавать новые `/dev/nbdN` устройства (отсутствующие в /dev/).
-2. Отключать netlink-устройства можно только командой `netlink-unmap`.
-3. netlink-устройства не видно в выводе `ls` (пока что).
-4. Мёртвые netlink-устройства можно "оживить" командой `netlink-revive`. Правда, предыдущие
-   запросы ввода-вывода всё равно зависнут навсегда, если `nbd_timeout` не задан.
-5. Поддерживаются дополнительные опции:
-
-* `--nbd_conn_timeout 0` \
-  Отключать мёртвое устройство автоматически через данное число секунд.
-* `--nbd_destroy_on_disconnect 1` \
-  Удалять NBD-устройство при отключении.
-* `--nbd_disconnect_on_close 1` \
-  Отключать NBD-устройство автоматически, когда его все закроют.
-* `--nbd_ro 1` \
-  Установить для NBD-устройства режим "только для чтения".
-
-## netlink-unmap
-
-```
-vitastor-nbd netlink-unmap /dev/nbdN
-```
-
-Отключить устройство через интерфейс netlink. Работает и с обычными, и с netlink-устройствами.
-
-## netlink-revive
-
-```
-vitastor-nbd netlink-revive /dev/nbdX (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
-```
-
-Оживить мёртвое NBD-устройство, ранее подключённое через netlink, без удаления. Поддерживает
-те же опции, что и `netlink-map`.
--- a/docs/usage/nfs.en.md
+++ b/docs/usage/nfs.en.md
@ -1,153 +1,45 @@
-[Documentation](../../README.md#documentation) → Usage → VitastorFS and pseudo-FS
+[Documentation](../../README.md#documentation) → Usage → NFS

 -----

 [Читать на русском](nfs.ru.md)

-# VitastorFS and pseudo-FS
+# NFS

-Vitastor has two file system implementations. Both can be used via `vitastor-nfs`.
+Vitastor has a simplified NFS 3.0 proxy for file-based image access emulation. It's not
+suitable as a full-featured file system, at least because all file/image metadata is stored
+in etcd and kept in memory all the time - thus you can't put a lot of files in it.

-Commands:
- [mount](#mount)
- [start](#start)
+However, NFS proxy is totally fine as a method to provide VM image access and allows you to
+plug Vitastor into, for example, VMWare. It's important to note that for VMWare it's a much
+better access method than iSCSI, because with iSCSI we'd have to put all VM images into one
+Vitastor image exported as a LUN to VMWare and formatted with VMFS. VMWare doesn't use VMFS
+over NFS.

-## Pseudo-FS
+NFS proxy is stateless if you use immediate_commit=all mode (for SSD with capacitors or
+HDDs with disabled cache), so you can run multiple NFS proxies and use a network load
+balancer or any failover method you want in that case.

-Simplified pseudo-FS proxy is used for file-based image access emulation. It's not
-suitable as a full-featured file system: it lacks a lot of FS features, it stores
-all file/image metadata in memory and in etcd. So it's fine for hundreds or thousands
-of large files/images, but not for millions.
-
-Pseudo-FS proxy is intended for environments where other block volume access methods
-can't be used or impose additional restrictions - for example, VMWare. NFS is better
-for VMWare than, for example, iSCSI, because with iSCSI, VMWare puts all VM images
-into one large shared block image in its own VMFS file system, and with NFS, VMWare
-doesn't use VMFS and puts each VM disk in a regular file which is equal to one
-Vitastor block image, just as originally intended.
-
-To use Vitastor pseudo-FS locally, run `vitastor-nfs mount --block /mnt/vita`.
-
-Also you can start the network server:
+vitastor-nfs usage:

 ```
-vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
+vitastor-nfs [--etcd_address ADDR] [OTHER OPTIONS]
+
+--subdir <DIR>    export images prefixed <DIR>/ (default empty - export all images)
+--portmap 0       do not listen on port 111 (portmap/rpcbind, requires root)
+--bind <IP>       bind service to <IP> address (default 0.0.0.0)
+--nfspath <PATH>  set NFS export path to <PATH> (default is /)
+--port <PORT>     use port <PORT> for NFS services (default is 2049)
+--pool <POOL>     use <POOL> as default pool for new files (images)
+--foreground 1    stay in foreground, do not daemonize
 ```

-To mount the FS exported by this server, run:
+Example start and mount commands:

 ```
-mount server:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
+vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
 ```

-## VitastorFS
-
-VitastorFS is a full-featured clustered (Read-Write-Many) file system. It supports most POSIX
-features like hierarchical organization, symbolic links, hard links, quick renames and so on.
-
-VitastorFS metadata is stored in a Parallel Optimistic B-Tree key-value database,
-implemented over a regular Vitastor block volume. Directory entries and inodes
-are stored in a simple human-readable JSON format in the B-Tree. `vitastor-kv` tool
-can be used to inspect the database.
-
-To use VitastorFS:
-
-1. Create a pool or choose an existing empty pool for FS data
-2. Create an image for FS metadata, preferably in a faster (SSD or replica-HDD) pool,
-   but you can create it in the data pool too if you want (image size doesn't matter):
-   `vitastor-cli create -s 10G -p fastpool testfs`
-3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
-4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
-5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`
-
-### Supported POSIX features
-
- Read-after-write semantics (read returns new data immediately after write)
- Linear and random read and write
- Writing outside current file size
- Hierarchical structure, immediate rename of files and directories
- File size change support (truncate)
- Permissions (chmod/chown)
- Flushing data to stable storage (if required) (fsync)
- Symbolic links
- Hard links
- Special files (devices, sockets, named pipes)
- File modification and attribute change time tracking (mtime and ctime)
- Modification time (mtime) and last access time (atime) change support (utimes)
- Correct handling of directory listing during file creation/deletion
-
-### Limitations
-
-POSIX features currently not implemented in VitastorFS:
- File locking is not supported
- Actually used space is not counted, so `du` always reports apparent file sizes
-  instead of actually allocated space
- Access times (`atime`) are not tracked (like `-o noatime`)
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
-
-Other notable missing features which should be addressed in the future:
- Defragmentation of "shared" inodes. Files smaller than pool object size (block_size
-  multiplied by data part count if pool is EC) are internally stored in large block
-  volumes sequentially, one after another, and leave garbage after deleting or resizing.
-  Defragmentator will be implemented to collect this garbage.
- Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
-  in theory you may hit it if you create and delete a very large number of files
- Compaction of the key-value B-Tree. Current implementation never merges or deletes
-  B-Tree blocks, so B-Tree may become bloated over time. Currently you can
-  use `vitastor-kv dumpjson` & `loadjson` commands to recreate the index in such
-  situations.
- Filesystem check tool. VitastorFS doesn't have journal because it would impose a
-  severe performance hit, optimistic CAS-based transactions are used instead of it.
-  So, again, in theory an abnormal shutdown of the FS server may leave some garbage
-  in the DB. The FS is implemented in such a way that this garbage doesn't affect its
-  function, but having a tool to clean it up still seems a right thing to do.
-
-## Horizontal scaling
-
-Linux NFS 3.0 client doesn't support built-in scaling or failover, i.e. you can't
-specify multiple server addresses when mounting the FS.
-
-However, you can use any regular TCP load balancing over multiple NFS servers.
-It's absolutely safe with `immediate_commit=all` and `client_enable_writeback=false`
-settings, because Vitastor NFS proxy doesn't keep uncommitted data in memory
-with these settings. But it may even work without `immediate_commit=all` because
-the Linux NFS client repeats all uncommitted writes if it loses the connection.
-
-## Commands
-
-### mount
-
-`vitastor-nfs (--fs <NAME> | --block) [-o <OPT>] mount <MOUNTPOINT>`
-
-Start local filesystem server and mount file system to <MOUNTPOINT>.
-
-Use regular `umount <MOUNTPOINT>` to unmount the FS.
-
-The server will be automatically stopped when the FS is unmounted.
-
- `-o|--options <OPT>` - Pass additional NFS mount options (ex.: -o async).
-
-### start
-
-`vitastor-nfs (--fs <NAME> | --block) start`
-
-Start network NFS server. Options:
-
-| <!-- -->        | <!-- -->                                                   |
-|-----------------|------------------------------------------------------------|
-| `--bind <IP>`   | bind service to \<IP> address (default 0.0.0.0)            |
-| `--port <PORT>` | use port \<PORT> for NFS services (default is 2049)        |
-| `--portmap 0`   | do not listen on port 111 (portmap/rpcbind, requires root) |
-
-## Common options
-
-| <!-- -->           | <!-- -->                                                 |
-|--------------------|----------------------------------------------------------|
-| `--fs <NAME>`      | use VitastorFS with metadata in image \<NAME>            |
-| `--block`          | use pseudo-FS presenting images as files                 |
-| `--pool <POOL>`    | use \<POOL> as default pool for new files                |
-| `--subdir <DIR>`   | export \<DIR> instead of root directory (pseudo-FS only) |
-| `--nfspath <PATH>` | set NFS export path to \<PATH> (default is /)            |
-| `--pidfile <FILE>` | write process ID to the specified file                   |
-| `--logfile <FILE>` | log to the specified file                                |
-| `--foreground 1`   | stay in foreground, do not daemonize                     |
+```
+mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
+```
--- a/docs/usage/nfs.ru.md
+++ b/docs/usage/nfs.ru.md
@ -1,159 +1,44 @@
-[Документация](../../README-ru.md#документация) → Использование → VitastorFS и псевдо-ФС
+[Документация](../../README-ru.md#документация) → Использование → NFS

 -----

 [Read in English](nfs.en.md)

-# VitastorFS и псевдо-ФС
+# NFS

-В Vitastor есть две реализации файловой системы. Обе используются через `vitastor-nfs`.
+В Vitastor реализована упрощённая NFS 3.0 прокси для эмуляции файлового доступа к образам.
+Это не полноценная файловая система, т.к. метаданные всех файлов (образов) сохраняются
+в etcd и всё время хранятся в оперативной памяти - то есть, положить туда много файлов
+не получится.

-Команды:
- [mount](#mount)
- [start](#start)
+Однако в качестве способа доступа к образам виртуальных машин NFS прокси прекрасно подходит
+и позволяет подключить Vitastor, например, к VMWare.

-## Псевдо-ФС
+При этом, если вы используете режим immediate_commit=all (для SSD с конденсаторами или HDD
+с отключённым кэшем), то NFS-сервер не имеет состояния и вы можете свободно поднять
+его в нескольких экземплярах и использовать поверх них сетевой балансировщик нагрузки или
+схему с отказоустойчивостью.

-Упрощённая реализация псевдо-ФС используется для эмуляции файлового доступа к блочным
-образам Vitastor. Это не полноценная файловая система - в ней отсутствуют многие функции
-POSIX ФС, а метаданные всех файлов (образов) сохраняются в etcd и всё время хранятся в
-оперативной памяти - то есть, псевдо-ФС подходит для сотен или тысяч файлов, но не миллионов.
-
-Псевдо-ФС предназначена для доступа к образам виртуальных машин в средах, где другие
-способы невозможны или неудобны - например, в VMWare. Для VMWare это лучшая опция, чем
-iSCSI, так как при использовании iSCSI VMWare размещает все виртуальные машины в одном
-большом блочном образе внутри собственной ФС VMFS, а с NFS VMFS не используется и каждый
-диск ВМ представляется в виде одного файла, то есть, соответствует одному блочному образу
-Vitastor, как это и задумано изначально.
-
-Чтобы подключить псевдо-ФС Vitastor, выполните команду `vitastor-nfs mount --block /mnt/vita`.
-
-Либо же запустите сетевой вариант сервера:
+Использование vitastor-nfs:

 ```
-vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
+vitastor-nfs [--etcd_address ADDR] [ДРУГИЕ ОПЦИИ]
+
+--subdir <DIR>    экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
+--portmap 0       отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
+--bind <IP>       принимать соединения по адресу <IP> (по умолчанию 0.0.0.0 - на всех)
+--nfspath <PATH>  установить путь NFS-экспорта в <PATH> (по умолчанию /)
+--port <PORT>     использовать порт <PORT> для NFS-сервисов (по умолчанию 2049)
+--pool <POOL>     использовать пул <POOL> для новых образов (обязательно, если пул в кластере не один)
+--foreground 1    не уходить в фон после запуска
 ```

-Примонтировать ФС, запущенную с такими опциями, можно следующей командой:
+Пример монтирования Vitastor через NFS:

 ```
-mount server:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
+vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
 ```

-## VitastorFS
-
-VitastorFS - полноценная кластерная (Read-Write-Many) файловая система. Она поддерживает
-большую часть функций POSIX - иерархическую организацию, символические ссылки, жёсткие
-ссылки, быстрые переименования и так далее.
-
-Метаданные VitastorFS хранятся в собственной реализации БД формата ключ-значения,
-основанной на Параллельном Оптимистичном Б-дереве поверх обычного блочного образа Vitastor.
-И записи каталогов, и иноды, как обычно в Vitastor, хранятся в простом человекочитаемом
-JSON-формате :-). Для инспекции содержимого БД можно использовать инструмент `vitastor-kv`.
-
-Чтобы использовать VitastorFS:
-
-1. Создайте пул для данных ФС или выберите существующий пустой пул
-2. Создайте блочный образ для метаданных ФС, желательно, в более быстром пуле (на SSD
-   или по крайней мере на HDD, но без EC), но можно и в том же пуле, что данные
-   (размер образа значения не имеет):
-   `vitastor-cli create -s 10G -p fastpool testfs`
-3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
-4. Либо примонтируйте ФС: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
-5. Либо запустите сетевой NFS-сервер: `vitastor-nfs start --fs testfs --pool data-pool`
-
-### Поддерживаемые функции POSIX
-
- Чтение актуальной версии данных сразу после записи
- Последовательное и произвольное чтение и запись
- Запись за пределами текущего размера файла
- Иерархическая организация, мгновенное переименование файлов и каталогов
- Изменение размера файла (truncate)
- Права на файлы (chmod/chown)
- Фиксация данных на диски (когда необходимо) (fsync)
- Символические ссылки
- Жёсткие ссылки
- Специальные файлы (устройства, сокеты, каналы)
- Отслеживание времён модификации (mtime), изменения атрибутов (ctime)
- Ручное изменение времён модификации (mtime), последнего доступа (atime)
- Корректная обработка изменений списка файлов во время листинга
-
-### Ограничения
-
-Отсутствующие на данный момент в VitastorFS функции POSIX:
- Блокировки файлов не поддерживаются
- Фактически занятое файлами место не подсчитывается и не возвращается вызовами
-  stat(2), так что `du` всегда показывает сумму размеров файлов, а не фактически занятое место
- Времена доступа (`atime`) не отслеживаются (как будто ФС смонтирована с `-o noatime`)
- Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)
-
-Другие недостающие функции, которые нужно добавить в будущем:
- Дефрагментация "общих инодов". На уровне реализации ФС файлы, меньшие, чем размер
-  объекта пула (block_size умножить на число частей данных, если пул EC),
-  упаковываются друг за другом в большие "общие" иноды/тома. Если такие файлы удалять
-  или увеличивать, они перемещаются и оставляют за собой "мусор", вот тут-то и нужен
-  дефрагментатор.
- Переиспользование номеров инодов. В текущей реализации номера инодов всё время
-  увеличиваются, так что в теории вы можете упереться в лимит, если насоздаёте
-  и наудаляете больше, чем 2^48 файлов.
- Очистка места в Б-дереве метаданных. Текущая реализация никогда не сливает и не
-  удаляет блоки Б-дерева, так что в теории дерево может разрастись и стать неоптимальным.
-  Если вы столкнётесь с такой ситуацией сейчас, вы можете решить её с помощью
-  команд `vitastor-kv dumpjson` и `loadjson` (т.е. пересоздав и загрузив обратно все метаданные ФС).
- Инструмент проверки метаданных файловой системы. У VitastorFS нет журнала, так как
-  журнал бы сильно замедлил реализацию, вместо него используются оптимистичные
-  транзакции на основе CAS (сравнить-и-записать), и теоретически при нештатном
-  завершении сервера ФС в БД также могут оставаться неконсистентные "мусорные"
-  записи. ФС устроена так, что на работу они не влияют, но для порядка и их стоит
-  уметь подчищать.
-
-## Горизонтальное масштабирование
-
-Клиент Linux NFS 3.0 не поддерживает встроенное масштабирование или отказоустойчивость.
-То есть, вы не можете задать несколько адресов серверов при монтировании ФС.
-
-Однако вы можете использовать любые стандартные сетевые балансировщики нагрузки
-или схемы с отказоустойчивостью. Это точно безопасно при настройках `immediate_commit=all` и
-`client_enable_writeback=false`, так как с ними NFS-сервер Vitastor вообще не хранит
-в памяти ещё не зафиксированные на дисках данные; и вполне вероятно безопасно
-даже без `immediate_commit=all`, потому что NFS-клиент ядра Linux повторяет все
-незафиксированные запросы при потере соединения.
-
-## Команды
-
-### mount
-
-`vitastor-nfs (--fs <NAME> | --block) mount [-o <OPT>] <MOUNTPOINT>`
-
-Запустить локальный сервер и примонтировать ФС в директорию <MOUNTPOINT>.
-
-Чтобы отмонтировать ФС, используйте обычную команду `umount <MOUNTPOINT>`.
-
-Сервер автоматически останавливается при отмонтировании ФС.
-
- `-o|--options <OPT>` - Передать дополнительные опции монтирования NFS (пример: -o async).
-
-### start
-
-`vitastor-nfs (--fs <NAME> | --block) start`
-
-Запустить сетевой NFS-сервер. Опции:
-
-| <!-- -->        | <!-- -->                                                              |
-|-----------------|-----------------------------------------------------------------------|
-| `--bind <IP>`   | принимать соединения по адресу \<IP> (по умолчанию 0.0.0.0 - на всех) |
-| `--port <PORT>` | использовать порт \<PORT> для NFS-сервисов (по умолчанию 2049)        |
-| `--portmap 0`   | отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий) |
-
-## Общие опции
-
-| <!-- -->           | <!-- -->                                                |
-|--------------------|---------------------------------------------------------|
-| `--fs <NAME>`      | использовать VitastorFS с метаданными в образе \<NAME>  |
-| `--block`          | использовать псевдо-ФС для доступа к блочным образам    |
-| `--pool <POOL>`    | использовать пул \<POOL> для новых файлов (обязательно, если пул в кластере не один) |
-| `--subdir <DIR>`   | экспортировать подкаталог \<DIR>, а не корень (только для псевдо-ФС) |
-| `--nfspath <PATH>` | установить путь NFS-экспорта в \<PATH> (по умолчанию /) |
-| `--pidfile <FILE>` | записать ID процесса в заданный файл                    |
-| `--logfile <FILE>` | записывать логи в заданный файл                         |
-| `--foreground 1`   | не уходить в фон после запуска                          |
+```
+mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
+```
--- a/docs/usage/pg_states.dot
+++ b/docs/usage/pg_states.dot
@ -1,13 +0,0 @@
-digraph G {
-    rankdir=LR;
-    bgcolor=transparent;
-    edge [color="#00A000"];
-    node [shape=hexagon, fillcolor="#A0A000", fontcolor=white, fontname="sans-serif", fontsize=12, style=filled, penwidth=0];
-    offline -> starting -> peering -> offline;
-    stopping -> offline;
-    starting -> incomplete -> offline;
-    active -> repeering -> peering -> active -> stopping;
-    offline [fillcolor="#A00000"];
-    incomplete [fillcolor="#A00000"];
-    active [fillcolor="#00A000"];
-}
--- a/docs/usage/pg_states.svg
+++ b/docs/usage/pg_states.svg
@ -1,114 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
- "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<!-- Generated by graphviz version 2.43.0 (0)
- -->
-<!-- Title: G Pages: 1 -->
-<svg width="603pt" height="123pt"
- viewBox="0.00 0.00 602.66 122.55" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 118.55)">
-<title>G</title>
-<!-- offline -->
-<g id="node1" class="node">
-<title>offline</title>
-<polygon fill="#a00000" stroke="black" stroke-width="0" points="75.52,-56 56.6,-74 18.75,-74 -0.17,-56 18.75,-38 56.6,-38 75.52,-56"/>
-<text text-anchor="middle" x="37.67" y="-52.9" font-family="sans-serif" font-size="12.00" fill="white">offline</text>
-</g>
-<!-- starting -->
-<g id="node2" class="node">
-<title>starting</title>
-<polygon fill="#a0a000" stroke="black" stroke-width="0" points="199.56,-79 177.49,-97 133.35,-97 111.28,-79 133.35,-61 177.49,-61 199.56,-79"/>
-<text text-anchor="middle" x="155.42" y="-75.9" font-family="sans-serif" font-size="12.00" fill="white">starting</text>
-</g>
-<!-- offline&#45;&gt;starting -->
-<g id="edge1" class="edge">
-<title>offline&#45;&gt;starting</title>
-<path fill="none" stroke="#00a000" d="M69.39,-62.1C81.66,-64.54 96.04,-67.4 109.45,-70.06"/>
-<polygon fill="#00a000" stroke="#00a000" points="108.98,-73.54 119.47,-72.05 110.34,-66.67 108.98,-73.54"/>
-</g>
-<!-- peering -->
-<g id="node3" class="node">
-<title>peering</title>
-<polygon fill="#a0a000" stroke="black" stroke-width="0" points="335.57,-95 313.96,-113 270.74,-113 249.13,-95 270.74,-77 313.96,-77 335.57,-95"/>
-<text text-anchor="middle" x="292.35" y="-91.9" font-family="sans-serif" font-size="12.00" fill="white">peering</text>
-</g>
-<!-- starting&#45;&gt;peering -->
-<g id="edge2" class="edge">
-<title>starting&#45;&gt;peering</title>
-<path fill="none" stroke="#00a000" d="M194.36,-83.5C209.71,-85.32 227.6,-87.44 243.8,-89.36"/>
-<polygon fill="#00a000" stroke="#00a000" points="243.82,-92.89 254.16,-90.59 244.64,-85.94 243.82,-92.89"/>
-</g>
-<!-- incomplete -->
-<g id="node5" class="node">
-<title>incomplete</title>
-<polygon fill="#a00000" stroke="black" stroke-width="0" points="349.09,-41 320.72,-59 263.99,-59 235.62,-41 263.99,-23 320.72,-23 349.09,-41"/>
-<text text-anchor="middle" x="292.35" y="-37.9" font-family="sans-serif" font-size="12.00" fill="white">incomplete</text>
-</g>
-<!-- starting&#45;&gt;incomplete -->
-<g id="edge5" class="edge">
-<title>starting&#45;&gt;incomplete</title>
-<path fill="none" stroke="#00a000" d="M188.74,-69.9C204.92,-65.34 224.85,-59.73 242.82,-54.67"/>
-<polygon fill="#00a000" stroke="#00a000" points="243.9,-58 252.57,-51.92 242,-51.26 243.9,-58"/>
-</g>
-<!-- peering&#45;&gt;offline -->
-<g id="edge3" class="edge">
-<title>peering&#45;&gt;offline</title>
-<path fill="none" stroke="#00a000" d="M259.32,-103.69C222.67,-112.11 161.28,-121.52 111.35,-106 94.55,-100.78 78.2,-90.18 65.27,-80.08"/>
-<polygon fill="#00a000" stroke="#00a000" points="67.26,-77.19 57.3,-73.58 62.84,-82.61 67.26,-77.19"/>
-</g>
-<!-- active -->
-<g id="node6" class="node">
-<title>active</title>
-<polygon fill="#00a000" stroke="black" stroke-width="0" points="456.34,-49 438.55,-67 402.97,-67 385.18,-49 402.97,-31 438.55,-31 456.34,-49"/>
-<text text-anchor="middle" x="420.76" y="-45.9" font-family="sans-serif" font-size="12.00" fill="white">active</text>
-</g>
-<!-- peering&#45;&gt;active -->
-<g id="edge9" class="edge">
-<title>peering&#45;&gt;active</title>
-<path fill="none" stroke="#00a000" d="M322.99,-84.22C341.47,-77.49 365.34,-68.8 384.75,-61.74"/>
-<polygon fill="#00a000" stroke="#00a000" points="385.96,-65.03 394.16,-58.32 383.56,-58.45 385.96,-65.03"/>
-</g>
-<!-- stopping -->
-<g id="node4" class="node">
-<title>stopping</title>
-<polygon fill="#a0a000" stroke="black" stroke-width="0" points="591.65,-18 567.57,-36 519.39,-36 495.31,-18 519.39,0 567.57,0 591.65,-18"/>
-<text text-anchor="middle" x="543.48" y="-14.9" font-family="sans-serif" font-size="12.00" fill="white">stopping</text>
-</g>
-<!-- stopping&#45;&gt;offline -->
-<g id="edge4" class="edge">
-<title>stopping&#45;&gt;offline</title>
-<path fill="none" stroke="#00a000" d="M500.13,-14.3C440.78,-9.83 329.58,-4.07 235.49,-14 179.71,-19.89 116.5,-34.9 77.11,-45.29"/>
-<polygon fill="#00a000" stroke="#00a000" points="76.14,-41.92 67.38,-47.89 77.94,-48.69 76.14,-41.92"/>
-</g>
-<!-- incomplete&#45;&gt;offline -->
-<g id="edge6" class="edge">
-<title>incomplete&#45;&gt;offline</title>
-<path fill="none" stroke="#00a000" d="M240.25,-44.03C194.33,-46.76 127.57,-50.72 83.64,-53.33"/>
-<polygon fill="#00a000" stroke="#00a000" points="83.32,-49.84 73.54,-53.93 83.73,-56.83 83.32,-49.84"/>
-</g>
-<!-- active&#45;&gt;stopping -->
-<g id="edge10" class="edge">
-<title>active&#45;&gt;stopping</title>
-<path fill="none" stroke="#00a000" d="M449.46,-41.89C463.64,-38.25 481.26,-33.72 497.34,-29.59"/>
-<polygon fill="#00a000" stroke="#00a000" points="498.29,-32.96 507.11,-27.08 496.55,-26.18 498.29,-32.96"/>
-</g>
-<!-- repeering -->
-<g id="node7" class="node">
-<title>repeering</title>
-<polygon fill="#a0a000" stroke="black" stroke-width="0" points="594.84,-83 569.16,-101 517.8,-101 492.12,-83 517.8,-65 569.16,-65 594.84,-83"/>
-<text text-anchor="middle" x="543.48" y="-79.9" font-family="sans-serif" font-size="12.00" fill="white">repeering</text>
-</g>
-<!-- active&#45;&gt;repeering -->
-<g id="edge7" class="edge">
-<title>active&#45;&gt;repeering</title>
-<path fill="none" stroke="#00a000" d="M448.85,-56.63C462.9,-60.59 480.44,-65.53 496.53,-70.06"/>
-<polygon fill="#00a000" stroke="#00a000" points="495.74,-73.47 506.32,-72.82 497.64,-66.74 495.74,-73.47"/>
-</g>
-<!-- repeering&#45;&gt;peering -->
-<g id="edge8" class="edge">
-<title>repeering&#45;&gt;peering</title>
-<path fill="none" stroke="#00a000" d="M495.33,-85.27C451.99,-87.36 387.93,-90.44 343.63,-92.58"/>
-<polygon fill="#00a000" stroke="#00a000" points="343.2,-89.09 333.38,-93.07 343.54,-96.09 343.2,-89.09"/>
-</g>
-</g>
-</svg>
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Vitaliy Filippov	37d14e35f3	Save cached_read_* in superblock	2023-07-17 00:34:58 +03:00
Vitaliy Filippov	8c4a11b51c	Document cached_read_{data,meta,journal} parameters	2023-07-17 00:34:58 +03:00
Vitaliy Filippov	98d5849190	Support using Linux page cache for reads	2023-07-17 00:34:58 +03:00
Vitaliy Filippov	e4ea8a9514	Track used blocks, not object versions	2023-07-17 00:34:58 +03:00
Vitaliy Filippov	3c565e7b94	Document data_csum_type and csum_block_size parameters	2023-07-17 00:34:58 +03:00
Vitaliy Filippov	708918a4c7	Remove creepy "metadata copying" during overwrite Instead of it, just do not verify checksums of currently mutated objects. When clean data modification during flush runs in parallel to a read request, that request may read a mix of old and new data. It may even read a mix of multiple flushed versions if it lasts too long... And attempts to verify it using temporary copies of metadata make the algorithm too complex and creepy.	2023-07-15 02:34:20 +03:00
Vitaliy Filippov	8e099c1d11	Support keeping checksums on disk (not in memory) Definitely beneficial for SSD+HDD setups	2023-07-14 00:38:15 +03:00
Vitaliy Filippov	debb00a535	Check for "Checksum mismatch" and "BUG" messages during test_heal	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	c8891ab1d6	Use clean_dyn_size for space check	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	7062b73d87	Log more details about checksum mismatch in big_writes	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	02e24f5144	Use find_holes() in flusher for unification	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	55f506f6e0	Fill journal header to know checksum type & size when dumping journal with --all	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	e12dd9b82c	Fix journal read checksum verification with inmemory_journal=false	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	82d8848b8f	Call fill_partial_checksum_blocks() correctly in regard to COPY_BUF_CSUM_FILL	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	ef800408dc	Wait for journal reads before checking them in clear_incomplete_csum_block_bits	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	3b1150c478	Check for checksum mismatch absence in test_heal	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	ccdf87dc81	Use zero checksum size for zero-length writes	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	bede73d158	Fix journal data checksum mangling on corrupted block overwrite	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	8e35319a34	Check journal entry size when checking block checksums	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	dc6e88e2ca	Fix journal data checksum verification on start	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	4c3370220b	Add more details to "journal entry data is corrupt" messages	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	f02344c0a4	...and partially remove the perversion with bitmap inlining	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	b369032665	Sadly we have to refcount dyn_data...	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	319b0833eb	Fix clean block checksum read	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	0641b06fb1	Allow to forcibly set meta_format	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	3b9873b9a9	Rename meta_version to meta_format	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	eef97a0dc4	Support old metadata format in vitastor-disk dump-meta	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	5b16e5ab5b	Fix journal big_write simple reads after checksum changes	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	bb430fccd5	Verify checksums in test_heal in different combinations	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	bafadd5559	Fix bitmap-granular checksums	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	6ebca5fedc	Fix wait_journal_count not being zeroed	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	e3e2325ef5	Rewrite and fix find_holes into a more obvious version	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	9a908f3e66	Fix missing checksum read offset	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	89f6fef920	Add a test for checksums	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	63b9382067	Fix checksum verification in big_write journal reads	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	8a7dea9fa2	Verify checksums during journal reads	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	1adf77f8fb	Add backwards compatibility with non-checksum metadata and journal formats	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	7a530346a6	Implement large csum_block_size support (more than 4k) + refactor blockstore_flush	2023-07-13 01:49:45 +03:00
Vitaliy Filippov	cc1f03971d	Implement bitmap-granular (4k) metadata & data checksums	2023-07-13 01:49:45 +03:00