606 changed files with 16063 additions and 112552 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,19 +0,0 @@
-.git
-build
-packages
-mon/node_modules
-*.o
-*.so
-osd
-stub_osd
-stub_uring_osd
-stub_bench
-osd_test
-dump_journal
-nbd_proxy
-rm_inode
-fio
-qemu
-rpm/*.Dockerfile
-debian/*.Dockerfile
-Dockerfile
--- a/.gitea/workflows/buildenv.Dockerfile
+++ b/.gitea/workflows/buildenv.Dockerfile
@ -1,36 +0,0 @@
-FROM node:16-bullseye
-
-WORKDIR /root
-
-ADD ./docker/vitastor.gpg /etc/apt/trusted.gpg.d
-
-RUN echo 'deb http://deb.debian.org/debian bullseye-backports main' >> /etc/apt/sources.list; \
-    echo 'deb http://vitastor.io/debian bullseye main' >> /etc/apt/sources.list; \
-    echo >> /etc/apt/preferences; \
-    echo 'Package: *' >> /etc/apt/preferences; \
-    echo 'Pin: release a=bullseye-backports' >> /etc/apt/preferences; \
-    echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
-    echo >> /etc/apt/preferences; \
-    echo 'Package: *' >> /etc/apt/preferences; \
-    echo 'Pin: origin "vitastor.io"' >> /etc/apt/preferences; \
-    echo 'Pin-Priority: 1000' >> /etc/apt/preferences; \
-    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
-    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
-    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
-
-RUN apt-get update
-RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
-    liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
-RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
-RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted
-RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
-
-RUN set -ex; \
-    mkdir qemu-build; \
-    cd qemu-build; \
-    dpkg-source -x /root/qemu*.dsc; \
-    cd qemu*/; \
-    debian/rules configure-qemu || debian/rules b/configure-stamp; \
-    cd b/qemu; \
-    make -j8 config-poison.h || true; \
-    make -j8 qapi/qapi-builtin-types.h
--- a/.gitea/workflows/test.Dockerfile
+++ b/.gitea/workflows/test.Dockerfile
@ -1,19 +0,0 @@
-FROM git.yourcmc.ru/vitalif/vitastor/buildenv
-
-ADD . /root/vitastor
-
-RUN set -e -x; \
-    mkdir -p /root/fio-build/; \
-    cd /root/fio-build/; \
-    dpkg-source -x /root/fio*.dsc; \
-    cd /root/vitastor; \
-    ln -s /root/fio-build/fio-*/ ./fio; \
-    ln -s /root/qemu-build/qemu-*/ ./qemu; \
-    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
-    cd mon; \
-    npm install; \
-    cd ..; \
-    mkdir build; \
-    cd build; \
-    cmake .. -DWITH_ASAN=yes -DWITH_QEMU=yes; \
-    make -j16
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
--- a/.gitea/workflows/tests-to-yaml.pl
+++ b/.gitea/workflows/tests-to-yaml.pl
@ -1,83 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-
-for my $line (<>)
-{
-    if ($line =~ /\.\/(test_[^\.]+)/s)
-    {
-        chomp $line;
-        my $base_name = $1;
-        my $test_name = $base_name;
-        my $timeout = 3;
-        if ($test_name eq 'test_etcd_fail' || $test_name eq 'test_heal' || $test_name eq 'test_add_osd' ||
-            $test_name eq 'test_interrupted_rebalance' || $test_name eq 'test_rebalance_verify')
-        {
-            $timeout = 10;
-        }
-        while ($line =~ /([^\s=]+)=(\S+)/gs)
-        {
-            if ($1 eq 'TEST_NAME')
-            {
-                $test_name = $base_name.'_'.$2;
-                last;
-            }
-            elsif ($1 eq 'SCHEME' && $2 eq 'ec')
-            {
-                $test_name .= '_ec';
-            }
-            elsif ($1 eq 'SCHEME' && $2 eq 'xor')
-            {
-                $test_name .= '_xor';
-            }
-            elsif ($1 eq 'IMMEDIATE_COMMIT')
-            {
-                $test_name .= '_imm';
-            }
-            elsif ($1 eq 'ANTIETCD')
-            {
-                $test_name .= '_antietcd';
-            }
-            else
-            {
-                $test_name .= '_'.lc($1).'_'.$2;
-            }
-        }
-        if ($test_name eq 'test_snapshot_chain_ec')
-        {
-            $timeout = 6;
-        }
-        $line =~ s!\./test_!/root/vitastor/tests/test_!;
-        # Gitea CI doesn't support artifacts yet, lol
-        #- name: Upload results
-        #  uses: actions/upload-artifact\@v3
-        #  if: always()
-        #  with:
-        #    name: ${test_name}_result
-        #    path: |
-        #      /root/vitastor/testdata
-        #      !/root/vitastor/testdata/*.bin
-        #    retention-days: 5
-        print <<"EOF"
-  $test_name:
-    runs-on: ubuntu-latest
-    needs: build
-    container: \${{env.TEST_IMAGE}}:\${{github.sha}}
-    steps:
-    - name: Run test
-      id: test
-      timeout-minutes: $timeout
-      run: $line
-    - name: Print logs
-      if: always() && steps.test.outcome == 'failure'
-      run: |
-        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
-          echo "-------- \$i --------"
-          cat \$i
-          echo ""
-        done
-
-EOF
-;
-    }
-}
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +0,0 @@
-*.o
-*.so
-package-lock.json
-fio
-qemu
--- a/CLA-en.md
+++ b/CLA-en.md
@ -1,115 +0,0 @@
-## Contributor License Agreement
-
-> This Agreement is made in the Russian and English languages. **The English
-text of Agreement is for informational purposes only** and is not binding
-for the Parties.
->
-> In the event of a conflict between the provisions of the Russian and
-English versions of this Agreement, the **Russian version shall prevail**.
->
-> Russian version is published at https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md
-
-This document represents the offer of Filippov Vitaliy Vladimirovich
-("Author"), author and copyright holder of Vitastor software ("Program"),
-acknowledged by a certificate of Federal Service for Intellectual
-Property of Russian Federation (Rospatent) # 2021617829 dated 20 May 2021,
-to "Contributors" to conclude this license agreement as follows
-("Agreement" or "Offer").
-
-In accordance with Art. 435, Art. 438 of the Civil Code of the Russian
-Federation, this Agreement is an offer and in case of acceptance of the
-offer, an agreement is considered concluded on the conditions specified
-in the offer.
-
-1. Applicable Terms. \
-   1.1. "Official Repository" shall mean the computer storage, operated by
-        the Author, containing all prior and future versions of the Source
-        Code of the Program, at Internet addresses https://git.yourcmc.ru/vitalif/vitastor/
-        or https://github.com/vitalif/vitastor/. \
-   1.2. "Contributions" shall mean results of intellectual activity
-        (including, but not limited to, source code, libraries, components,
-        texts, documentation) which can be software or elements of the software
-        and which are provided by Contributors to the Author for inclusion
-        in the Program. \
-   1.3. "Contributor" shall mean a person who provides Contributions to
-        the Author and agrees with all provisions of this Agreement.
-        A Сontributor can be: 1) an individual; or 2) a legal entity or an
-        individual entrepreneur in case when an individual provides Contributions
-        on behalf of third parties, including on behalf of his employer.
-
-2. Subject of the Agreement. \
-   2.1. Subject of the Agreement shall be the Contributions sent to the Author by Contributors. \
-   2.2. The Contributor grants to the Author the right to use Contributions at his own
-        discretion and without any necessity to get a prior approval from Contributor or
-        any other third party in any way, under a simple (non-exclusive), royalty-free,
-        irrevocable license throughout the world by all means not contrary to law, in whole
-        or as a part of the Program, or other open-source or closed-source computer programs,
-        products or services (hereinafter -- the "License"), including, but not limited to: \
-        2.2.1. to execute Contributions and use them for any tasks; \
-        2.2.2. to publish and distribute Contributions in modified or unmodified form and/or to rent them; \
-        2.2.3. to modify Contributions, add comments, illustrations or any explanations to Contributions while using them; \
-        2.2.4. to create other results of intellectual activity based on Contributions, including derivative works and composite works; \
-        2.2.5. to translate Contributions into other languages, including other programming languages; \
-        2.2.6. to carry out rental and public display of Contributions; \
-        2.2.7. to use Contributions under the trade name and/or any trademark or any other label, or without it, as the Author thinks fit; \
-   2.3. The Contributor grants to the Author the right to sublicense any of the aforementioned
-        rights to third parties on any terms at the Author's discretion. \
-   2.4. The License is provided for the entire duration of Contributor's
-        exclusive intellectual property rights to the Contributions. \
-   2.5. The Contributor grants to the Author the right to decide how and where to mention,
-        or to not mention at all, the fact of his authorship, name, nickname and/or company
-        details when including Contributions into the Program or in any other computer
-        programs, products or services.
-
-3. Acceptance of the Offer \
-   3.1. The Contributor may provide Contributions to the Author in the form of
-        a "Pull Request" in an Official Repository of the Program or by any
-        other electronic means of communication, including, but not limited to,
-        E-mail or messenger applications. \
-   3.2. The acceptance of the Offer shall be the fact of provision of Contributions
-        to the Author by the Contributor by any means with the following remark:
-        “I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md”
-        or “Я принимаю соглашение Vitastor CLA: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md”. \
-   3.3. Date of acceptance of the Offer shall be the date of such provision.
-
-4. Rights and obligations of the parties. \
-   4.1. The Contributor reserves the right to use Contributions by any lawful means
-        not contrary to this Agreement. \
-   4.2. The Author has the right to refuse to include Contributions into the Program
-        at any moment with no explanation to the Contributor.
-
-5. Representations and Warranties. \
-   5.1. The person providing Contributions for the purpose of their inclusion
-        in the Program represents and warrants that he is the Contributor
-        or legally acts on the Contributor's behalf. Name or company details
-        of the Contributor shall be provided with the Contribution at the moment
-        of their provision to the Author. \
-   5.2. The Contributor represents and warrants that he legally owns exclusive
-        intellectual property rights to the Contributions. \
-   5.3. The Contributor represents and warrants that any further use of
-        Contributions by the Author as provided by Contributor under the terms
-        of the Agreement does not infringe on intellectual and other rights and
-        legitimate interests of third parties. \
-   5.4. The Contributor represents and warrants that he has all rights and legal
-        capacity needed to accept this Offer; \
-   5.5. The Contributor represents and warrants that Contributions don't
-        contain malware or any information considered illegal under the law
-        of Russian Federation.
-
-6. Termination of the Agreement \
-   6.1. The Agreement may be terminated at will of both Author and Contributor,
-        formalised in the written form or if the Agreement is terminated on
-        reasons prescribed by the law of Russian Federation.
-
-7. Final Clauses \
-   7.1. The Contributor may optionally sign the Agreement in the written form. \
-   7.2. The Agreement is deemed to become effective from the Date of signing of
-        the Agreement and until the expiration of Contributor's exclusive
-        intellectual property rights to the Contributions. \
-   7.3. The Author may unilaterally alter the Agreement without informing Contributors.
-        The new version of the document shall come into effect 3 (three) days after
-        being published in the Official Repository of the Program at Internet address
-        [https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md).
-        Contributors should keep informed about the actual version of the Agreement themselves. \
-   7.4. If the Author and the Contributor fail to agree on disputable issues,
-        disputes shall be referred to the Moscow Arbitration court.
--- a/CLA-ru.md
+++ b/CLA-ru.md
@ -1,108 +0,0 @@
-## Лицензионное соглашение с участником
-
-> Данная Оферта написана в Русской и Английской версиях. **Версия на английском
-языке предоставляется в информационных целях** и не связывает стороны договора.
->
-> В случае несоответствий между положениями Русской и Английской версий Договора,
-**Русская версия имеет приоритет**.
->
-> Английская версия опубликована по адресу https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md
-
-Настоящий договор-оферта (далее по тексту – Оферта, Договор) адресована физическим
-и юридическим лицам (далее – Участникам) и является официальным публичным предложением
-Филиппова Виталия Владимировича (далее – Автора) программного обеспечения Vitastor,
-свидетельство Федеральной службы по интеллектуальной собственности (Роспатент) № 2021617829
-от 20 мая 2021 г. (далее – Программа) о нижеследующем:
-
-1. Термины и определения \
-   1.1. Репозиторий – электронное хранилище, содержащее исходный код Программы. \
-   1.2. Доработка – результат интеллектуальной деятельности Участника, включающий
-        в себя изменения или дополнения к исходному коду Программы, которые Участник
-        желает включить в состав Программы для дальнейшего использования и распространения
-        Автором и для этого направляет их Автору. \
-   1.3. Участник – физическое или юридическое лицо, вносящее Доработки в код Программы. \
-   1.4. ГК РФ – Гражданский кодекс Российской Федерации.
-
-2. Предмет оферты \
-   2.1. Предметом настоящей оферты являются Доработки, отправляемые Участником Автору. \
-   2.2. Участник предоставляет Автору право использовать Доработки по собственному усмотрению
-        и без необходимости предварительного согласования с Участником или иным третьим лицом
-        на условиях простой (неисключительной) безвозмездной безотзывной лицензии, полностью
-        или фрагментарно, в составе Программы или других программ, продуктов или сервисов
-        как с открытым, так и с закрытым исходным кодом, любыми способами, не противоречащими
-        закону, включая, но не ограничиваясь следующими: \
-        2.2.1. Запускать и использовать Доработки для выполнения любых задач; \
-        2.2.2. Распространять, импортировать и доводить Доработки до всеобщего сведения; \
-        2.2.3. Вносить в Доработки изменения, сокращения и дополнения, снабжать Доработки
-               при их использовании комментариями, иллюстрациями или пояснениями; \
-        2.2.4. Создавать на основе Доработок иные результаты интеллектуальной деятельности,
-               в том числе производные и составные произведения; \
-        2.2.5. Переводить Доработки на другие языки, в том числе на другие языки программирования; \
-        2.2.6. Осуществлять прокат и публичный показ Доработок; \
-        2.2.7. Использовать Доработки под любым фирменным наименованием, товарным знаком
-               (знаком обслуживания) или иным обозначением, или без такового. \
-   2.3. Участник предоставляет Автору право сублицензировать полученные права на Доработки
-        третьим лицам на любых условиях на усмотрение Автора. \
-   2.4. Участник предоставляет Автору права на Доработки на территории всего мира. \
-   2.5. Участник предоставляет Автору права на весь срок действия исключительного права
-        Участника на Доработки. \
-   2.6. Участник предоставляет Автору права на Доработки на безвозмездной основе. \
-   2.7. Участник разрешает Автору самостоятельно определять порядок, способ и
-        место указания его имени, реквизитов и/или псевдонима при включении
-        Доработок в состав Программы или других программ, продуктов или сервисов.
-
-3. Акцепт Оферты \
-   3.1. Участник может передавать Доработки в адрес Автора через зеркала официального
-        Репозитория Программы по адресам https://git.yourcmc.ru/vitalif/vitastor/ или
-        https://github.com/vitalif/vitastor/ в виде “запроса на слияние” (pull request),
-        либо в письменном виде или с помощью любых других электронных средств коммуникации,
-        например, электронной почты или мессенджеров. \
-   3.2. Факт передачи Участником Доработок в адрес Автора любым способом с одной из пометок
-        “I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md”
-        или “Я принимаю соглашение Vitastor CLA: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md”
-        является полным и безоговорочным акцептом (принятием) Участником условий настоящей
-        Оферты, т.е. Участник считается ознакомившимся с настоящим публичным договором и
-        в соответствии с ГК РФ признается лицом, вступившим с Автором в договорные отношения
-        на основании настоящей Оферты. \
-   3.3. Датой акцептирования настоящей Оферты считается дата такой передачи.
-
-4. Права и обязанности Сторон \
-   4.1. Участник сохраняет за собой право использовать Доработки любым законным
-        способом, не противоречащим настоящему Договору. \
-   4.2. Автор вправе отказать Участнику во включении Доработок в состав
-        Программы без объяснения причин в любой момент по своему усмотрению.
-
-5. Гарантии и заверения \
-   5.1. Лицо, направляющее Доработки для целей их включения в состав Программы,
-        гарантирует, что является Участником или представителем Участника. Имя или реквизиты
-        Участника должны быть указаны при их передаче в адрес Автора Программы. \
-   5.2. Участник гарантирует, что является законным обладателем исключительных прав
-        на Доработки. \
-   5.3. Участник гарантирует, что на момент акцептирования настоящей Оферты ему
-        ничего не известно (и не могло быть известно) о правах третьих лиц на
-        передаваемые Автору Доработки или их часть, которые могут быть нарушены
-        в связи с передачей Доработок по настоящему Договору. \
-   5.4. Участник гарантирует, что является дееспособным лицом и обладает всеми
-        необходимыми правами для заключения Договора. \
-   5.5. Участник гарантирует, что Доработки не содержат вредоносного ПО, а также
-        любой другой информации, запрещённой к распространению по законам Российской
-        Федерации.
-
-6. Прекращение действия оферты \
-   6.1. Действие настоящего договора может быть прекращено по соглашению сторон,
-        оформленному в письменном виде, а также вследствие его расторжения по основаниям,
-        предусмотренным законом.
-
-7. Заключительные положения \
-   7.1. Участник вправе по желанию подписать настоящий Договор в письменном виде. \
-   7.2. Настоящий договор действует с момента его заключения и до истечения срока
-        действия исключительных прав Участника на Доработки. \
-   7.3. Автор имеет право в одностороннем порядке вносить изменения и дополнения в договор
-        без специального уведомления об этом Участников. Новая редакция документа вступает
-        в силу через 3 (Три) календарных дня со дня опубликования в официальном Репозитории
-        Программы по адресу в сети Интернет
-        [https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-ru.md).
-        Участники самостоятельно отслеживают действующие условия Оферты. \
-   7.4. Все споры, возникающие между сторонами в процессе их взаимодействия по настоящему
-        договору, решаются путём переговоров. В случае невозможности урегулирования споров
-        переговорным порядком стороны разрешают их в Арбитражном суде г.Москвы.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,7 +0,0 @@
-cmake_minimum_required(VERSION 2.8.12)
-
-project(vitastor)
-
-set(VITASTOR_VERSION "1.9.3")
-
-add_subdirectory(src)
--- a/LICENSE
+++ b/LICENSE
@ -1,27 +0,0 @@
-Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
-
-All server-side code (OSD, Monitor and so on) is licensed under the terms of
-Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
-GNU GPLv3.0 with the additional "Network Interaction" clause which requires
-opensourcing all programs directly or indirectly interacting with Vitastor
-through a computer network and expressly designed to be used in conjunction
-with it ("Proxy Programs"). Proxy Programs may be made public not only under
-the terms of the same license, but also under the terms of any GPL-Compatible
-Free Software License, as listed by the Free Software Foundation.
-This is a stricter copyleft license than the Affero GPL.
-
-Please note that VNPL doesn't require you to open the code of proprietary
-software running inside a VM if it's not specially designed to be used with
-Vitastor.
-
-Basically, you can't use the software in a proprietary environment to provide
-its functionality to users without opensourcing all intermediary components
-standing between the user and Vitastor or purchasing a commercial license
-from the author 😀.
-
-Client libraries (cluster_client and so on) are dual-licensed under the same
-VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
-software like QEMU and fio.
-
-You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
-GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).
--- a/Make-gen.pl
+++ b/Make-gen.pl
@ -0,0 +1,46 @@
+#!/usr/bin/perl
+# Prints one Makefile compile rule per *.cpp in the current directory that has quoted #includes; prerequisites are the source plus every header it includes, directly or indirectly.
+use strict;
+# $deps->{FILE}->{HEADER} = 1 per `#include "HEADER"` line; grep -H always prints the "FILE:" prefix parsed below (plain grep omits it when only one file is searched).
+my $deps = {};
+for my $line (split /\n/, `grep -H '^#include "' *.cpp *.h`)
+{
+    if ($line =~ /^([^:]+):\#include "([^"]+)"/s)
+    {
+        $deps->{$1}->{$2} = 1;
+    }
+}
+# Transitive closure: copy each dependency's own dependencies into the including file until a full pass adds nothing new.
+my $added;
+do
+{
+    $added = 0;
+    for my $file (keys %$deps)
+    {
+        for my $dep (keys %{$deps->{$file}})
+        {
+            if ($deps->{$dep})
+            {
+                for my $subdep (keys %{$deps->{$dep}})
+                {
+                    if (!$deps->{$file}->{$subdep})
+                    {
+                        $added = 1;
+                        $deps->{$file}->{$subdep} = 1;
+                    }
+                }
+            }
+        }
+    }
+} while ($added);
+# Emit the rules; files and headers are sorted so repeated runs give identical output.
+for my $file (sort keys %$deps)
+{
+    if ($file =~ /\.cpp$/)
+    {
+        my $obj = $file;
+        $obj =~ s/\.cpp$/.o/s;
+        print "$obj: $file ".join(" ", sort keys %{$deps->{$file}})."\n";
+        print "\tg++ \$(CXXFLAGS) -c -o \$\@ \$\<\n";
+    }
+}
--- a/Makefile
+++ b/Makefile
@ -0,0 +1,169 @@
+BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_open.o blockstore_journal.o blockstore_read.o blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
+# Flags used by the rules below; append -fsanitize=address here for an AddressSanitizer build
+CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
+.PHONY: all clean # these name actions, not files: a stray file called "all" or "clean" must not disable them
+all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy
+clean:
+	rm -f *.o
+
+dump_journal: dump_journal.cpp crc32c.o blockstore_journal.h # built straight from the .cpp, linked with crc32c.o only
+	g++ $(CXXFLAGS) -o dump_journal dump_journal.cpp crc32c.o
+
+libblockstore.so: $(BLOCKSTORE_OBJS) # every blockstore object in one shared library
+	g++ $(CXXFLAGS) -shared -o $@ $^ -ltcmalloc_minimal -luring
+libfio_blockstore.so: ./libblockstore.so fio_engine.o json11.o # objects first, then ./libblockstore.so spelled out as a path
+	g++ $(CXXFLAGS) -shared -o $@ $(filter %.o,$^) ./libblockstore.so -ltcmalloc_minimal -luring
+
+OSD_OBJS := osd.o osd_secondary.o msgr_receive.o msgr_send.o osd_peering.o osd_flush.o osd_peering_pg.o osd_primary.o \
+	osd_primary_subops.o etcd_state_client.o messenger.o osd_cluster.o http_client.o osd_ops.o pg_states.o osd_rmw.o \
+	json11.o base64.o timerfd_manager.o epoll_manager.o
+osd: ./libblockstore.so osd_main.cpp osd.h osd_ops.h $(OSD_OBJS) # osd_main.cpp is compiled and linked in this one step
+	g++ $(CXXFLAGS) -o osd osd_main.cpp $(OSD_OBJS) ./libblockstore.so -ltcmalloc_minimal -luring
+
+stub_osd: stub_osd.o rw_blocking.o # $^ expands to exactly these two objects
+	g++ $(CXXFLAGS) -o $@ $^ -ltcmalloc_minimal
+# Libraries are listed after the objects: the linker resolves left to right, so a -l placed first may be dropped (e.g. with --as-needed)
+STUB_URING_OSD_OBJS := stub_uring_osd.o epoll_manager.o messenger.o msgr_send.o msgr_receive.o ringloop.o timerfd_manager.o json11.o
+stub_uring_osd: $(STUB_URING_OSD_OBJS)
+	g++ $(CXXFLAGS) -o $@ $(STUB_URING_OSD_OBJS) -ltcmalloc_minimal -luring
+stub_bench: stub_bench.cpp osd_ops.h rw_blocking.o # $< is the .cpp; the header is a prerequisite only
+	g++ $(CXXFLAGS) -o $@ $< rw_blocking.o -ltcmalloc_minimal
+osd_test: osd_test.cpp osd_ops.h rw_blocking.o
+	g++ $(CXXFLAGS) -o $@ $< rw_blocking.o -ltcmalloc_minimal
+osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o $@ $^ -ltcmalloc_minimal
+# -ltcmalloc_minimal goes after the objects so the linker sees their undefined symbols before it decides whether to keep the library
+libfio_sec_osd.so: fio_sec_osd.o rw_blocking.o
+	g++ $(CXXFLAGS) -shared -o $@ fio_sec_osd.o rw_blocking.o -ltcmalloc_minimal
+# Object list reused by several link rules; in the link line libraries follow the objects so none is discarded as unused
+FIO_CLUSTER_OBJS := cluster_client.o epoll_manager.o etcd_state_client.o \
+	messenger.o msgr_send.o msgr_receive.o ringloop.o json11.o http_client.o osd_ops.o pg_states.o timerfd_manager.o base64.o
+libfio_cluster.so: fio_cluster.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -shared -o $@ $< $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+# Link order: objects first, then libraries, so -ltcmalloc_minimal is not dropped before any object has referenced it
+nbd_proxy: nbd_proxy.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -o $@ $< $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+
+qemu_driver.o: qemu_driver.c qemu_proxy.h # C source built with gcc; include paths point into the qemu/ tree plus glib's cflags
+	gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` -I qemu/include \
+		$(CXXFLAGS) -c -o qemu_driver.o qemu_driver.c
+# Each object is named exactly once ($< is qemu_driver.o, so listing it again would link it twice); libraries come last
+qemu_driver.so: qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS)
+	g++ $(CXXFLAGS) -shared -o $@ qemu_driver.o qemu_proxy.o $(FIO_CLUSTER_OBJS) -ltcmalloc_minimal -luring
+
+test_blockstore: ./libblockstore.so test_blockstore.cpp timerfd_interval.o # inputs spelled out so ./libblockstore.so stays after the objects
+	g++ $(CXXFLAGS) -o $@ test_blockstore.cpp timerfd_interval.o ./libblockstore.so -ltcmalloc_minimal -luring
+test: test.cpp osd_peering_pg.o
+	g++ $(CXXFLAGS) -o $@ $^ -luring -lm
+test_allocator: test_allocator.cpp allocator.o
+	g++ $(CXXFLAGS) -o $@ $^
+
+crc32c.o: crc32c.c crc32c.h # the .c file is compiled with g++, same flags as the other objects
+	g++ $(CXXFLAGS) -c -o crc32c.o crc32c.c
+json11.o: json11/json11.cpp json11/json11.hpp # header listed so that editing it rebuilds the object
+	g++ $(CXXFLAGS) -c -o $@ $<
+
+# Autogenerated by Make-gen.pl -- regenerate the rules below rather than editing them by hand
+
+allocator.o: allocator.cpp allocator.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+base64.o: base64.cpp base64.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore.o: blockstore.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_flush.o: blockstore_flush.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_impl.o: blockstore_impl.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_init.o: blockstore_init.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_journal.o: blockstore_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_open.o: blockstore_open.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_read.o: blockstore_read.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_rollback.o: blockstore_rollback.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_stable.o: blockstore_stable.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_sync.o: blockstore_sync.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+blockstore_write.o: blockstore_write.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+cluster_client.o: cluster_client.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+dump_journal.o: dump_journal.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+epoll_manager.o: epoll_manager.cpp epoll_manager.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+etcd_state_client.o: etcd_state_client.cpp base64.h etcd_state_client.h http_client.h json11/json11.hpp object_id.h osd_id.h osd_ops.h pg_states.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_cluster.o: fio_cluster.cpp cluster_client.h epoll_manager.h etcd_state_client.h fio/fio.h fio/optgroup.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_engine.o: fio_engine.cpp blockstore.h fio/fio.h fio/optgroup.h json11/json11.hpp object_id.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+fio_sec_osd.o: fio_sec_osd.cpp fio/fio.h fio/optgroup.h object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+http_client.o: http_client.cpp http_client.h json11/json11.hpp timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+messenger.o: messenger.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+msgr_receive.o: msgr_receive.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+msgr_send.o: msgr_send.cpp json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+nbd_proxy.o: nbd_proxy.cpp cluster_client.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd.o: osd.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_cluster.o: osd_cluster.cpp base64.h blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_flush.o: osd_flush.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_main.o: osd_main.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_ops.o: osd_ops.cpp object_id.h osd_id.h osd_ops.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering.o: osd_peering.cpp base64.h blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg.o: osd_peering_pg.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_peering_pg_test.o: osd_peering_pg_test.cpp cpp-btree/btree_map.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_primary.o: osd_primary.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_primary_subops.o: osd_primary_subops.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h osd_primary.h osd_rmw.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw.o: osd_rmw.cpp malloc_or_die.h object_id.h osd_id.h osd_rmw.h xor.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_rmw_test.o: osd_rmw_test.cpp malloc_or_die.h object_id.h osd_id.h osd_rmw.cpp osd_rmw.h test_pattern.h xor.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_secondary.o: osd_secondary.cpp blockstore.h cpp-btree/btree_map.h epoll_manager.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+osd_test.o: osd_test.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h test_pattern.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+pg_states.o: pg_states.cpp pg_states.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+qemu_proxy.o: qemu_proxy.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h qemu_proxy.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+ringloop.o: ringloop.cpp ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+rw_blocking.o: rw_blocking.cpp rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_osd.o: stub_osd.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+stub_uring_osd.o: stub_uring_osd.cpp epoll_manager.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test.o: test.cpp allocator.h blockstore.h blockstore_flush.h blockstore_impl.h blockstore_init.h blockstore_journal.h cpp-btree/btree_map.h crc32c.h malloc_or_die.h object_id.h osd_id.h osd_ops.h osd_peering_pg.h pg_states.h ringloop.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test_allocator.o: test_allocator.cpp allocator.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+test_blockstore.o: test_blockstore.cpp blockstore.h object_id.h ringloop.h timerfd_interval.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+timerfd_interval.o: timerfd_interval.cpp ringloop.h timerfd_interval.h
+	g++ $(CXXFLAGS) -c -o $@ $<
+timerfd_manager.o: timerfd_manager.cpp timerfd_manager.h
+	g++ $(CXXFLAGS) -c -o $@ $<
--- a/README-ru.md
+++ b/README-ru.md
@ -1,105 +0,0 @@
-# Vitastor
-
-[Read English version](README.md)
-
-## Идея
-
-Вернём былую скорость кластерному блочному хранилищу!
-
-Vitastor - распределённая блочная и файловая SDS (программная СХД), прямой аналог Ceph RBD и CephFS,
-а также внутренних СХД популярных облачных провайдеров. Однако, в отличие от них, Vitastor
-быстрый и при этом простой. Только пока маленький :-).
-
-Vitastor архитектурно похож на Ceph, что означает атомарность и строгую консистентность,
-репликацию через первичный OSD, симметричную кластеризацию без единой точки отказа
-и автоматическое распределение данных по любому числу дисков любого размера с настраиваемыми схемами
-избыточности - репликацией или с произвольными кодами коррекции ошибок.
-
-Vitastor нацелен в первую очередь на SSD и SSD+HDD кластеры с как минимум 10 Гбит/с сетью, поддерживает
-TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
-что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.
-
-Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
-Другие драйверы могут также быть легко реализованы.
-
-Подробности смотрите в документации по ссылкам. Можете начать отсюда: [Быстрый старт](docs/intro/quickstart.ru.md).
-
-## Презентации и записи докладов
-
- DevOpsConf'2021: презентация ([на русском](https://vitastor.io/presentation/devopsconf/devopsconf.html),
-  [на английском](https://vitastor.io/presentation/devopsconf/devopsconf_en.html)),
-  [видео](https://vitastor.io/presentation/devopsconf/talk.webm)
- Highload'2022: презентация ([на русском](https://vitastor.io/presentation/highload/highload.html)),
-  [видео](https://vitastor.io/presentation/highload/talk.webm)
-
-## Документация
-
- Введение
-  - [Быстрый старт](docs/intro/quickstart.ru.md)
-  - [Возможности](docs/intro/features.ru.md)
-  - [Архитектура](docs/intro/architecture.ru.md)
-  - [Автор и лицензия](docs/intro/author.ru.md)
- Установка
-  - [Пакеты](docs/installation/packages.ru.md)
-  - [Proxmox](docs/installation/proxmox.ru.md)
-  - [OpenNebula](docs/installation/opennebula.ru.md)
-  - [OpenStack](docs/installation/openstack.ru.md)
-  - [Kubernetes CSI](docs/installation/kubernetes.ru.md)
-  - [Сборка из исходных кодов](docs/installation/source.ru.md)
- Конфигурация
-  - [Обзор](docs/config.ru.md)
-  - Параметры
-    - [Общие](docs/config/common.ru.md)
-    - [Сетевые](docs/config/network.ru.md)
-    - [Клиентский код](docs/config/client.ru.md)
-    - [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
-    - [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
-    - [Прочие параметры OSD](docs/config/osd.ru.md)
-    - [Параметры мониторов](docs/config/monitor.ru.md)
-  - [Настройки пулов](docs/config/pool.ru.md)
-  - [Метаданные образов в etcd](docs/config/inode.ru.md)
- Использование
-  - [vitastor-cli](docs/usage/cli.ru.md) (консольный интерфейс)
-  - [vitastor-disk](docs/usage/disk.ru.md) (управление дисками)
-  - [fio](docs/usage/fio.ru.md) для тестов производительности
-  - [NBD](docs/usage/nbd.ru.md) для монтирования ядром
-  - [QEMU и qemu-img](docs/usage/qemu.ru.md)
-  - [NFS](docs/usage/nfs.ru.md) кластерная файловая система и псевдо-ФС прокси
-  - [Администрирование](docs/usage/admin.ru.md)
- Производительность
-  - [Понимание сути производительности](docs/performance/understanding.ru.md)
-  - [Теоретический максимум](docs/performance/theoretical.ru.md)
-  - [Пример сравнения с Ceph](docs/performance/comparison1.ru.md)
-  - [Более новый тест Vitastor 1.3.1](docs/performance/bench2.ru.md)
-
-## Автор и лицензия
-
-Автор: Виталий Филиппов (vitalif [at] yourcmc.ru), 2019+
-
-Заходите в Telegram-чат Vitastor: https://t.me/vitastor
-
-Лицензия: VNPL 1.1 на серверный код и двойная VNPL 1.1 + GPL 2.0+ на клиентский.
-
-VNPL - "сетевой копилефт", собственная свободная копилефт-лицензия
-Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с дополнительным
-условием "Сетевого взаимодействия", требующим распространять все программы,
-специально разработанные для использования вместе с Vitastor и взаимодействующие
-с ним по сети, под лицензией VNPL или под любой другой свободной лицензией.
-
-Идея VNPL - расширение действия копилефта не только на модули, явным образом
-связываемые с кодом Vitastor, но также на модули, оформленные в виде микросервисов
-и взаимодействующие с ним по сети.
-
-Таким образом, если вы хотите построить на основе Vitastor сервис, содержаший
-компоненты с закрытым кодом, взаимодействующие с Vitastor, вам нужна коммерческая
-лицензия от автора 😀.
-
-На Windows и любое другое ПО, не разработанное *специально* для использования
-вместе с Vitastor, никакие ограничения не накладываются.
-
-Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.0
-и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
-совместимости с таким ПО, как QEMU и fio.
-
-Вы можете найти полный текст VNPL 1.1 на английском языке в файле [VNPL-1.1.txt](VNPL-1.1.txt),
-VNPL 1.1 на русском языке в файле [VNPL-1.1-RU.txt](VNPL-1.1-RU.txt), а GPL 2.0 в файле [GPL-2.0.txt](GPL-2.0.txt).
--- a/README.md
+++ b/README.md
@ -1,105 +1,387 @@
-# Vitastor
-
-[Читать на русском](README-ru.md)
+## Vitastor

 ## The Idea

-Make Clustered Block Storage Fast Again.
+Make Software-Defined Block Storage Great Again.

-Vitastor is a distributed block and file SDS, direct replacement of Ceph RBD and CephFS,
-and also internal SDS's of public clouds. However, in contrast to them, Vitastor is fast
-and simple at the same time. The only thing is it's slightly young :-).
+Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
+architecturally similar to Ceph which means strong consistency, primary-replication, symmetric
+clustering and automatic data distribution over any number of drives of any size
+with configurable redundancy (replication or erasure codes/XOR).

-Vitastor is architecturally similar to Ceph which means strong consistency,
-primary-replication, symmetric clustering and automatic data distribution over any
-number of drives of any size with configurable redundancy (replication or erasure codes/XOR).
+## Features

-Vitastor targets primarily SSD and SSD+HDD clusters with at least 10 Gbit/s network,
-supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1 ms
-with proper hardware which is ~10 times faster than other popular SDS's like Ceph
-or internal systems of public clouds.
+Vitastor is currently a pre-release: a lot of features are missing and you can still expect
+breaking changes in the future. However, the following is implemented:

-Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
-More drivers may be created easily.
+- Basic part: highly-available block storage with symmetric clustering and no SPOF
+- Performance ;-D
+- Two redundancy schemes: Replication and XOR n+1 (simplest case of EC)
+- Configuration via simple JSON data structures in etcd
+- Automatic data distribution over OSDs, with support for:
+  - Mathematical optimization for better uniformity and less data movement
+  - Multiple pools
+  - Placement tree
+  - Configurable failure domains
+- Recovery of degraded blocks
+- Rebalancing (data movement between OSDs)
+- Lazy fsync support
+- I/O statistics reporting to etcd
+- Generic user-space client library
+- QEMU driver (built out-of-tree)
+- Loadable fio engine for benchmarks (also built out-of-tree)
+- NBD proxy for kernel mounts

-Read more details in the documentation. You can start from here: [Quick Start](docs/intro/quickstart.en.md).
+## Roadmap

-## Talks and presentations
+- Packaging for Debian and, probably, CentOS too
+- OSD creation tool (OSDs currently have to be created by hand)
+- Inode deletion tool (currently you can't delete anything :))
+- Other administrative tools
+- Per-inode I/O and space usage statistics
+- jerasure EC support with any number of data and parity drives in a group
+- Parallel usage of multiple network interfaces
+- Proxmox and OpenNebula plugins
+- iSCSI proxy
+- Inode metadata storage in etcd
+- Snapshots and copy-on-write image clones
+- Operation timeouts and better failure detection
+- Checksums
+- SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
+- RDMA and NVDIMM support
+- Compression (possibly)
+- Read caching using system page cache (possibly)

- DevOpsConf'2021: presentation ([in Russian](https://vitastor.io/presentation/devopsconf/devopsconf.html),
-  [in English](https://vitastor.io/presentation/devopsconf/devopsconf_en.html)),
-  [video](https://vitastor.io/presentation/devopsconf/talk.webm)
- Highload'2022: presentation ([in Russian](https://vitastor.io/presentation/highload/highload.html)),
-  [video](https://vitastor.io/presentation/highload/talk.webm)
+## Architecture

-## Documentation
+Similarities:

- Introduction
-  - [Quick Start](docs/intro/quickstart.en.md)
-  - [Features](docs/intro/features.en.md)
-  - [Architecture](docs/intro/architecture.en.md)
-  - [Author and license](docs/intro/author.en.md)
- Installation
-  - [Packages](docs/installation/packages.en.md)
-  - [Proxmox](docs/installation/proxmox.en.md)
-  - [OpenNebula](docs/installation/opennebula.en.md)
-  - [OpenStack](docs/installation/openstack.en.md)
-  - [Kubernetes CSI](docs/installation/kubernetes.en.md)
-  - [Building from Source](docs/installation/source.en.md)
- Configuration
-  - [Overview](docs/config.en.md)
-  - Parameter Reference
-    - [Common](docs/config/common.en.md)
-    - [Network](docs/config/network.en.md)
-    - [Client](docs/config/client.en.md)
-    - [Global Disk Layout](docs/config/layout-cluster.en.md)
-    - [OSD Disk Layout](docs/config/layout-osd.en.md)
-    - [OSD Runtime Parameters](docs/config/osd.en.md)
-    - [Monitor](docs/config/monitor.en.md)
-  - [Pool configuration](docs/config/pool.en.md)
-  - [Image metadata in etcd](docs/config/inode.en.md)
- Usage
-  - [vitastor-cli](docs/usage/cli.en.md) (command-line interface)
-  - [vitastor-disk](docs/usage/disk.en.md) (disk management tool)
-  - [fio](docs/usage/fio.en.md) for benchmarks
-  - [NBD](docs/usage/nbd.en.md) for kernel mounts
-  - [QEMU and qemu-img](docs/usage/qemu.en.md)
-  - [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
-  - [Administration](docs/usage/admin.en.md)
- Performance
-  - [Understanding storage performance](docs/performance/understanding.en.md)
-  - [Theoretical performance](docs/performance/theoretical.en.md)
-  - [Example comparison with Ceph](docs/performance/comparison1.en.md)
-  - [Newer benchmark of Vitastor 1.3.1](docs/performance/bench2.en.md)
+- Just like Ceph, Vitastor has Pools, PGs, OSDs, Monitors, Failure Domains, Placement Tree.
+- Just like Ceph, Vitastor is transactional (even though there's a "lazy fsync mode" which
+  doesn't implicitly flush every operation to disks).
+- OSDs also have journal and metadata and they can also be put on separate drives.
+- Just like in Ceph, client library attempts to recover from any cluster failure so
+  you can basically reboot the whole cluster and only pause, but not crash, your clients
+  (I consider this a bug if the client crashes in that case).
+
+Some basic terms for people not familiar with Ceph:
+
+- OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
+- PG (Placement Group) is a container for data that (normally) shares the same replicas.
+- Pool is a container for data that has the same redundancy scheme and placement rules.
+- Monitor is a separate daemon that watches cluster state and handles failures.
+- Failure Domain is a group of OSDs that you allow to fail. It's "host" by default.
+- Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
+
+Architectural differences from Ceph:
+
+- Vitastor's primary focus is on SSDs. Proper SSD+HDD optimizations may be added in the future, though.
+- Vitastor OSD is (and will always be) single-threaded. If you want to dedicate more than 1 core
+  per drive you should run multiple OSDs each on a different partition of the drive.
+  Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases.
+- Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
+  and data store block size which is 128 KB by default. With 128 KB blocks, metadata should occupy
+  around 512 MB per 1 TB (which is still less than Ceph wants). Journal doesn't have to be big,
+  the example test below was conducted with only 16 MB journal. A big journal is probably even
+  harmful as dirty write metadata also take some memory.
+- Vitastor storage layer doesn't have internal copy-on-write or redirect-write. I know that maybe
+  it's possible to create a good copy-on-write storage, but it's much harder and makes performance
+  less deterministic, so CoW isn't used in Vitastor.
+- The basic layer of Vitastor is block storage with fixed-size blocks, not object storage with
+  rich semantics like in Ceph (RADOS).
+- There's a "lazy fsync" mode which allows batching writes before flushing them to the disk.
+  This makes it possible to use Vitastor with desktop SSDs, but still lowers performance due to additional
+  network roundtrips, so use server SSDs with capacitor-based power loss protection
+  ("Advanced Power Loss Protection") for best performance.
+- PGs are ephemeral. This means that they aren't stored on data disks and only exist in memory
+  while OSDs are running.
+- Recovery process is per-object (per-block), not per-PG. Also there are no PGLOGs.
+- Monitors don't store data. Cluster configuration and state is stored in etcd in simple human-readable
+  JSON structures. Monitors only watch cluster state and handle data movement.
+  Thus Vitastor's Monitor isn't a critical component of the system and is more similar to Ceph's Manager.
+  Vitastor's Monitor is implemented in node.js.
+- PG distribution isn't based on consistent hashes. All PG mappings are stored in etcd.
+  Rebalancing PGs between OSDs is done by mathematical optimization - data distribution problem
+  is reduced to a linear programming problem and solved by lp_solve. This allows for almost
+  perfect (96-99% uniformity compared to Ceph's 80-90%) data distribution in most cases, ability
+  to map PGs by hand without breaking rebalancing logic, reduced OSD peer-to-peer communication
+  (on average, OSDs have fewer peers) and less data movement. It also probably has a drawback -
+  this method may fail in very large clusters, but up to several hundreds of OSDs it's perfectly fine.
+  It's also easy to add consistent hashes in the future if something proves their necessity.
+- There's no separate CRUSH layer. You select pool redundancy scheme, placement root, failure domain
+  and so on directly in pool configuration.
+
+## Understanding Storage Performance
+
+The most important thing for fast storage is latency, not parallel iops.
+
+The best possible latency is achieved with one thread and queue depth of 1 which basically means
+"client load as low as possible". In this case IOPS = 1/latency, and this number doesn't
+scale with number of servers, drives, server processes or threads and so on.
+Single-threaded IOPS and latency numbers only depend on *how fast a single daemon is*.
+
+Why is it important? It's important because some of the applications *can't* use
+queue depth greater than 1 because their task isn't parallelizable. A notable example
+is any ACID DBMS because all of them write their WALs sequentially with fsync()s.
+
+fsync, by the way, is another important thing often missing in benchmarks. The point is
+that drives have cache buffers and don't guarantee that your data is actually persisted
+until you call fsync() which is translated to a FLUSH CACHE command by the OS.
+
+Desktop SSDs are very fast without fsync - NVMes, for example, can process ~80000 write
+operations per second with queue depth of 1 without fsync - but they're really slow with
+fsync because they have to actually write data to flash chips when you call fsync. Typical
+number is around 1000-2000 iops with fsync.
+
+Server SSDs often have supercapacitors that act as a built-in UPS and allow the drive
+to flush its DRAM cache to the persistent flash storage when a power loss occurs.
+This makes them perform equally well with and without fsync. This feature is called
+"Advanced Power Loss Protection" by Intel; other vendors either call it similarly
+or directly as "Full Capacitor-Based Power Loss Protection".
+
+All software-defined storage systems that I currently know of are slow in terms of latency.
+Notable examples are Ceph and internal SDSes used by cloud providers like Amazon, Google,
+Yandex and so on. They're all slow and can only reach ~0.3ms read and ~0.6ms 4 KB write latency
+with best-in-slot hardware.
+
+And that's in the SSD era when you can buy an SSD that has ~0.04ms latency for $100.
+
+I use the following 6 commands with small variations to benchmark any storage:
+
+- Linear write:
+  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=write -runtime=60 -filename=/dev/sdX`
+- Linear read:
+  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=read -runtime=60 -filename=/dev/sdX`
+- Random write latency (this hurts storages the most):
+  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/sdX`
+- Random read latency:
+  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -rw=randread -runtime=60 -filename=/dev/sdX`
+- Parallel write iops (use numjobs if a single CPU core is insufficient to saturate the load):
+  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randwrite -runtime=60 -filename=/dev/sdX`
+- Parallel read iops (use numjobs if a single CPU core is insufficient to saturate the load):
+  `fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randread -runtime=60 -filename=/dev/sdX`
+
+## Vitastor's Theoretical Maximum Random Access Performance
+
+Replicated setups:
+- Single-threaded (T1Q1) read latency: 1 network roundtrip + 1 disk read.
+- Single-threaded write+fsync latency:
+  - With immediate commit: 2 network roundtrips + 1 disk write.
+  - With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush.
+- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
+- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)).
+
+EC/XOR setups:
+- Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read.
+- Single-threaded write+fsync latency:
+  - With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes.
+  - With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs.
+  - 0.5 is actually (k-1)/k which means that an additional roundtrip doesn't happen when
+    the read sub-operation can be served locally.
+- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
+- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)).
+  In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula.
+
+Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
+1. Journal block write
+2. Journal data write
+3. Metadata block write
+4. Another journal block write for EC/XOR setups
+5. Data block write
+
+If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
+lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
+
+Lazy fsync also reduces WA for parallel workloads because journal blocks are only
+written when they fill up or fsync is requested.
+
+## Example Comparison with Ceph
+
+Hardware configuration: 4 nodes, each with:
+- 6x SATA SSD Intel D3-4510 3.84 TB
+- 2x Xeon Gold 6242 (16 cores @ 2.8 GHz)
+- 384 GB RAM
+- 1x 25 GbE network interface (Mellanox ConnectX-4 LX), connected to a Juniper QFX5200 switch
+
+CPU powersaving was disabled. Both Vitastor and Ceph were configured with 2 OSDs per 1 SSD.
+
+All of the results below apply to 4 KB blocks.
+
+Raw drive performance:
+- T1Q1 write ~27000 iops (~0.037ms latency)
+- T1Q1 read ~9800 iops (~0.101ms latency)
+- T1Q32 write ~60000 iops
+- T1Q32 read ~81700 iops
+
+Ceph 15.2.4 (Bluestore):
+- T1Q1 write ~1000 iops (~1ms latency)
+- T1Q1 read ~1750 iops (~0.57ms latency)
+- T8Q64 write ~100000 iops, total CPU usage by OSDs about 40 virtual cores on each node
+- T8Q64 read ~480000 iops, total CPU usage by OSDs about 40 virtual cores on each node
+
+T8Q64 tests were conducted over 8 400GB RBD images from all hosts (every host was running 2 instances of fio).
+This is because Ceph has performance penalties related to running multiple clients over a single RBD image.
+
+cephx_sign_messages was set to false during tests, RocksDB and Bluestore settings were left at defaults.
+
+In fact, not that bad for Ceph. These servers are an example of well-balanced Ceph nodes.
+However, CPU usage and I/O latency were through the roof, as usual.
+
+Vitastor:
+- T1Q1 write: 7087 iops (0.14ms latency)
+- T1Q1 read: 6838 iops (0.145ms latency)
+- T2Q64 write: 162000 iops, total CPU usage by OSDs about 3 virtual cores on each node
+- T8Q64 read: 895000 iops, total CPU usage by OSDs about 4 virtual cores on each node
+
+T8Q64 read test was conducted over 1 larger inode (3.2T) from all hosts (every host was running 2 instances of fio).
+Vitastor has no performance penalties related to running multiple clients over a single inode.
+If conducted from one node with all primary OSDs moved to other nodes, the result was slightly lower (689000 iops).
+This is because all operations resulted in network roundtrips between the client and the primary OSD.
+When fio was colocated with OSDs (like in Ceph benchmarks above), 1/4 of the read workload actually
+used the loopback network.
+
+Vitastor was configured with: `--disable_data_fsync true --immediate_commit all --flusher_count 8
+  --disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096
+  --journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024
+  --journal_size 16777216`.
+
+### NBD
+
+NBD is currently required to mount Vitastor via kernel, but it imposes additional overhead
+due to additional copying between the kernel and userspace. This mostly hurts linear
+bandwidth, not iops.
+
+Vitastor with single-thread NBD on the same hardware:
+- T1Q1 write: 6000 iops (0.166ms latency)
+- T1Q1 read: 5518 iops (0.18ms latency)
+- T1Q128 write: 94400 iops
+- T1Q128 read: 103000 iops
+- Linear write (4M T1Q128): 1266 MB/s (compared to 2600 MB/s via fio)
+- Linear read (4M T1Q128): 975 MB/s (compared to 1400 MB/s via fio)
+
+## Building
+
+- Install Linux kernel 5.4 or newer for io_uring support.
+- Install liburing 0.4 or newer and its headers.
+- Install lp_solve.
+- Install etcd.
+- Install node.js 12 or newer.
+- Install gcc and g++ 9.x.
+- Clone https://yourcmc.ru/git/vitalif/vitastor/ with submodules.
+- Install QEMU 4.x or 5.x, get its source, begin to build it, stop the build and copy headers:
+   - `<qemu>/include` &rarr; `<vitastor>/qemu/include`
+   - Debian:
+      * Use qemu packages from the main repository
+      * `<qemu>/b/qemu/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
+      * `<qemu>/b/qemu/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
+   - CentOS 8:
+      * Use qemu packages from the Advanced-Virtualization repository. To enable it, run
+        `yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu`
+      * `<qemu>/config-host.h` &rarr; `<vitastor>/qemu/b/qemu/config-host.h`
+      * `<qemu>/qapi` &rarr; `<vitastor>/qemu/b/qemu/qapi`
+   - `config-host.h` and `qapi` are required because they contain generated headers
+- Install fio 3.16, get its source and symlink it into `<vitastor>/fio`. It doesn't currently
+  build with fio 3.20 or newer due to the conflicts between g++ and gcc's atomics. This will
+  be fixed in the future.
+- Build Vitastor with `make -j8`.
+- Copy binaries somewhere.
+
+## Running
+
+Please note that the startup procedure isn't currently simple - you specify configuration
+and calculate disk offsets almost by hand. This will be fixed in the near future.
+
+- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs
+  with lazy fsync, but prepare for inferior single-thread latency.
+- Get a fast network (at least 10 Gbit/s).
+- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
+- Install etcd with `--max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision` options.
+- Create global configuration in etcd: `etcdctl put /vitastor/config/global '{"immediate_commit":"all"}'`
+  (if all your drives have capacitors).
+- Create pool configuration in etcd: `etcdctl put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'`.
+- Calculate offsets for your drives with `node ./mon/simple-offsets.js /dev/sdX`.
+- Make systemd units for your OSDs. Look at `./mon/make-units.sh` for example.
+  Notable configuration variables from the example:
+  - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
+  - `immediate_commit all` - use this if all your drives are server-grade.
+  - `disable_device_lock 1` - only required if you run multiple OSDs on one block device.
+  - `flusher_count 16` - flusher is a micro-thread that removes old data from the journal.
+    More flushers mean more aggressive journal flushing which allows for more throughput
+    but slightly hurts latency under less load. Flushing will probably be improved in the future
+    because currently high queue depths sometimes lead to performance degradation.
+  - `disk_alignment`, `journal_block_size`, `meta_block_size` should be set to the internal
+    block size of your SSDs which is 4096 on most drives.
+  - `journal_no_same_sector_overwrites true` prevents multiple overwrites of the same journal sector.
+    Some SSDs (like Intel D3-4510) don't like such overwrites so they benefit from this setting.
+    When this setting is set, it is also required to raise `journal_sector_buffer_count` setting,
+    which is the number of dirty journal sectors that may be written to at the same time.
+- `systemctl start vitastor.target` everywhere.
+- Start any number of monitors: `cd mon; node mon-main.js --etcd_url 'http://10.115.0.10:2379,http://10.115.0.11:2379,http://10.115.0.12:2379,http://10.115.0.13:2379' --etcd_prefix '/vitastor' --etcd_start_timeout 5`.
+- At this point, one of the monitors will configure PGs and OSDs will start them.
+- You can check PG states with `etcdctl get --prefix /vitastor/pg/state`. All PGs should become 'active'.
+- Run tests with (for example): `fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
+- Upload VM disk image with qemu-img (for example):
+  ```
+  LD_PRELOAD=./qemu_driver.so qemu-img convert -f qcow2 debian10.qcow2 -p \
+    -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
+  ```
+- Run QEMU with (for example):
+  ```
+  LD_PRELOAD=./qemu_driver.so qemu-system-x86_64 -enable-kvm -m 1024 \
+    -drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none \
+    -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512 \
+    -vnc 0.0.0.0:0
+  ```
+
+## Known Problems
+
+- OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET"
+  if you try to load them with very high iodepths because the io_uring queue (ring) is limited
+  in size and OSDs don't check whether it fills up.
+- Object deletion requests may currently lead to unfound objects on crashes because
+  proper handling of deletions in a cluster requires a "three-phase cleanup process"
+  and it's currently not implemented. In fact, even though deletion requests are
+  implemented, there's no user tool to delete anything from the cluster yet :).
+  Of course I'll create such a tool, but its first implementation will be vulnerable to this issue.
+  It's not a big deal though, because you'll be able to just repeat the deletion request
+  in this case.
+
+## Implementation Principles
+
+- I like simple and stupid solutions, so expect Vitastor to stay simple.
+- I also like reinventing the wheel to some extent, like writing my own HTTP client
+  for etcd interaction instead of using prebuilt libraries, because in this case
+  I'm confident about what my code does and what it doesn't do.
+- I don't care about C++ "best practices" like RAII or proper inheritance or usage of
+  smart pointers or whatever and I don't intend to change my mind, so if you're here
+  looking for ideal reference C++ code, this probably isn't the right place.
+- I like node.js better than any other dynamically-typed language interpreter
+  because it's faster than any other interpreter in the world, has a neutral C-like
+  syntax and a built-in event loop. That's why the Monitor is implemented in node.js.

 ## Author and License

 Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+

-Join Vitastor Telegram Chat: https://t.me/vitastor
+You can also find me in the Russian Telegram Ceph chat: https://t.me/ceph_ru

 All server-side code (OSD, Monitor and so on) is licensed under the terms of
-Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
+Vitastor Network Public License 1.0 (VNPL 1.0), a copyleft license based on
 GNU GPLv3.0 with the additional "Network Interaction" clause which requires
 opensourcing all programs directly or indirectly interacting with Vitastor
-through a computer network and expressly designed to be used in conjunction
-with it ("Proxy Programs"). Proxy Programs may be made public not only under
-the terms of the same license, but also under the terms of any GPL-Compatible
-Free Software License, as listed by the Free Software Foundation.
+through a computer network ("Proxy Programs"). Proxy Programs may be made public
+not only under the terms of the same license, but also under the terms of any
+GPL-Compatible Free Software License, as listed by the Free Software Foundation.
 This is a stricter copyleft license than the Affero GPL.

-Please note that VNPL doesn't require you to open the code of proprietary
-software running inside a VM if it's not specially designed to be used with
-Vitastor.
-
 Basically, you can't use the software in a proprietary environment to provide
 its functionality to users without opensourcing all intermediary components
 standing between the user and Vitastor or purchasing a commercial license
 from the author 😀.

 Client libraries (cluster_client and so on) are dual-licensed under the same
-VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
+VNPL 1.0 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
 software like QEMU and fio.

-You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt).
+You can find the full text of VNPL-1.0 in the file [VNPL-1.0.txt](VNPL-1.0.txt).
 GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt).
--- a/VNPL-1.0.txt
+++ b/VNPL-1.0.txt
@ -1,7 +1,7 @@
                     VITASTOR NETWORK PUBLIC LICENSE
-                     Version 1.1,  6 February 2021
+                       Version 1, 17 September 2020

- Copyright (C) 2021 Vitaliy Filippov <vitalif@yourcmc.ru>
+ Copyright (C) 2020 Vitaliy Filippov <vitalif@yourcmc.ru>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

@ -61,7 +61,7 @@ modification follow.

  0. Definitions.

-  "This License" refers to version 1.1 of the Vitastor Network Public License.
+  "This License" refers to version 1 of the Vitastor Network Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
 works, such as semiconductor masks.
@ -540,15 +540,12 @@ License would be to refrain entirely from conveying the Program.

  13. Remote Network Interaction.

-  A "Proxy Program" means a separate program which is specially designed to
-be used in conjunction with the covered work and interacts with it directly
-or indirectly through any kind of API (application programming interfaces),
-a computer network, an imitation of such network, or another Proxy Program
-itself.
-
-  Notwithstanding any other provision of this License, if you provide any user
-with an opportunity to interact with the covered work through a computer
-network, an imitation of such network, or any number of "Proxy Programs",
+  Notwithstanding any other provision of this License, if you provide
+any user an opportunity to interact with the covered work directly
+or indirectly through a computer network, an imitation of such network,
+or an additional program (hereinafter referred to as a "Proxy Program")
+that, in turn, interacts with the covered work through a computer network,
+an imitation of such network, or another Proxy Program itself,
 you must prominently offer that user an opportunity to receive the
 Corresponding Source of the covered work and all Proxy Programs from a
 network server at no charge, through some standard or customary means of
@ -629,7 +626,7 @@ the "copyright" line and a pointer to where the full notice is found.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the Vitastor Network Public License as published by
-    the Vitastor Author, either version 1.1 of the License, or
+    the Vitastor Author, either version 1 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
--- a/VNPL-1.1-RU.txt
+++ b/VNPL-1.1-RU.txt
@ -1,680 +0,0 @@
-                СЕТЕВАЯ ПУБЛИЧНАЯ ЛИЦЕНЗИЯ VITASTOR
-                  VITASTOR NETWORK PUBLIC LICENSE
-                   Версия 1.1, от 6 февраля 2021
-
- Автор лицензии: Виталий Филиппов <vitalif@yourcmc.ru>, 2021 год
- Каждый имеет право копировать и распространять точные копии этой
- лицензии, но без внесения изменений.
-
-                            ПРЕАМБУЛА
-
-  Сетевая Публичная Лицензия Vitastor - это свободная "копилефт" лицензия для
-для программного обеспечения (ПО) и других видов произведений, специально
-разработанная, чтобы гарантировать кооперацию с сообществом при разработке
-сетевых приложений.
-
-  Большинство лицензий на программное обеспечение и другие произведения
-спроектированы так, чтобы лишить Вас свободы делиться ими и изменять их.
-Сетевая Публичная Лицензия Vitastor, напротив, разработана с целью
-гарантировать Ваше право распространять и вносить изменения во все версии
-программного обеспечения -- для уверенности, что ПО останется свободным для
-всех пользователей.
-
-  Когда мы говорим о свободном ПО, мы имеем в виду свободу использования, а не
-бесплатность. Свободные лицензии, такие, как Сетевая Публичная Лицензия
-Vitastor, составлены для того, чтобы убедиться, что у Вас есть право
-распространять копии свободного ПО (и взимать плату за них, если Вы хотите),
-что Вы получаете исходные тексты или можете получить их, если захотите, что Вы
-можете изменять программное обеспечение или использовать его части в новых
-свободных программах, и что Вы знаете о своем праве делать всё это.
-
-  Разработчики, использующие Сетевую Публичную Лицензию Vitastor, гарантируют
-Ваши права при помощи следующих мер: (1) закрепляют авторское право на
-программное обеспечение, и (2) предлагают Вам принять условия настоящей
-Лицензии, закрепляющей Ваше право на создание копий, распространение и (или)
-модификацию программного обеспечения.
-
-  Еще одно преимущество защиты свободы всех пользователей заключается в том,
-что улучшения, сделанные в разных версиях программы, при их широком
-распространении становятся доступными для использования другими разработчиками.
-Многие разработчики программного обеспечения воодушевляются этим
-сотрудничеством и пользуются его преимуществами. Однако, если программное
-обеспечение используется на сетевых серверах, данный результат не всегда
-достигается. Генеральная публичная лицензия GNU разрешает создание измененных
-версий и предоставление неограниченного доступа к ним, не делая общедоступным
-их исходный текст. Даже генеральная публичная лицензия GNU Affero разрешает
-использование модифицированной версии свободной программы в закрытой среде, где
-внешние пользователи взаимодействуют с ней только через закрытый промежуточный
-интерфейс (прокси), опять же, без открытия в свободный публичный доступ как
-самой программы, так и прокси.
-
-  Сетевая Публичная Лицензия Vitastor разработана специально, чтобы
-гарантировать, что в таких случаях и модифицированная версия программы, и
-прокси останутся доступными сообществу. Для этого лицензия требует от
-операторов сетевых серверов предоставлять исходный код оригинальной программы,
-а также всех других программ, взаимодействующих с ней на их серверах,
-пользователям этих серверов, на условиях свободных лицензий. Таким образом,
-публичное использование изменённой версии ПО на сервере, прямо или косвенно
-доступном пользователям, даёт пользователям доступ к исходным кодам изменённой
-версии.
-
-  Детальные определения используемых терминов и описание условий копирования,
-распространения и внесения изменений приведены ниже.
-
-                        ТЕРМИНЫ И УСЛОВИЯ
-
-  0. Определения.
-
-  "Настоящая Лицензия" -- версия 1.1 Сетевой Публичной Лицензии Vitastor.
-
-  Под "Авторским правом" понимаются все законы, сходные с авторско-правовыми,
-которые применяются к любым видам работ, например, к топологиям микросхем.
-
-  Термином "Программа" обозначается любое охраноспособное произведение,
-используемое в соответствии с настоящей Лицензией. Лицензиат именуется "Вы".
-"Лицензиаты" и "получатели" могут быть как физическими лицами, так и
-организациями.
-
-  "Внесение изменений" в произведение означает копирование или адаптацию
-произведения целиком или в части, способом, требующим разрешения
-правообладателя, за исключением изготовления его точной копии. Получившееся
-произведение называется "измененной версией" предыдущего произведения или
-произведением, "основанным на" более ранней работе.
-
-  Термином "Лицензионное произведение" обозначается неизмененная Программа или
-произведение, основанное на Программе.
-
-  "Распространение" произведения означает совершение с ним действий, которые
-при отсутствии разрешения сделают Вас прямо или косвенно ответственным за
-нарушение действующего закона об авторском праве, за исключением запуска на
-компьютере или изменения копии, созданной в личных целях. Распространение
-включает в себя копирование, раздачу копий (с изменениями или без них),
-доведение до всеобщего сведения, а в некоторых странах -- и другие действия.
-
-  "Передача" произведения означает любой вид распространения, который позволяет
-другим лицам создавать или получать копии произведения. Обычное взаимодействие
-с пользователем через компьютерную сеть без создания копии передачей не
-является.
-
-  Интерактивный интерфейс пользователя должен отображать "Информация об
-авторском праве", достаточную для того, чтобы (1) обеспечить отображение
-соответствующего уведомления об авторских правах и (2) сообщить пользователю
-о том, что ему не предоставляются никакие гарантии на произведение (за
-исключением явным образом предоставленных гарантий), о том, что лицензиаты
-могут передавать произведение на условиях, описанных в настоящей Лицензии,
-а также о том, как ознакомиться с текстом настоящей Лицензии. Если интерфейс
-предоставляет собой список пользовательских команд или настроек, наподобие
-меню, это требование считается выполненным при наличии явно выделенного
-пункта в таком меню.
-
-  1. Исходный текст.
-
-  Под "Исходным текстом" понимается произведение в форме, которая более всего
-подходит для внесения в него изменений. "Объектным кодом" называется
-произведение в любой иной форме.
-
-  "Стандартный интерфейс" -- интерфейс, который либо является общепринятым
-стандартом, введенным общепризнанным органом по стандартизации, либо, в случае
-интерфейсов, характерных для конкретного языка программирования -- тот,
-который широко используется разработчиками, пишущими программы на этом языке.
-
-  "Системные библиотеки" исполняемого произведения включают в себя то, что не
-относится к произведению в целом и при этом (a) входит в обычный комплект
-Основного компонента, но при этом не является его частью и (b) служит только
-для обеспечения работы с этим Основным компонентом или для реализации
-Стандартного интерфейса, для которого существует общедоступная реализация,
-опубликованная в виде исходного текста. "Основным компонентом" в данном
-контексте назван главный существенный компонент (ядро, оконная система и т.д.)
-определенной операционной системы (если она используется), под управлением
-которой функционирует исполняемое произведение, либо компилятор, используемый
-для создания произведения или интерпретатор объектного кода, используемый для
-его запуска.
-
-  "Полный исходный текст" для произведения в форме объектного кода -- весь
-исходный текст, необходимый для создания, установки и (для исполняемого
-произведения) функционирования объектного кода, а также модификации
-произведения, включая сценарии, контролирующие эти действия. Однако он не
-включает в себя Системные библиотеки, необходимые для функционирования
-произведения, инструменты общего назначения или общедоступные свободные
-программы, которые используются в неизменном виде для выполнения этих
-действий, но не являются частью произведения. Полный исходный текст включает
-в себя, например, файлы описания интерфейса, прилагаемые к файлам исходного
-текста произведения, а также исходные тексты общих библиотек и динамически
-связанных подпрограмм, которые требуются для функционирования произведения
-и разработаны специально для этого, например, для прямой передачи данных
-или управления потоками между этими подпрограммами и другими частями
-произведения. Полный исходный текст не включает в себя то, что пользователи
-могут сгенерировать автоматически из других частей Полного исходного текста.
-Полным исходным текстом для произведения в форме исходных текстов является
-само это произведение.
-
-  2. Основные права.
-
-  Все права, предоставленные на основании настоящей Лицензии, действуют в
-течение срока действия авторских прав на Программу и не могут быть отозваны
-при условии, что сформулированные в ней условия соблюдены. Настоящая Лицензия
-однозначно подтверждает Ваши неограниченные права на запуск неизмененной
-Программы. Настоящая Лицензия распространяется на результаты функционирования
-Лицензионного произведения только в том случае, если они, учитывая их
-содержание, сами являются частью Лицензионного произведения. Настоящая
-Лицензия подтверждает Ваши права на  свободное использование произведения
-или другие аналогичные полномочия, предусмотренные действующим
-законодательством об авторском праве.
-
-  Если Вы не осуществляете обычную передачу Лицензионного произведения, то
-можете как угодно создавать, запускать и распространять его копии до тех пор,
-пока ваша Лицензия сохраняет силу. Вы можете передавать Лицензионные
-произведения третьим лицам исключительно для того, чтобы они внесли в них
-изменения для Вас или предоставили Вам возможность их запуска, при условии,
-что Вы соглашаетесь с условиями настоящей Лицензии при передаче всех
-материалов, авторскими правами на которые Вы не обладаете. Лица, создающие
-или запускающие Лицензионные произведения для Вас, должны делать это
-исключительно от Вашего имени, под Вашим руководством и контролем, на
-условиях, которые запрещают им создание без Вашей санкции каких-либо копий
-материалов, на которые Вы обладаете авторским правом.
-
-  Любая другая передача разрешается исключительно при соблюдении описанных
-ниже условий. Сублицензирование не допускается; раздел 10 делает его не нужным.
-
-  3. Защита прав пользователей от законов, запрещающих обход технических средств.
-
-  Ни одно Лицензионное произведение не должно считаться содержащим эффективные
-технические средства, удовлетворяющие требованиям любого действующего закона,
-принятого для исполнения обязательств, предусмотренных статьей 11 Договора ВОИС
-по авторскому праву от 20 декабря 1996 года или аналогичных законов,
-запрещающих или ограничивающих обход таких технических средств.
-
-  При передаче Лицензионного произведения Вы отказываетесь от всех
-предоставляемых законом полномочий по запрету обхода технических средств,
-используемых авторами в связи с осуществлением их прав, признавая, что такой
-обход находится в рамках осуществления прав на использование Лицензионного
-произведения, предоставленных настоящей Лицензией; также Вы отказываетесь
-от любых попыток ограничить функционирование произведения или внесение в него
-изменений, направленных на реализацию предоставленных Вам законом прав на
-запрет пользователю обхода технических средств.
-
-  4. Передача неизмененных копий.
-
-  Вы можете передавать точные копии исходных текстов Программы в том виде,
-в котором Вы их получили, на любом носителе, при условии, что Вы прилагаете
-к каждой копии соответствующее уведомление об авторских правах способом,
-обеспечивающим ознакомление с ним пользователя; сохраняете все уведомления
-о том, что к тексту применима настоящая Лицензия и любые ограничения,
-добавленные в соответствии с разделом 7; сохраняете все уведомления об
-отсутствии каких-либо гарантий; предоставляете всем получателям вместе с
-Программой копию настоящей Лицензии.
-
-  Вы можете установить любую цену за каждую копию, которую Вы передаете,
-или распространять копии бесплатно; также Вы можете предложить поддержку
-или гарантию за отдельную плату.
-
-  5. Передача измененных исходных текстов.
-
-  Вы можете передавать исходный текст произведения, основанного на Программе,
-или изменений, необходимых для того, чтобы получить его из Программы, на
-условиях, описанных в разделе 4, при соблюдении следующих условий:
-
-    а) Произведение должно содержать уведомления о произведенных Вами
-    изменениях с указанием их даты, сделанные способом, обеспечивающим
-    ознакомление с ними пользователя.
-
-    b) Произведение должно содержать уведомление о том, что оно
-    распространяется на условиях настоящей Лицензии, а также об условиях,
-    добавленных в соответствии с разделом 7, сделанное способом,
-    обеспечивающим ознакомление с ним пользователя. Данное требование имеет
-    приоритет над требованиями раздела 4 "оставлять нетронутыми все
-    уведомления".
-
-    c) Вы должны передать на условиях настоящей Лицензии всю работу целиком
-    любому лицу, которое приобретает копию. Таким образом, настоящая Лицензия
-    вместе с любыми применимыми условиями раздела 7 будет применяться к
-    произведению в целом и всем его частям, независимо от их комплектности.
-    Настоящая Лицензия не дает права на лицензирование произведения на любых
-    других условиях, но это не лишает законной силы такое разрешение, если Вы
-    получили его отдельно.
-
-    d) Если произведение имеет интерактивные пользовательские интерфейсы,
-    каждый из них должен отображать Информацию об авторском праве; однако,
-    если Программа имеет пользовательские интерфейсы, которые не отображают
-    информацию об авторском праве, от Вашего произведения этого также не
-    требуется.
-
-  Включение Лицензионного произведения в подборку на разделе хранения данных
-или на носителе, на котором распространяется произведение, вместе с другими
-отдельными самостоятельными произведениями, которые по своей природе не
-являются переработкой Лицензионного произведения и не объединены с ним,
-например, в программный комплекс, называется "набором", если авторские права
-на подборку не используются для ограничения доступа к ней или законных прав
-её пользователей сверх того, что предусматривают лицензии на отдельные
-произведения. Включение Лицензионного произведения в набор не влечет применения
-положений настоящей Лицензии к остальным его частям.
-
-  6. Передача произведения в формах, не относящихся к исходному тексту.
-
-  Вы можете передавать Лицензионное произведение в виде объектного кода в
-соответствии с положениями разделов 4 и 5, при условии, что Вы также передаете
-машиночитаемый Полный исходный текст в соответствии с условиями настоящей
-Лицензии, одним из следующих способов:
-
-    а) Передавая объектный код или содержащий его материальный продукт (включая
-    распределенный материальный носитель), с приложением Полного исходного
-    текста наматериальном носителе, обычно используемом для обмена программным
-    обеспечением.
-
-    b) Передавая объектный код или содержащий его материальный продукт (включая
-    носитель, на котором распространяется произведение), с письменным
-    предложением, действительным в течение не менее трех лет либо до тех пор,
-    пока Вы предоставляете запасные части или поддержку для данного продукта,
-    о передаче любому обладателю объектного кода (1) копии Полного исходного
-    текста для всего программного обеспечения, содержащегося в продукте, на
-    которое распространяется действие настоящей Лицензии, на физическом
-    носителе, обычно используемом для обмена программным обеспечением, по цене,
-    не превышающей разумных затрат на передачу копии, или (2) доступа к Полному
-    исходному тексту с возможностью его копирования с сетевого сервера без
-    взимания платы.
-
-    с) Передавая отдельные копии объектного кода с письменной копией предложения
-    о предоставлении Полного исходного текста. Этот вариант допускается только
-    в отдельных случаях при распространении без извлечения прибыли, и только
-    если Вы получили объектный код с таким предложением в соответствии
-    с пунктом 6b.
-
-    d) Передавая объектный код посредством предоставления доступа к нему по
-    определенному адресу (бесплатно или за дополнительную плату), и предлагая
-    эквивалентный доступ к Полному исходному тексту таким же способом по тому же
-    адресу без какой-либо дополнительной оплаты. От Вас не требуется принуждать
-    получателей копировать Полный исходный текст вместе с объектным кодом. Если
-    объектный код размещен на сетевом сервере, Полный исходный текст может
-    находиться на другом сервере (управляемом Вами или третьим лицом), который
-    предоставляет аналогичную возможность копирования; при этом Вы должны четко
-    указать рядом с объектным кодом способ получения Полного исходного текста.
-    Независимо от того, на каком сервере расположен Полный исходный текст, Вы
-    обязаны убедиться в том, что он будет распространяться в течение времени,
-    необходимого для соблюдения этих требований.
-
-    e) Передавая объектный код с использованием одноранговой (пиринговой) сети,
-    при условии информирования других пользователей сети о том, где можно
-    бесплатно получить объектный код и Полный исходный текст произведения
-    способом, описанным в пункте 6d.
-
-  Не нужно включать в передаваемый объектный код его отделимые части, исходные
-тексты которых не входят в состав Полного исходного текста, такие как Системные
-библиотеки.
-
-  "Потребительский товар" это либо (1) "товар, предназначенный для личных нужд",
-под которым понимается любое материальное личное имущество, которое обычно
-используется для личных, семейных или домашних целей, или (2) что-либо
-спроектированное или продающееся для использования в жилище. При определении
-того, предназначен ли товар для личных нужд, сомнения должны толковаться в
-пользу положительного ответа на этот вопрос. Применительно к конкретному
-товару, используемому конкретным пользователем, под выражением "обычно
-используется" имеется в виду способ, которым данный вид товаров преимущественно
-или как правило используется, независимо от статуса конкретного пользователя
-или способа, которым конкретный пользователь использует, предполагает или
-будет использовать товар. Товар относится к предназначенным для личных нужд
-независимо от того, насколько часто он используется в коммерческой
-деятельности, промышленности или иной сфере, не относящейся к личным нуждам,
-за исключением случая, когда использование в этой сфере представляет собой
-единственный основной способ использования такого товара.
-
-  "Информация, необходимая для установки" Потребительского товара -- любые
-методы, процедуры, сведения, необходимые для авторизации, или другая
-информация, необходимая для установки и запуска в Потребительском товаре
-измененных версий Лицензионного произведения, полученных при изменении
-Полного исходного текста. Данная информация должна быть достаточной для
-того, чтобы обеспечить возможность внесения в исходный текст изменений,
-не приводящих к ограничению или нарушению его дальнейшей работоспособности.
-
-  Если вместе с Потребительским товаром или специально для использования
-в нём Вы передаете произведение в виде объектного кода на условиях, описанных
-в данном разделе, и такая передача является частью сделки, по которой право
-владения и пользования Потребительским товаром переходит к получателю
-пожизненно или на определенный срок (независимо от признаков сделки), Полный
-исходный текст, передаваемый согласно данному разделу, должен сопровождаться
-Информацией, необходимой для установки. Но это требование не применяется,
-если ни Вы, ни какое-либо третье лицо не сохраняет за собой возможности
-установки измененного объектного кода на Потребительский товар (например,
-произведение было установлено в постоянную память).
-
-  Требование о предоставлении Информации, необходимой для установки, не
-включает в себя требование продолжения оказания услуг по поддержке,
-предоставления гарантии или обновлений для произведения, которое было изменено
-или установлено получателем, либо для Потребительского товара, в котором оно
-было изменено или на который оно было установлено. В доступе к сети может быть
-отказано, если само внесение изменений существенно и негативно влияет на
-работу сети, нарушает правила обмена данными или не поддерживает протоколы для
-обмена данными по сети.
-
-  Передаваемый в соответствии с данным разделом Полный исходный текст и
-предоставленная Информация, необходимая для установки, должны быть записаны в
-формате, который имеет общедоступное описание (и общедоступную реализацию,
-опубликованную в форме исходного текста) и не должны требовать никаких
-специальных паролей или ключей для распаковки, чтения или копирования.
-
-  7. Дополнительные условия.
-
-  "Дополнительными разрешениями" называются условия, которые дополняют условия
-настоящей Лицензии, делая исключения из одного или нескольких её положений.
-Дополнительные разрешения, которые применимы ко всей Программе, должны
-рассматриваться как часть настоящей Лицензии, в той степени, в которой они
-соответствуют действующему законодательству. Если дополнительные разрешения
-применяются только к части Программы, эта часть может быть использована отдельно
-на измененных условиях, но вся Программа продолжает использоваться на условиях
-настоящей Лицензии без учета дополнительных разрешений.
-
-  Когда Вы передаете копию Лицензионного произведения, Вы можете по своему
-усмотрению исключить любые дополнительные разрешения, примененные к этой копии
-или к любой её части. (Для дополнительных разрешений может быть заявлено
-требование об их удалении в определенных случаях, когда Вы вносите изменения в
-произведение.) Вы можете добавлять дополнительные разрешения к добавленным Вами
-в Лицензионное произведение материалам, на которые Вы обладаете авторскими
-правами или правом выдачи соответствующего разрешения.
-
-  Независимо от любых других положений настоящей Лицензии, Вы можете дополнить
-следующими условиями положения настоящей Лицензии в отношении материала,
-добавленного к Лицензионному произведению (если это разрешено обладателями
-авторских прав на материал):
-
-    a) отказом от гарантий или ограничением ответственности, отличающимися от
-    тех, что описаны в разделах 15 и 16 настоящей Лицензии; либо
-
-    b) требованием сохранения соответствующей информации о правах или об
-    авторстве материала, или включения её в Информацию об авторском праве,
-    отображаемую содержащим его произведением; либо
-
-    c) запретом на искажение информации об источнике происхождения материала
-    или требованием того, чтобы измененные версии такого материала содержали
-    корректную отметку об отличиях от исходной версии; либо
-
-    d) ограничением использования в целях рекламы имен лицензиаров или авторов
-    материала; либо
-
-    e) отказом от предоставления прав на использование в качестве товарных
-    знаков некоторых торговых наименований, товарных знаков или знаков
-    обслуживания; либо
-
-    f) требованием от каждого, кто по договору передает материал (или его
-    измененные версии), предоставления компенсации лицензиарам и авторам
-    материала в виде принятия на себя любой ответственности, которую этот
-    договор налагает на лицензиаров и авторов.
-
-  Все остальные ограничительные дополнительные условия считаются "дополнительными
-запретами" по смыслу раздела 10. Если программа, которую Вы получили, или любая
-её часть содержит уведомление о том, что наряду с настоящей Лицензией её
-использование регулируется условием, относящимся к дополнительным запретам, Вы
-можете удалить такое условие. Если лицензия содержит дополнительный запрет, но
-допускает лицензирование на измененных условиях или передачу в соответствии с
-настоящей Лицензией, Вы можете добавить к Лицензионному произведению материал,
-используемый на условиях такой лицензии, в том случае, если дополнительный
-запрет не сохраняется при таком изменении условий лицензии или передаче.
-
-  Если Вы добавляете условия для использования Лицензионного произведения в
-соответствии с настоящим разделом, Вы должны поместить в соответствующих файлах
-исходного текста уведомление о том, что к этим файлам применяются дополнительные
-условия, или указание на то, как ознакомиться с соответствующими условиями.
-
-  Дополнительные разрешающие или ограничивающие условия могут быть сформулированы
-в виде отдельной лицензии или зафиксированы как исключения; вышеуказанные
-требования применяются в любом случае.
-
-  8. Прекращение действия.
-
-  Вы не можете распространять Лицензионное произведение или вносить в него
-изменения на условиях, отличающихся от явно оговоренных в настоящей Лицензии.
-Любая попытка распространения или внесения изменений на иных условиях является
-ничтожной и автоматически прекращает Ваши права, полученные по настоящей
-Лицензии (включая лицензию на любые патенты, предоставленные согласно третьему
-пункту раздела 11).
-
-  Тем не менее если Вы прекращаете нарушение настоящей Лицензии, Ваши права,
-полученные от конкретного правообладателя, восстанавливаются (а) временно, до
-тех пор пока правообладатель явно и окончательно не прекратит действие Ваших
-прав, и (б) навсегда, если правообладатель не уведомит Вас о нарушении с помощью
-надлежащих средств в течение 60 дней после прекращения нарушений.
-
-  Кроме того, Ваши права, полученные от конкретного правообладателя,
-восстанавливаются навсегда, если правообладатель впервые любым подходящим
-способом уведомляет Вас о нарушении настоящей Лицензии на свое произведение (для
-любого произведения) и Вы устраняете нарушение в течение 30 дней после получения
-уведомления.
-
-  Прекращение Ваших прав, описанное в настоящем разделе, не прекращает действие
-лицензий лиц, которые получили от Вас копии произведения или права,
-предоставляемые настоящей Лицензией. Если Ваши права были прекращены навсегда и
-не восстановлены, Вы не можете вновь получить право на тот же материал на
-условиях, описанных в разделе 10.
-
-  9. Акцепт не требуется для получения копий.
-
-  Вы не обязаны принимать условия настоящей Лицензии для того, чтобы получить или
-запустить копию Программы. Случайное распространение Лицензионного произведения,
-происходящее вследствие использования одноранговой (пиринговой) сети для
-получения его копии, также не требует принятия этих условий. Тем не менее только
-настоящая Лицензия дает Вам право распространять или изменять любое Лицензионное
-произведение. Если Вы не приняли условия настоящей Лицензии, такие действия
-будут нарушением авторского права. Поэтому изменяя или распространяя
-Лицензионное произведение, Вы выражаете согласие с условиями настоящей Лицензии.
-
-  10. Автоматическое получение прав последующими получателями.
-
-  Каждый раз, когда Вы передаете Лицензионное произведение, получатель
-автоматически получает от его лицензиара право запускать, изменять и
-распространять это произведение при условии соблюдения настоящей Лицензии. Вы не
-несете ответственности за соблюдение третьими лицами условий настоящей Лицензии.
-
-  "Реорганизацией" называются действия, в результате которых передается управление
-организацией или значительная часть её активов, а также происходит разделение
-или слияние организаций. Если распространение Лицензионного произведения
-является результатом реорганизации, каждая из сторон сделки, получающая копию
-произведения, также получает все права на произведение, которые предшествующее
-юридическое лицо имело или могло предоставить согласно предыдущему абзацу, а
-также право на владение Полным исходным текстом произведения от предшественника,
-осуществляемое в его интересах, если предшественник владеет им или может
-получить его при разумных усилиях.
-
-  Вы не можете налагать каких-либо дополнительных ограничений на осуществление
-прав, предоставленных или подтвержденных в соответствии с настоящей Лицензией.
-Например, Вы не можете ставить осуществление прав, предоставленных по настоящей
-Лицензии, в зависимость от оплаты отчислений, роялти или других сборов; также Вы
-не можете инициировать судебный процесс (включая встречный иск или заявление
-встречного требования в судебном процессе) о нарушении любых патентных прав при
-создании, использовании, продаже, предложении продажи, импорте Программы или
-любой её части.
-
-  11. Патенты.
-
-  "Инвестором" называется правообладатель, разрешающий использование Программы
-либо произведения, на котором основана Программа, на условиях настоящей
-Лицензии. Произведение, лицензированное таким образом, называется "версией со
-вкладом" инвестора.
-
-  "Неотъемлемые патентные претензии" инвестора -- все патентные права,
-принадлежащие инвестору или контролируемые им в настоящее время либо
-приобретенные в будущем, которые могут быть нарушены созданием, использованием
-или продажей версии со вкладом, допускаемыми настоящей Лицензией; они не
-включают в себя права, которые будут нарушены исключительно вследствие будущих
-изменений версии со вкладом. Для целей данного определения под "контролем"
-понимается право выдавать патентные сублицензии способами, не нарушающими
-требований настоящей Лицензии.
-
-  Каждый инвестор предоставляет Вам неисключительную безвозмездную лицензию на
-патент, действующую во всем мире, соответствующую неотъемлемым патентным
-претензиям инвестора, на создание, использование, продажу, предложение для
-продажи, импорт, а также запуск, внесение изменений и распространение всего, что
-входит в состав версии со вкладом.
-
-  В следующих трех абзацах "лицензией на патент" называется любое явно выраженное
-вовне согласие или обязательство не применять патент (например, выдача
-разрешения на использование запатентованного объекта или обещание не подавать в
-суд за нарушение патента). "Выдать" кому-то такую лицензию на патент означает
-заключить такое соглашение или обязаться не применять патент против него.
-
-  Если Вы передаете Лицензионное произведение, сознательно основываясь на лицензии
-на патент, в то время как Полный исходный текст произведения невозможно
-бесплатно скопировать с общедоступного сервера или другим не вызывающим
-затруднений способом, Вы должны либо (1) обеспечить возможность такого доступа к
-Полному исходному тексту, либо (2) отказаться от прав, предоставленных по
-лицензии на патент для данного произведения, либо (3) принять меры по передаче
-лицензии на патент последующим получателям произведения, в соответствии с
-требованиями настоящей Лицензии. "Сознательно основываясь" означает, что Вы
-знаете, что при отсутствии лицензии на патент передача Вами Лицензионного
-произведения в определенной стране или использование получателем переданного ему
-Вами Лицензионного произведения в этой стране нарушит один или несколько
-определенных патентов этой страны, срок действия которых не истек.
-
-  Если в соответствии или в связи с единичной сделкой либо соглашением Вы
-передаете или делаете заказ на распространение Лицензионного произведения, и
-предоставляете определенным лицам, получающим Лицензионное произведение,
-лицензию на патент, разрешающую им использовать, распространять, вносить
-изменения или передавать конкретные экземпляры Лицензионного произведения,
-права, которые Вы предоставляете по лицензии на патент, автоматически переходят
-ко всем получателям Лицензионного произведения и произведений, созданных на его
-основе.
-
-  Патентная лицензия называется "дискриминирующей", если она не покрывает,
-запрещает осуществление или содержит в качестве условия отказ от применения
-одного или нескольких прав, предоставленных настоящей Лицензией. Вы не можете
-передавать Лицензионное произведение, если Вы являетесь участником договора с
-третьим лицом, осуществляющим распространение программного обеспечения, в
-соответствии с которым Вы делаете в пользу третьего лица выплаты, размер которых
-зависит от масштабов Вашей деятельности по передаче произведения, и в
-соответствии с которым любое третье лицо, получающее от Вас Лицензионное
-произведение, делает это на условиях дискриминирующей патентной лицензии (а)
-которая зависит от количества копий Лицензионного произведения, переданных Вами
-(или копий, сделанных с этих копий), или (b) которая используется
-преимущественно в конкретных товарах или подборках, содержащих Лицензионное
-произведение, или в связи с ними, в том случае, если Вы заключили данный договор
-или получили лицензию на патент после 28 марта 2007 года.
-
-  Ничто в настоящей Лицензии не должно толковаться как исключение или ограничение
-любого предполагаемого права или других способов противодействия нарушениям,
-которые во всем остальном могут быть доступны для Вас в соответствии с
-применимым патентным правом.
-
-  12. Запрет отказывать в свободе другим.
-
-  Если на Вас наложены обязанности (будь то по решению суда, договору или иным
-способом), которые противоречат условиям настоящей Лицензии, это не освобождает
-Вас от соблюдения её условий. Если Вы не можете передать Лицензионное
-произведение так, чтобы одновременно выполнять Ваши обязательства по настоящей
-Лицензии и любые другие относящиеся к делу обязательства, то Вы не можете
-передавать его вообще. Например, если Вы согласны с условием, обязывающими Вас
-производить сбор отчислений за дальнейшую передачу от тех, кому Вы передаете
-Программу, то для того, чтобы соблюсти это условие и выполнить требования
-настоящей Лицензии, Вы должны полностью воздержаться от передачи Программы.
-
-  13. Удаленное сетевое взаимодействие.
-
-  Под "Прокси-программой" понимается отдельная программа, специально
-разработанная для использования совместно с Лицензионным произведением,
-и взаимодействующая с ним прямо или косвенно через любой вид программного
-интерфейса, компьютерную сеть, имитацию такой сети, или, в свою очередь,
-через другую Прокси-программу.
-
-  Независимо от любых других положений настоящей Лицензии, если вы
-предоставляете любому пользователю возможность взаимодействовать с Лицензионным
-произведением через компьютерную сеть, имитацию такой сети, или через любое
-количество "Прокси-программ", вы должны в явной форме предложить этому
-пользователю возможность получить Полный исходный текст Лицензионного
-произведения и всех Прокси-программ путём предоставления доступа к нему
-с сетевого сервера без взимания платы, посредством стандартных или
-традиционных способов, используемых для копирования программного обеспечения.
-Полный исходный текст Лицензионного произведения должен предоставляться
-пользователю на условиях настоящей Лицензии, а Полный исходный текст
-Прокси-программ должен предоставляться пользователю либо на условиях настоящей
-Лицензии, либо на условиях одной из свободных лицензий, совместимых с
-Генеральной публичной Лицензией GNU, перечисленных Фондом Свободного
-Программного Обеспечения в списке под названием "Лицензии свободных программ,
-совместимые с GPL".
-
-  14. Пересмотренные редакции настоящей Лицензии.
-
-  Автор настоящей Лицензии время от времени может публиковать пересмотренные
-и (или) новые редакции Сетевой Публичной Лицензии Vitastor. Они будут аналогичны
-по смыслу настоящей редакции, но могут отличаться от нее в деталях, направленных
-на решение новых проблем или регулирование новых отношений.
-
-  Каждой редакции присваивается собственный номер. Если для Программы указано,
-что к ней применима определенная редакция Сетевой Публичной Лицензии Vitastor
-"или любая более поздняя редакция", у Вас есть возможность использовать термины
-и условия, содержащиеся в редакции с указанным номером или любой более поздней
-редакции, опубликованной автором настоящей Лицензии. Если для Программы не
-указан номер редакции Сетевой Публичной Лицензии Vitastor, Вы можете выбрать
-любую редакцию, опубликованную автором настоящей Лицензии.
-
-  Более поздние редакции Лицензии могут дать Вам дополнительные или принципиально
-иные права. Тем не менее в результате Вашего выбора более поздней редакции на
-автора или правообладателя не возлагается никаких дополнительных обязанностей.
-
-  15. Отказ от гарантий.
-
-  НА ПРОГРАММУ НЕ ПРЕДОСТАВЛЯЕТСЯ НИКАКИХ ГАРАНТИЙ ЗА ИСКЛЮЧЕНИЕМ ПРЕДУСМОТРЕННЫХ
-ДЕЙСТВУЮЩИМ ЗАКОНОДАТЕЛЬСТВОМ. ЕСЛИ ИНОЕ НЕ УКАЗАНО В ПИСЬМЕННОЙ ФОРМЕ,
-ПРАВООБЛАДАТЕЛИ И (ИЛИ) ТРЕТЬИ ЛИЦА ПРЕДОСТАВЛЯЮТ ПРОГРАММУ "КАК ЕСТЬ", БЕЗ
-КАКИХ-ЛИБО ЯВНЫХ ИЛИ ПОДРАЗУМЕВАЕМЫХ ГАРАНТИЙ, ВКЛЮЧАЯ ГАРАНТИИ ПРИГОДНОСТИ ДЛЯ
-КОНКРЕТНЫХ ЦЕЛЕЙ, НО НЕ ОГРАНИЧИВАЯСЬ ИМИ. ВЕСЬ РИСК, СВЯЗАННЫЙ С КАЧЕСТВОМ И
-ПРОИЗВОДИТЕЛЬНОСТЬЮ ПРОГРАММЫ, ВОЗЛАГАЕТСЯ НА ВАС. ЕСЛИ В ПРОГРАММЕ БУДУТ
-ВЫЯВЛЕНЫ НЕДОСТАТКИ, ВЫ ПРИНИМАЕТЕ НА СЕБЯ СТОИМОСТЬ ВСЕГО НЕОБХОДИМОГО
-ОБСЛУЖИВАНИЯ, РЕМОНТА ИЛИ ИСПРАВЛЕНИЯ.
-
-  16. Ограничение ответственности.
-
-  ЕСЛИ ИНОЕ НЕ ПРЕДУСМОТРЕНО ДЕЙСТВУЮЩИМ ЗАКОНОДАТЕЛЬСТВОМ ИЛИ СОГЛАШЕНИЕМ СТОРОН,
-ЗАКЛЮЧЕННЫМ В ПИСЬМЕННОЙ ФОРМЕ, ПРАВООБЛАДАТЕЛЬ ИЛИ ИНОЕ ЛИЦО, КОТОРОЕ ВНОСИТ
-ИЗМЕНЕНИЯ В ПРОГРАММУ И (ИЛИ) ПЕРЕДАЕТ ЕЁ НА УСЛОВИЯХ, СФОРМУЛИРОВАННЫХ ВЫШЕ, НЕ
-МОЖЕТ НЕСТИ ОТВЕТСТВЕННОСТЬ ПЕРЕД ВАМИ ЗА ПРИЧИНЕННЫЙ УЩЕРБ, ВКЛЮЧАЯ УЩЕРБ
-ОБЩЕГО ЛИБО КОНКРЕТНОГО ХАРАКТЕРА, ПРИЧИНЕННЫЙ СЛУЧАЙНО ИЛИ ЯВЛЯЮЩИЙСЯ
-СЛЕДСТВИЕМ ИСПОЛЬЗОВАНИЯ ПРОГРАММЫ ЛИБО НЕВОЗМОЖНОСТИ ЕЁ ИСПОЛЬЗОВАНИЯ (В ТОМ
-ЧИСЛЕ ЗА УНИЧТОЖЕНИЕ ИЛИ МОДИФИКАЦИЮ ИНФОРМАЦИИ, ЛИБО УБЫТКИ, ПОНЕСЕННЫЕ ВАМИ
-ИЛИ ТРЕТЬИМИ ЛИЦАМИ, ЛИБО СБОИ ПРОГРАММЫ ПРИ ВЗАИМОДЕЙСТВИИ С ДРУГИМ ПРОГРАММНЫМ
-ОБЕСПЕЧЕНИЕМ), В ТОМ ЧИСЛЕ И В СЛУЧАЯХ, КОГДА ПРАВООБЛАДАТЕЛЬ ИЛИ ТРЕТЬЕ ЛИЦО
-ПРЕДУПРЕЖДЕНЫ О ВОЗМОЖНОСТИ ПРИЧИНЕНИЯ ТАКИХ УБЫТКОВ.
-
-  17. Толкование разделов 15 и 16.
-
-  Если отказ от гарантии и ограничение ответственности, представленные выше, по
-закону не могут быть применены в соответствии с их условиями, суды,
-рассматривающие спор, должны применить действующий закон, который в наибольшей
-степени предусматривает абсолютный отказ от всей гражданской ответственности в
-связи с Программой, за исключением случаев, когда гарантия или принятие на себя
-ответственности за копию программы предоставляется за плату.
-
-                        КОНЕЦ ОПРЕДЕЛЕНИЙ И УСЛОВИЙ
-
-          Порядок применения условий Лицензии к Вашим программам
-
-  Если Вы разрабатываете новую программу и хотите, чтобы её использование принесло
-максимальную пользу обществу, наилучший способ достичь этого -- сделать её
-свободной, чтобы все могли распространять и изменять её на условиях настоящей
-Лицензии.
-
-  Для этого сделайте так, чтобы программа содержала в себе описанные ниже
-уведомления. Самым надежным способом это сделать является включение их в начало
-каждого файла исходного текста, чтобы наиболее эффективным образом сообщить об
-отсутствии гарантий; каждый файл должен иметь по меньшей мере одну строку с
-оповещением об авторских правах и указанием на то, где находится полный текст
-уведомлений.
-
-    <Строка с названием Программы и информацией о её назначении.>
-    Copyright © <год выпуска программы в свет>  <имя автора>
-
-    Эта программа является свободным программным обеспечением: Вы можете
-    распространять её и (или) изменять, соблюдая условия Сетевой Публичной
-    Лицензии Vitastor, опубликованной автором Vitastor, либо редакции 1.1
-    Лицензии, либо (на Ваше усмотрение) любой редакции, выпущенной позже.
-
-    Эта программа распространяется в расчете на то, что она окажется полезной,
-    но БЕЗ КАКИХ-ЛИБО ГАРАНТИЙ, включая подразумеваемую гарантию КАЧЕСТВА либо
-    ПРИГОДНОСТИ ДЛЯ ОПРЕДЕЛЕННЫХ ЦЕЛЕЙ. Ознакомьтесь с Сетевой Публичной
-    Лицензией Vitastor для получения более подробной информации.
-
-  Также добавьте информацию о том, как связаться с Вами посредством электронной
-или обычной почты.
-
-  Если ваша программа взаимодействует с пользователями удаленно через
-компьютерную сеть, Вы также должны убедиться, что обеспечили её пользователям
-возможность получить её исходные тексты. Например, если Ваша программа является
-веб-приложением, её интерфейс может отображать ссылку "Исходные коды", которая
-указывает на архив с текстом. Существует много способов, которыми Вы можете
-распространять исходные тексты, для разных программ подходят разные решения;
-ознакомьтесь с разделом 13 для того, чтобы узнать конкретные требования.
--- a/allocator.cpp
+++ b/allocator.cpp
@ -0,0 +1,130 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include <stdexcept>
+#include "allocator.h"
+
+#include <stdlib.h>
+#include <malloc.h>
+
+// Builds a hierarchical bitmap allocator for <blocks> blocks.
+//
+// The bitmap is a 64-ary tree stored in one array: the root word comes first,
+// followed by each deeper level, with the leaf level (one bit per block) last.
+// A set bit in a leaf word means "block allocated"; a set bit in an upper-level
+// word means "the corresponding child word is completely full".
+//
+// Throws std::invalid_argument if blocks <= 1 or blocks >= 0x80000000.
+allocator::allocator(uint64_t blocks)
+{
+    if (blocks >= 0x80000000 || blocks <= 1)
+    {
+        throw std::invalid_argument("blocks");
+    }
+    // p2 = number of words a full leaf level would need (a power of 64),
+    // total = number of words in all levels if the leaf level were full
+    uint64_t p2 = 1, total = 1;
+    while (p2 * 64 < blocks)
+    {
+        p2 = p2 * 64;
+        total += p2;
+    }
+    // Replace the "full" leaf level size with the actually required one
+    total -= p2;
+    total += (blocks+63) / 64;
+    mask = new uint64_t[2 + total];
+    size = free = blocks;
+    // Value of the last leaf word when all of its valid bits are set.
+    // The last word holds (blocks % 64) blocks in its LOW bits, so the mask
+    // must have exactly that many low bits set. The previous expression
+    // shifted by (64 - blocks % 64) and produced the complementary bit count,
+    // which marked the last word as full too early when blocks % 64 > 32.
+    last_one_mask = (blocks % 64) == 0
+        ? UINT64_MAX
+        : ~(UINT64_MAX << (blocks % 64));
+    for (uint64_t i = 0; i < total; i++)
+    {
+        mask[i] = 0;
+    }
+}
+
+// Releases the bitmap array allocated in the constructor.
+allocator::~allocator()
+{
+    delete[] mask;
+}
+
+// Marks block <addr> as allocated (value = true) or free (value = false).
+//
+// Out-of-range addresses are ignored. The leaf bit is updated first; then the
+// change is propagated upwards: when a word becomes completely full its bit in
+// the parent word is set, and when a bit is cleared the parent bit is cleared
+// too. Propagation stops as soon as a level already holds the wanted value.
+// The free block counter is adjusted only when the leaf bit actually changes.
+void allocator::set(uint64_t addr, bool value)
+{
+    if (addr >= size)
+    {
+        return;
+    }
+    // Find the leaf level: offset = index of its first word,
+    // p2 = number of words a full leaf level would contain
+    uint64_t p2 = 1, offset = 0;
+    while (p2 * 64 < size)
+    {
+        offset += p2;
+        p2 = p2 * 64;
+    }
+    uint64_t cur_addr = addr;
+    bool is_last = true;
+    uint64_t value64 = value ? 1 : 0;
+    while (1)
+    {
+        uint64_t last = offset + cur_addr/64;
+        uint64_t bit = cur_addr % 64;
+        // Use an unsigned 64-bit one: shifting a signed long is undefined
+        // for bit >= 32 on platforms with 32-bit long and for bit 63 everywhere
+        uint64_t bit_mask = (uint64_t)1 << bit;
+        if (((mask[last] >> bit) & 1) != value64)
+        {
+            if (is_last)
+            {
+                // Only leaf bits correspond to real blocks
+                if (value)
+                    free--;
+                else
+                    free++;
+            }
+            if (value)
+            {
+                mask[last] = mask[last] | bit_mask;
+                // Only the very last leaf word may be partially used,
+                // so it is compared against last_one_mask instead of all-ones
+                if (mask[last] != (!is_last || cur_addr/64 < size/64
+                    ? UINT64_MAX : last_one_mask))
+                {
+                    // Word is not full yet - parent stays as is
+                    break;
+                }
+            }
+            else
+            {
+                mask[last] = mask[last] & ~bit_mask;
+            }
+            is_last = false;
+            if (p2 > 1)
+            {
+                // Move one level up
+                p2 = p2 / 64;
+                offset -= p2;
+                cur_addr /= 64;
+            }
+            else
+            {
+                break;
+            }
+        }
+        else
+        {
+            break;
+        }
+    }
+}
+
+// Returns the address of a free block or UINT64_MAX when there is none.
+// Descends from the root word, taking the lowest clear bit at every level.
+uint64_t allocator::find_free()
+{
+    uint64_t level_words = 1, level_start = 0, pos = 0;
+    while (level_words < size)
+    {
+        const uint64_t word = mask[level_start + pos];
+        if (word == UINT64_MAX)
+        {
+            // No space
+            return UINT64_MAX;
+        }
+        // Index of the lowest zero bit; it exists because word != all-ones
+        uint64_t zero_bit = 0;
+        while ((word >> zero_bit) & 1)
+        {
+            zero_bit++;
+        }
+        pos = (pos * 64) | zero_bit;
+        if (pos >= size)
+        {
+            // No space
+            return UINT64_MAX;
+        }
+        level_start += level_words;
+        level_words *= 64;
+    }
+    return pos;
+}
+
+// Returns the current value of the free block counter maintained by set().
+uint64_t allocator::get_free_count()
+{
+    return free;
+}
--- a/src/util/allocator.h
+++ b/src/util/allocator.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -8,7 +8,6 @@
 // Hierarchical bitmap allocator
 class allocator
 {
-    uint64_t total;
    uint64_t size;
    uint64_t free;
    uint64_t last_one_mask;
@ -16,12 +15,7 @@ class allocator
 public:
    allocator(uint64_t blocks);
    ~allocator();
-    bool get(uint64_t addr);
    void set(uint64_t addr, bool value);
    uint64_t find_free();
    uint64_t get_free_count();
 };
-
-void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
-void bitmap_clear(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
-bool bitmap_check(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
--- a/base64.cpp
+++ b/base64.cpp
@ -0,0 +1,55 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "base64.h"
+
+// Encodes <in> as standard base64 (alphabet A-Z a-z 0-9 + /) with '=' padding
+// up to a multiple of 4 output characters.
+std::string base64_encode(const std::string &in)
+{
+    static const char alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    std::string encoded;
+    // acc collects input bits, pending is the shift of the next 6-bit group
+    unsigned acc = 0;
+    int pending = -6;
+    for (unsigned char byte: in)
+    {
+        acc = (acc << 8) + byte;
+        for (pending += 8; pending >= 0; pending -= 6)
+        {
+            encoded.push_back(alphabet[(acc >> pending) & 0x3F]);
+        }
+    }
+    if (pending > -6)
+    {
+        // Leftover bits that do not form a whole group, zero-padded on the right
+        encoded.push_back(alphabet[((acc << 8) >> (pending + 8)) & 0x3F]);
+    }
+    encoded.append((4 - encoded.size() % 4) % 4, '=');
+    return encoded;
+}
+
+// Reverse lookup table: character code -> 6-bit value, or -1 for characters
+// outside of the base64 alphabet. Filled lazily on the first decode call;
+// T[0] == 0 means "not initialized yet" (it becomes -1 after initialization).
+// Must be explicitly signed: plain char is unsigned on some platforms, where
+// the -1 marker would be stored as 255 and never compare equal to -1.
+static signed char T[256] = { 0 };
+
+// Decodes base64 text from <in>. Decoding stops at the first character that is
+// not part of the base64 alphabet (including '=' padding); everything decoded
+// up to that point is returned.
+std::string base64_decode(const std::string &in)
+{
+    std::string out;
+    if (T[0] == 0)
+    {
+        for (int i = 0; i < 256; i++)
+            T[i] = -1;
+        for (int i = 0; i < 64; i++)
+            T[(unsigned char)("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[i])] = i;
+    }
+    // val collects input bits, valb is the shift of the next complete byte
+    unsigned val = 0;
+    int valb = -8;
+    for (unsigned char c: in)
+    {
+        if (T[c] == -1)
+            break;
+        val = (val<<6) + T[c];
+        valb += 6;
+        if (valb >= 0)
+        {
+            out.push_back(char((val >> valb) & 0xFF));
+            valb -= 8;
+        }
+    }
+    return out;
+}
--- a/base64.h
+++ b/base64.h
@ -0,0 +1,8 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#pragma once
+#include <string>
+
+std::string base64_encode(const std::string &in);
+std::string base64_decode(const std::string &in);
--- a/blockstore.cpp
+++ b/blockstore.cpp
@ -0,0 +1,69 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// blockstore_t is a thin facade: every method below forwards to the
+// blockstore_impl_t instance created in the constructor.
+
+// Creates the implementation object from the given configuration and ring loop.
+blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop)
+{
+    impl = new blockstore_impl_t(config, ringloop);
+}
+
+// Destroys the implementation object.
+blockstore_t::~blockstore_t()
+{
+    delete impl;
+}
+
+// Forwards to blockstore_impl_t::loop().
+void blockstore_t::loop()
+{
+    impl->loop();
+}
+
+// Forwards to blockstore_impl_t::is_started().
+bool blockstore_t::is_started()
+{
+    return impl->is_started();
+}
+
+// Forwards to blockstore_impl_t::is_stalled().
+bool blockstore_t::is_stalled()
+{
+    return impl->is_stalled();
+}
+
+// Forwards to blockstore_impl_t::is_safe_to_stop().
+bool blockstore_t::is_safe_to_stop()
+{
+    return impl->is_safe_to_stop();
+}
+
+// Submits an operation; calls impl->enqueue_op() with the second argument false.
+void blockstore_t::enqueue_op(blockstore_op_t *op)
+{
+    impl->enqueue_op(op, false);
+}
+
+// Submits an operation; calls impl->enqueue_op() with the second argument true.
+// NOTE(review): per the declaration comment in blockstore.h this inserts the
+// operation at the beginning of the queue - confirm against blockstore_impl_t.
+void blockstore_t::enqueue_op_first(blockstore_op_t *op)
+{
+    impl->enqueue_op(op, true);
+}
+
+// Returns a reference to the implementation's unstable_writes map
+// (object_id -> version).
+std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
+{
+    return impl->unstable_writes;
+}
+
+// Forwards to blockstore_impl_t::get_block_size().
+uint32_t blockstore_t::get_block_size()
+{
+    return impl->get_block_size();
+}
+
+// Forwards to blockstore_impl_t::get_block_count().
+uint64_t blockstore_t::get_block_count()
+{
+    return impl->get_block_count();
+}
+
+// Forwards to blockstore_impl_t::get_free_block_count().
+uint64_t blockstore_t::get_free_block_count()
+{
+    return impl->get_free_block_count();
+}
+
+// Forwards to blockstore_impl_t::get_disk_alignment().
+uint32_t blockstore_t::get_disk_alignment()
+{
+    return impl->get_disk_alignment();
+}
--- a/src/blockstore/blockstore.h
+++ b/src/blockstore/blockstore.h
@ -1,5 +1,5 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

@ -9,29 +9,23 @@

 #include <stdint.h>

-#include <string>
 #include <map>
+#include <unordered_map>
 #include <functional>

 #include "object_id.h"
 #include "ringloop.h"
-#include "timerfd_manager.h"

 // Memory alignment for direct I/O (usually 512 bytes)
-#ifndef DIRECT_IO_ALIGNMENT
-#define DIRECT_IO_ALIGNMENT 512
-#endif
-
-// Memory allocation alignment (page size is usually optimal)
+// All other alignments must be a multiple of this one
 #ifndef MEM_ALIGNMENT
-#define MEM_ALIGNMENT 4096
+#define MEM_ALIGNMENT 512
 #endif

 // Default block size is 128 KB, current allowed range is 4K - 128M
-#define DEFAULT_DATA_BLOCK_ORDER 17
-#define MIN_DATA_BLOCK_SIZE 4*1024
-#define MAX_DATA_BLOCK_SIZE 128*1024*1024
-#define DEFAULT_BITMAP_GRANULARITY 4096
+#define DEFAULT_ORDER 17
+#define MIN_BLOCK_SIZE 4*1024
+#define MAX_BLOCK_SIZE 128*1024*1024

 #define BS_OP_MIN 1
 #define BS_OP_READ 1
@ -69,15 +63,9 @@ Input:
 - offset, len = offset and length within object. length may be zero, in that case
  read operation only returns the version / write operation only bumps the version
 - buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
- bitmap = pointer to the new 'external' object bitmap data. Its part which is respective to the
-  write request is copied into the metadata area bitwise and stored there.

 Output:
- retval = number of bytes actually read/written or negative error number
-  -EINVAL = invalid input parameters
-  -ENOENT = requested object/version does not exist for reads
-  -ENOSPC = no space left in the store for writes
-  -EDOM = checksum error.
+- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
 - version = the version actually read or written

 ## BS_OP_DELETE
@ -111,7 +99,7 @@ Input:
 - buf = pre-allocated obj_ver_id array <len> units long

 Output:
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
+- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)

 ## BS_OP_SYNC_STAB_ALL

@ -126,14 +114,11 @@ Output:
 Get a list of all objects in this Blockstore.

 Input:
- pg_alignment = PG alignment
- pg_count = PG count or 0 to list all objects
- pg_number = PG number
- list_stable_limit = max number of clean objects in the reply
-  it's guaranteed that dirty objects are returned from the same interval,
-  i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
- min_oid = min inode/stripe or 0 to list all objects
- max_oid = max inode/stripe or 0 to list all objects
+- oid.stripe = PG alignment
+- len = PG count or 0 to list all objects
+- offset = PG number
+- oid.inode = min inode number or 0 to list all inodes
+- version = max inode number or 0 to list all inodes

 Output:
 - retval = total obj_ver_id count
@ -150,35 +135,17 @@ struct blockstore_op_t
    uint64_t opcode;
    // finish callback
    std::function<void (blockstore_op_t*)> callback;
-    union __attribute__((__packed__))
-    {
-        // R/W
-        struct __attribute__((__packed__))
-        {
    object_id oid;
    uint64_t version;
    uint32_t offset;
    uint32_t len;
-        };
-        // List
-        struct __attribute__((__packed__))
-        {
-            object_id min_oid;
-            object_id max_oid;
-            uint32_t pg_alignment;
-            uint32_t pg_count;
-            uint32_t pg_number;
-            uint32_t list_stable_limit;
-        };
-    };
    void *buf;
-    void *bitmap;
    int retval;

    uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
 };

-typedef std::map<std::string, std::string> blockstore_config_t;
+typedef std::unordered_map<std::string, std::string> blockstore_config_t;

 class blockstore_impl_t;

@ -186,12 +153,9 @@ class blockstore_t
 {
    blockstore_impl_t *impl;
 public:
-    blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
+    blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop);
    ~blockstore_t();

-    // Update configuration
-    void parse_config(blockstore_config_t & config);
-
    // Event loop
    void loop();

@ -210,23 +174,17 @@ public:
    // Submission
    void enqueue_op(blockstore_op_t *op);

-    // Simplified synchronous operation: get object bitmap & current version
-    int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
+    // Insert operation into the beginning of the queue
+    // Intended for the OSD syncer "thread" to be able to stabilize something when the journal is full
+    void enqueue_op_first(blockstore_op_t *op);

-    // Get per-inode space usage statistics
-    std::map<uint64_t, uint64_t> & get_inode_space_stats();
-
-    // Set per-pool no_inode_stats
-    void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
-
-    // Print diagnostics to stdout
-    void dump_diagnostics();
+    // Unstable writes are added here (map of object_id -> version)
+    std::unordered_map<object_id, uint64_t> & get_unstable_writes();

+    // FIXME rename to object_size
    uint32_t get_block_size();
    uint64_t get_block_count();
    uint64_t get_free_block_count();

-    uint64_t get_journal_size();
-
-    uint32_t get_bitmap_granularity();
+    uint32_t get_disk_alignment();
 };
--- a/blockstore_flush.cpp
+++ b/blockstore_flush.cpp
@ -0,0 +1,790 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Creates a flusher with <flusher_count> flusher coroutines bound to blockstore <bs>.
+journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
+{
+    this->bs = bs;
+    this->flusher_count = flusher_count;
+    dequeuing = false;
+    active_flushers = 0;
+    syncing_flushers = 0;
+    // Start flushing once the queue holds as many objects as there are
+    // journal_entry_stable records in one journal block
+    flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
+    journal_trim_interval = flusher_start_threshold;
+    journal_trim_counter = 0;
+    // With an in-memory journal its buffer is reused; otherwise a separate
+    // aligned block is allocated here and freed in the destructor
+    journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
+    co = new journal_flusher_co[flusher_count];
+    for (int i = 0; i < flusher_count; i++)
+    {
+        co[i].bs = bs;
+        co[i].flusher = this;
+    }
+}
+
+// Initializes the coroutine state and the two shared I/O completion callbacks.
+// Both callbacks set bs->live, throw std::runtime_error when the result of the
+// operation differs from the requested length, and otherwise decrement wait_count.
+journal_flusher_co::journal_flusher_co()
+{
+    wait_state = 0;
+    // Completion callback used for reads
+    simple_callback_r = [this](ring_data_t* data)
+    {
+        bs->live = true;
+        if (data->res != data->iov.iov_len)
+        {
+            throw std::runtime_error(
+                "data read operation failed during flush ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
+                "). can't continue, sorry :-("
+            );
+        }
+        wait_count--;
+    };
+    // Completion callback used for writes; the message also reports wait_state
+    simple_callback_w = [this](ring_data_t* data)
+    {
+        bs->live = true;
+        if (data->res != data->iov.iov_len)
+        {
+            throw std::runtime_error(
+                "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
+                "). state "+std::to_string(wait_state)+". in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
+            );
+        }
+        wait_count--;
+    };
+}
+
+// Frees the coroutine array and, unless the journal is in-memory (in which case
+// journal_superblock points at the journal's own buffer), the superblock buffer.
+journal_flusher_t::~journal_flusher_t()
+{
+    if (!bs->journal.inmemory)
+        free(journal_superblock);
+    delete[] co;
+}
+
+// Returns true while at least one flusher is active or dequeuing is enabled.
+bool journal_flusher_t::is_active()
+{
+    return active_flushers > 0 || dequeuing;
+}
+
+// Runs one iteration of each flusher coroutine in turn. The activity condition
+// is re-checked before every coroutine, so the pass ends early once there are
+// no active flushers and dequeuing is off.
+void journal_flusher_t::loop()
+{
+    for (int i = 0; (active_flushers > 0 || dequeuing) && i < flusher_count; i++)
+    {
+        co[i].loop();
+    }
+}
+
+// Schedules object version <ov> for flushing at the END of the flush queue.
+// If the object is already queued, only the version to flush is raised.
+// Starts dequeuing once the queue reaches flusher_start_threshold.
+void journal_flusher_t::enqueue_flush(obj_ver_id ov)
+{
+    auto queued = flush_versions.find(ov.oid);
+    if (queued == flush_versions.end())
+    {
+        // Not queued yet
+        flush_versions[ov.oid] = ov.version;
+        flush_queue.push_back(ov.oid);
+    }
+    else if (queued->second < ov.version)
+    {
+        queued->second = ov.version;
+    }
+    if (dequeuing || flush_queue.size() < flusher_start_threshold)
+    {
+        return;
+    }
+    dequeuing = true;
+    bs->ringloop->wakeup();
+}
+
+// Schedules object version <ov> for flushing at the FRONT of the flush queue.
+// If the object is already queued, only the version to flush is raised and
+// its position in the queue is kept.
+// Starts dequeuing once the queue reaches flusher_start_threshold.
+void journal_flusher_t::unshift_flush(obj_ver_id ov)
+{
+    auto queued = flush_versions.find(ov.oid);
+    if (queued == flush_versions.end())
+    {
+        // Not queued yet
+        flush_versions[ov.oid] = ov.version;
+        flush_queue.push_front(ov.oid);
+    }
+    else if (queued->second < ov.version)
+    {
+        queued->second = ov.version;
+    }
+    if (dequeuing || flush_queue.size() < flusher_start_threshold)
+    {
+        return;
+    }
+    dequeuing = true;
+    bs->ringloop->wakeup();
+}
+
+// Registers one more trim request: enables dequeuing regardless of the queue
+// length, increments trim_wanted and wakes up the ring loop.
+// Each call is expected to be paired with release_trim().
+void journal_flusher_t::request_trim()
+{
+    dequeuing = true;
+    trim_wanted++;
+    bs->ringloop->wakeup();
+}
+
+// Drops one trim request previously registered with request_trim().
+void journal_flusher_t::release_trim()
+{
+    trim_wanted--;
+}
+
+// Coroutine helper for journal_flusher_co::loop(): defines the resume point
+// resume_<label> and tries to obtain a submission queue entry from the blockstore.
+// If none is available, stores <label> in wait_state and returns false from
+// the enclosing function, so that the next call jumps back to resume_<label>.
+// On success, <sqe> and <data> (the ring_data_t attached to the sqe) are set.
+// Expects `sqe`, `data`, `bs` and `wait_state` to be in scope at the point of use.
+#define await_sqe(label) \
+    resume_##label:\
+        sqe = bs->get_sqe();\
+        if (!sqe)\
+        {\
+            wait_state = label;\
+            return false;\
+        }\
+        data = ((ring_data_t*)sqe->user_data);
+
+// FIXME: Implement batch flushing
+// Main body of one flusher coroutine, written as a resumable function.
+// Every point where the coroutine may have to wait has a resume_N label; wait_state
+// remembers N and the goto chain below jumps back there on the next call.
+// Sub-steps (scan_dirty, modify_meta_read, fsync_batch) use wait states relative to the
+// base passed to them, so the caller adds that base after they return false.
+//
+// One pass does the following for a single object taken from the flush queue:
+//   1. pick a flushable version (resume_0),
+//   2. collect journal data to copy (scan_dirty),
+//   3. read the affected metadata block(s), write the data, fsync it,
+//   4. write the metadata block(s), fsync them,
+//   5. update clean_db/dirty_db and optionally trim the journal.
+//
+// Returns false while it is waiting (for a free SQE, for I/O completion or for a batched
+// fsync) and true when it has stopped with wait_state == 0.
+bool journal_flusher_co::loop()
+{
+    // This is much better than implementing the whole function as an FSM
+    // Maybe I should consider a coroutine library like https://github.com/hnes/libaco ...
+    if (wait_state == 1)
+        goto resume_1;
+    else if (wait_state == 2)
+        goto resume_2;
+    else if (wait_state == 3)
+        goto resume_3;
+    else if (wait_state == 4)
+        goto resume_4;
+    else if (wait_state == 5)
+        goto resume_5;
+    else if (wait_state == 6)
+        goto resume_6;
+    else if (wait_state == 7)
+        goto resume_7;
+    else if (wait_state == 8)
+        goto resume_8;
+    else if (wait_state == 9)
+        goto resume_9;
+    else if (wait_state == 10)
+        goto resume_10;
+    else if (wait_state == 12)
+        goto resume_12;
+    else if (wait_state == 13)
+        goto resume_13;
+    else if (wait_state == 14)
+        goto resume_14;
+    else if (wait_state == 15)
+        goto resume_15;
+    else if (wait_state == 16)
+        goto resume_16;
+    else if (wait_state == 17)
+        goto resume_17;
+    else if (wait_state == 18)
+        goto resume_18;
+resume_0:
+    if (!flusher->flush_queue.size() || !flusher->dequeuing)
+    {
+        // Nothing to do (or dequeuing is switched off): stop
+        flusher->dequeuing = false;
+        wait_state = 0;
+        return true;
+    }
+    // Take the next object from the head of the queue
+    cur.oid = flusher->flush_queue.front();
+    cur.version = flusher->flush_versions[cur.oid];
+    flusher->flush_queue.pop_front();
+    flusher->flush_versions.erase(cur.oid);
+    dirty_end = bs->dirty_db.find(cur);
+    if (dirty_end != bs->dirty_db.end())
+    {
+        repeat_it = flusher->sync_to_repeat.find(cur.oid);
+        if (repeat_it != flusher->sync_to_repeat.end())
+        {
+#ifdef BLOCKSTORE_DEBUG
+            printf("Postpone %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
+#endif
+            // We don't flush different parts of history of the same object in parallel
+            // So we check if someone is already flushing this object
+            // In that case we set sync_to_repeat and pick another object
+            // Another coroutine will see it and re-queue the object after it finishes
+            if (repeat_it->second < cur.version)
+                repeat_it->second = cur.version;
+            wait_state = 0;
+            goto resume_0;
+        }
+        else
+            flusher->sync_to_repeat[cur.oid] = 0;
+        // Is this version located in the part of the journal that is still being written?
+        if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+            (bs->journal.dirty_start >= bs->journal.used_start ||
+            dirty_end->second.journal_sector < bs->journal.used_start))
+        {
+            flusher->enqueue_flush(cur);
+            // We can't flush journal sectors that are still written to
+            // However, as we group flushes by oid, current oid may have older writes to flush!
+            // And it may even block writes if we don't flush the older version
+            // (if it's in the beginning of the journal)...
+            // So first try to find an older version of the same object to flush.
+            bool found = false;
+            while (dirty_end != bs->dirty_db.begin())
+            {
+                dirty_end--;
+                if (dirty_end->first.oid != cur.oid)
+                {
+                    break;
+                }
+                if (!(dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+                    (bs->journal.dirty_start >= bs->journal.used_start ||
+                    dirty_end->second.journal_sector < bs->journal.used_start)))
+                {
+                    found = true;
+                    cur.version = dirty_end->first.version;
+                    break;
+                }
+            }
+            if (!found)
+            {
+                // Try other objects
+                flusher->sync_to_repeat.erase(cur.oid);
+                // -1 because the object we have just re-enqueued is at the tail of the queue
+                int search_left = flusher->flush_queue.size() - 1;
+#ifdef BLOCKSTORE_DEBUG
+                printf("Flusher overran writers (dirty_start=%08lx) - searching for older flushes (%d left)\n", bs->journal.dirty_start, search_left);
+#endif
+                while (search_left > 0)
+                {
+                    cur.oid = flusher->flush_queue.front();
+                    cur.version = flusher->flush_versions[cur.oid];
+                    flusher->flush_queue.pop_front();
+                    flusher->flush_versions.erase(cur.oid);
+                    dirty_end = bs->dirty_db.find(cur);
+                    if (dirty_end != bs->dirty_db.end())
+                    {
+                        if (dirty_end->second.journal_sector >= bs->journal.dirty_start &&
+                            (bs->journal.dirty_start >= bs->journal.used_start ||
+                            dirty_end->second.journal_sector < bs->journal.used_start))
+                        {
+#ifdef BLOCKSTORE_DEBUG
+                            printf("Write %lx:%lx v%lu is too new: offset=%08lx\n", cur.oid.inode, cur.oid.stripe, cur.version, dirty_end->second.journal_sector);
+#endif
+                            flusher->enqueue_flush(cur);
+                        }
+                        else
+                        {
+                            repeat_it = flusher->sync_to_repeat.find(cur.oid);
+                            if (repeat_it == flusher->sync_to_repeat.end())
+                            {
+                                flusher->sync_to_repeat[cur.oid] = 0;
+                                break;
+                            }
+                            // The object is already being flushed by another coroutine.
+                            // It has been removed from the queue above, so the request must be
+                            // recorded in sync_to_repeat (same as for the first pick), otherwise
+                            // this flush would be silently lost.
+                            if (repeat_it->second < cur.version)
+                                repeat_it->second = cur.version;
+                        }
+                    }
+                    search_left--;
+                }
+                if (search_left <= 0)
+                {
+#ifdef BLOCKSTORE_DEBUG
+                    printf("No older flushes, stopping\n");
+#endif
+                    flusher->dequeuing = false;
+                    wait_state = 0;
+                    return true;
+                }
+            }
+        }
+#ifdef BLOCKSTORE_DEBUG
+        printf("Flushing %lx:%lx v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
+#endif
+        flusher->active_flushers++;
+resume_1:
+        // Find it in clean_db
+        clean_it = bs->clean_db.find(cur.oid);
+        old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
+        // Scan dirty versions of the object
+        if (!scan_dirty(1))
+        {
+            // scan_dirty left a relative wait state, make it absolute
+            wait_state += 1;
+            return false;
+        }
+        // Writes and deletes shouldn't happen at the same time
+        assert(!(copy_count > 0 || has_writes) || !has_delete);
+        if (copy_count == 0 && !has_writes && !has_delete || has_delete && old_clean_loc == UINT64_MAX)
+        {
+            // Nothing to flush
+            bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
+            goto trim_journal;
+        }
+        if (clean_loc == UINT64_MAX)
+        {
+            if (old_clean_loc == UINT64_MAX)
+            {
+                // Object not allocated. This is a bug.
+                char err[1024];
+                snprintf(
+                    err, 1024, "BUG: Object %lx:%lx v%lu that we are trying to flush is not allocated on the data device",
+                    cur.oid.inode, cur.oid.stripe, cur.version
+                );
+                throw std::runtime_error(err);
+            }
+            else
+            {
+                // No new big write: data is flushed into the existing clean location
+                clean_loc = old_clean_loc;
+            }
+        }
+        // Also we need to submit metadata read(s). We do read-modify-write cycle(s) for every operation.
+    resume_2:
+        if (!modify_meta_read(clean_loc, meta_new, 2))
+        {
+            wait_state += 2;
+            return false;
+        }
+        if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
+        {
+            // The object moves to a new block, so the old metadata entry must be cleared too
+        resume_14:
+            if (!modify_meta_read(old_clean_loc, meta_old, 14))
+            {
+                wait_state += 14;
+                return false;
+            }
+        }
+        else
+            meta_old.submitted = false;
+    resume_3:
+        // Wait for all submitted reads (journal data and metadata blocks)
+        if (wait_count > 0)
+        {
+            wait_state = 3;
+            return false;
+        }
+        if (meta_new.submitted)
+        {
+            // We have read this metadata block ourselves: mark it as read
+            // and wake up coroutines that may be waiting for it
+            meta_new.it->second.state = 1;
+            bs->ringloop->wakeup();
+        }
+        if (meta_old.submitted)
+        {
+            meta_old.it->second.state = 1;
+            bs->ringloop->wakeup();
+        }
+        // Reads completed, submit writes and set bitmap bits
+        if (bs->clean_entry_bitmap_size)
+        {
+            new_clean_bitmap = (bs->inmemory_meta
+                ? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
+                : bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size);
+            if (clean_init_bitmap)
+            {
+                // A big write starts a fresh bitmap covering only its own range
+                memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
+                bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len);
+            }
+        }
+        for (it = v.begin(); it != v.end(); it++)
+        {
+            if (new_clean_bitmap)
+            {
+                bitmap_set(new_clean_bitmap, it->offset, it->len);
+            }
+            await_sqe(4);
+            data->iov = (struct iovec){ it->buf, (size_t)it->len };
+            data->callback = simple_callback_w;
+            my_uring_prep_writev(
+                sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
+            );
+            wait_count++;
+        }
+        // Sync data before writing metadata
+        // (fsync_batch uses 3 consecutive wait states: 16, 17 and 18)
+    resume_16:
+    resume_17:
+    resume_18:
+        if (copy_count && !fsync_batch(false, 16))
+        {
+            wait_state += 16;
+            return false;
+        }
+    resume_5:
+        // And metadata writes, but only after data writes complete
+        if (!bs->inmemory_meta && meta_new.it->second.state == 0 || wait_count > 0)
+        {
+            // metadata sector is still being read or data is still being written, wait for it
+            wait_state = 5;
+            return false;
+        }
+        if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
+        {
+            if (!bs->inmemory_meta && meta_old.it->second.state == 0)
+            {
+                wait_state = 5;
+                return false;
+            }
+            // Clear the metadata entry of the old location and write its block back
+            memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
+            await_sqe(15);
+            data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
+            data->callback = simple_callback_w;
+            my_uring_prep_writev(
+                sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector
+            );
+            wait_count++;
+        }
+        if (has_delete)
+        {
+            // Deleted object: just wipe its metadata entry
+            memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
+        }
+        else
+        {
+            clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
+            // The slot must be either free or already owned by this object
+            assert(new_entry->oid.inode == 0 || new_entry->oid == cur.oid);
+            new_entry->oid = cur.oid;
+            new_entry->version = cur.version;
+            if (!bs->inmemory_meta)
+            {
+                // With in-memory metadata new_clean_bitmap already points into meta_new.buf
+                memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
+            }
+        }
+        await_sqe(6);
+        data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
+        data->callback = simple_callback_w;
+        my_uring_prep_writev(
+            sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector
+        );
+        wait_count++;
+    resume_7:
+        // Wait for metadata writes
+        if (wait_count > 0)
+        {
+            wait_state = 7;
+            return false;
+        }
+        // Done, free all buffers
+        if (!bs->inmemory_meta)
+        {
+            // Metadata blocks are shared between coroutines and freed by the last user
+            meta_new.it->second.usage_count--;
+            if (meta_new.it->second.usage_count == 0)
+            {
+                free(meta_new.it->second.buf);
+                flusher->meta_sectors.erase(meta_new.it);
+            }
+            if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
+            {
+                meta_old.it->second.usage_count--;
+                if (meta_old.it->second.usage_count == 0)
+                {
+                    free(meta_old.it->second.buf);
+                    flusher->meta_sectors.erase(meta_old.it);
+                }
+            }
+        }
+        for (it = v.begin(); it != v.end(); it++)
+        {
+            free(it->buf);
+        }
+        v.clear();
+        // And sync metadata (in batches - not per each operation!)
+        // (fsync_batch uses 3 consecutive wait states: 8, 9 and 10)
+    resume_8:
+    resume_9:
+    resume_10:
+        if (!fsync_batch(true, 8))
+        {
+            wait_state += 8;
+            return false;
+        }
+        // Update clean_db and dirty_db, free old data locations
+        update_clean_db();
+    trim_journal:
+        // Clear unused part of the journal every <journal_trim_interval> flushes
+        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval) || flusher->trim_wanted > 0)
+        {
+            flusher->journal_trim_counter = 0;
+            if (bs->journal.trim())
+            {
+                // Update journal "superblock"
+                await_sqe(12);
+                *((journal_entry_start*)flusher->journal_superblock) = {
+                    .crc32 = 0,
+                    .magic = JOURNAL_MAGIC,
+                    .type = JE_START,
+                    .size = sizeof(journal_entry_start),
+                    .reserved = 0,
+                    .journal_start = bs->journal.used_start,
+                };
+                ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
+                data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
+                data->callback = simple_callback_w;
+                my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
+                wait_count++;
+            resume_13:
+                if (wait_count > 0)
+                {
+                    wait_state = 13;
+                    return false;
+                }
+            }
+        }
+        // All done
+#ifdef BLOCKSTORE_DEBUG
+        printf("Flushed %lx:%lx v%lu (%d copies, wr:%d, del:%d), %ld left\n", cur.oid.inode, cur.oid.stripe, cur.version,
+            copy_count, has_writes, has_delete, flusher->flush_queue.size());
+#endif
+        flusher->active_flushers--;
+        repeat_it = flusher->sync_to_repeat.find(cur.oid);
+        if (repeat_it != flusher->sync_to_repeat.end())
+        {
+            if (repeat_it->second > cur.version)
+            {
+                // Requeue version
+                flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
+            }
+            // Erase only a valid iterator: std::map::erase(end()) is undefined behaviour
+            flusher->sync_to_repeat.erase(repeat_it);
+        }
+        wait_state = 0;
+        goto resume_0;
+    }
+    return true;
+}
+
+// Walk the dirty versions of cur.oid from dirty_end backwards (newest to oldest) and
+// decide what has to be flushed:
+//  - journaled (small) writes are copied into buffers collected in `v`, only for ranges
+//    that are not already covered by a buffer taken from a newer version;
+//  - a big write fixes the target location (clean_loc) and the initial bitmap range;
+//  - a delete sets has_delete.
+// After a big write or a delete is met, skip_copy is set and older versions are only
+// walked over (to find dirty_start), not copied.
+//
+// wait_base: the caller's wait_state value that means "resume at the pending journal
+// read submission". When no SQE is available the function returns false with a relative
+// wait_state (0) and the caller is expected to add wait_base to it.
+// Returns true when the scan is complete.
+// Throws std::runtime_error if a version in an unexpected (not stable) state is met.
+bool journal_flusher_co::scan_dirty(int wait_base)
+{
+    if (wait_state == wait_base)
+    {
+        // Resumed after a failed get_sqe(): all scan state lives in members,
+        // so jump right back into the loops
+        goto resume_0;
+    }
+    dirty_it = dirty_start = dirty_end;
+    v.clear();
+    wait_count = 0;
+    copy_count = 0;
+    clean_loc = UINT64_MAX;
+    has_delete = false;
+    has_writes = false;
+    skip_copy = false;
+    clean_init_bitmap = false;
+    while (1)
+    {
+        if (!IS_STABLE(dirty_it->second.state))
+        {
+            char err[1024];
+            snprintf(
+                err, 1024, "BUG: Unexpected dirty_entry %lx:%lx v%lu state during flush: %d",
+                dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
+            );
+            throw std::runtime_error(err);
+        }
+        else if (IS_JOURNAL(dirty_it->second.state) && !skip_copy)
+        {
+            // First we submit all reads
+            has_writes = true;
+            if (dirty_it->second.len != 0)
+            {
+                // [offset, end_offset) is the part of this write that still has to be examined
+                offset = dirty_it->second.offset;
+                end_offset = dirty_it->second.offset + dirty_it->second.len;
+                it = v.begin();
+                while (1)
+                {
+                    // Find the first collected buffer that starts at or after <offset>
+                    for (; it != v.end(); it++)
+                        if (it->offset >= offset)
+                            break;
+                    if (it == v.end() || it->offset > offset && it->len > 0)
+                    {
+                        // There is a gap before the next buffer (or no buffer at all):
+                        // take this part from the current version.
+                        // submit_offset is the position of the data inside the journal
+                        submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
+                        submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
+                        it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign_or_die(MEM_ALIGNMENT, submit_len) });
+                        copy_count++;
+                        if (bs->journal.inmemory)
+                        {
+                            // Take it from memory
+                            memcpy(it->buf, bs->journal.buffer + submit_offset, submit_len);
+                        }
+                        else
+                        {
+                            // Read it from disk
+                            await_sqe(0);
+                            data->iov = (struct iovec){ it->buf, (size_t)submit_len };
+                            data->callback = simple_callback_r;
+                            my_uring_prep_readv(
+                                sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
+                            );
+                            wait_count++;
+                        }
+                    }
+                    // Continue after the end of the buffer at <it> (either just inserted or pre-existing)
+                    offset = it->offset+it->len;
+                    if (it == v.end() || offset >= end_offset)
+                        break;
+                }
+            }
+        }
+        else if (IS_BIG_WRITE(dirty_it->second.state) && !skip_copy)
+        {
+            // There is an unflushed big write. Copy small writes in its position
+            has_writes = true;
+            clean_loc = dirty_it->second.location;
+            clean_init_bitmap = true;
+            clean_bitmap_offset = dirty_it->second.offset;
+            clean_bitmap_len = dirty_it->second.len;
+            skip_copy = true;
+        }
+        else if (IS_DELETE(dirty_it->second.state) && !skip_copy)
+        {
+            // There is an unflushed delete
+            has_delete = true;
+            skip_copy = true;
+        }
+        // Remember the oldest version visited so far and step to the previous entry,
+        // stopping at the beginning of dirty_db or at another object
+        dirty_start = dirty_it;
+        if (dirty_it == bs->dirty_db.begin())
+        {
+            break;
+        }
+        dirty_it--;
+        if (dirty_it->first.oid != cur.oid)
+        {
+            break;
+        }
+    }
+    return true;
+}
+
+// Prepare the metadata block that contains the entry for data location <meta_loc>
+// for a read-modify-write cycle and describe it in <wr>:
+//   wr.sector    - byte offset of the metadata block inside the metadata area,
+//   wr.pos       - index of the entry inside that block,
+//   wr.buf       - buffer holding the block,
+//   wr.it        - entry in flusher->meta_sectors (only when !bs->inmemory_meta),
+//   wr.submitted - true if this call has submitted a read for the block.
+// With in-memory metadata the block is taken straight from bs->metadata_buffer.
+// Otherwise blocks are shared through flusher->meta_sectors with a usage counter, and
+// the block is read from disk only if it is not there yet.
+//
+// wait_base: the caller's wait_state value that means "resume at the read submission".
+// Returns false (with a relative wait_state, the caller adds wait_base) when no SQE is
+// available, true otherwise. A true result does not mean that the read has completed:
+// the read is only counted in wait_count.
+bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base)
+{
+    if (wait_state == wait_base)
+    {
+        goto resume_0;
+    }
+    // We must check if the same sector is already in memory if we don't keep all metadata in memory all the time.
+    // And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot,
+    // so I'll avoid it as long as I can.
+    wr.submitted = false;
+    // (meta_loc >> block_order) is the data block number; each metadata block holds
+    // (meta_block_size / clean_entry_size) entries
+    wr.sector = ((meta_loc >> bs->block_order) / (bs->meta_block_size / bs->clean_entry_size)) * bs->meta_block_size;
+    wr.pos = ((meta_loc >> bs->block_order) % (bs->meta_block_size / bs->clean_entry_size));
+    if (bs->inmemory_meta)
+    {
+        wr.buf = bs->metadata_buffer + wr.sector;
+        return true;
+    }
+    wr.it = flusher->meta_sectors.find(wr.sector);
+    if (wr.it == flusher->meta_sectors.end())
+    {
+        // Not in memory yet, read it
+        wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->meta_block_size);
+        // The entry is registered before the read is submitted, so it is already
+        // present in meta_sectors (with state 0) if we have to wait for an SQE
+        wr.it = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){
+            .offset = wr.sector,
+            .len = bs->meta_block_size,
+            .state = 0, // 0 = not read yet
+            .buf = wr.buf,
+            .usage_count = 1,
+        }).first;
+        await_sqe(0);
+        data->iov = (struct iovec){ wr.it->second.buf, bs->meta_block_size };
+        data->callback = simple_callback_r;
+        wr.submitted = true;
+        my_uring_prep_readv(
+            sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector
+        );
+        wait_count++;
+    }
+    else
+    {
+        // Someone else has this block in memory already (possibly still being read):
+        // share the buffer and take a reference
+        wr.buf = wr.it->second.buf;
+        wr.it->second.usage_count++;
+    }
+    return true;
+}
+
+// Apply the result of a completed flush to the in-memory state:
+//  - release the old data block if the object has moved to another location,
+//  - for a delete, drop the clean_db entry and release the data block,
+//    otherwise store the new version and location in clean_db,
+//  - remove the flushed versions [dirty_start, dirty_end] from dirty_db.
+void journal_flusher_co::update_clean_db()
+{
+    const bool relocated = (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc);
+    if (relocated)
+    {
+#ifdef BLOCKSTORE_DEBUG
+        printf("Free block %lu (new location is %lu)\n", old_clean_loc >> bs->block_order, clean_loc >> bs->block_order);
+#endif
+        bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
+    }
+    if (!has_delete)
+    {
+        // Regular flush: the object is now clean at <clean_loc>
+        bs->clean_db[cur.oid] = {
+            .version = cur.version,
+            .location = clean_loc,
+        };
+    }
+    else
+    {
+        // Deletion: forget the object and give its block back to the allocator
+        bs->clean_db.erase(bs->clean_db.find(cur.oid));
+        bs->data_alloc->set(clean_loc >> bs->block_order, false);
+        clean_loc = UINT64_MAX;
+    }
+    bs->erase_dirty(dirty_start, std::next(dirty_end), clean_loc);
+}
+
+// Batched fsync shared between flusher coroutines.
+// Each coroutine joins the newest not yet submitted sync of the requested kind (or
+// creates one). The coroutine that finds the batch "ready" (all flusher coroutines are
+// waiting for a sync, or the flush queue is empty) submits the actual fsync; the others
+// wait until the state of their sync becomes non-zero.
+// Sync states as used here: 0 = not submitted, 1 = submitted, 2 = completed.
+//
+// fsync_meta: true to sync the metadata device (meta_fd), false to sync data_fd.
+// wait_base:  first of the 3 consecutive caller wait states reserved for this call
+//             (wait_base = waiting for an SQE, +1 = waiting for fsync completion,
+//             +2 = waiting for someone else's fsync).
+// Returns false with a relative wait_state (0, 1 or 2; the caller adds wait_base) when
+// it has to wait, true when the sync is done or fsyncs are disabled.
+bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
+{
+    if (wait_state == wait_base)
+        goto resume_0;
+    else if (wait_state == wait_base+1)
+        goto resume_1;
+    else if (wait_state == wait_base+2)
+        goto resume_2;
+    // NOTE(review): the non-meta branch checks disable_journal_fsync, but the fsync below
+    // is issued on data_fd - confirm that this is the intended flag.
+    if (!(fsync_meta ? bs->disable_meta_fsync : bs->disable_journal_fsync))
+    {
+        // Look for the newest pending (not yet submitted) sync of the same kind
+        cur_sync = flusher->syncs.end();
+        while (cur_sync != flusher->syncs.begin())
+        {
+            cur_sync--;
+            if (cur_sync->fsync_meta == fsync_meta && cur_sync->state == 0)
+            {
+                goto sync_found;
+            }
+        }
+        // None found, start a new batch
+        cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){
+            .fsync_meta = fsync_meta,
+            .ready_count = 0,
+            .state = 0,
+        });
+    sync_found:
+        cur_sync->ready_count++;
+        flusher->syncing_flushers++;
+        if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
+        {
+            // Sync batch is ready. Do it.
+            await_sqe(0);
+            data->iov = { 0 };
+            data->callback = simple_callback_w;
+            my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
+            cur_sync->state = 1;
+            wait_count++;
+        resume_1:
+            if (wait_count > 0)
+            {
+                wait_state = 1;
+                return false;
+            }
+            // Sync completed. All previous coroutines waiting for it must be resumed
+            cur_sync->state = 2;
+            bs->ringloop->wakeup();
+        }
+        // Wait until someone else sends and completes a sync.
+    resume_2:
+        if (!cur_sync->state)
+        {
+            wait_state = 2;
+            return false;
+        }
+        // Leave the batch; the last participant removes it from the list
+        flusher->syncing_flushers--;
+        cur_sync->ready_count--;
+        if (cur_sync->ready_count == 0)
+        {
+            flusher->syncs.erase(cur_sync);
+        }
+    }
+    return true;
+}
+
+// Set the bits of <bitmap> that correspond to the byte range [start, start+len).
+// One bit covers bs->bitmap_granularity bytes; the end of the range is rounded up
+// to a whole bit. Bits that are already set are left set.
+void journal_flusher_co::bitmap_set(void *bitmap, uint64_t start, uint64_t len)
+{
+    auto granularity = bs->bitmap_granularity;
+    // Shortcuts: a range that covers exactly the first 32 or 64 bits is set with one store
+    if (start == 0 && len == 32*granularity)
+    {
+        *((uint32_t*)bitmap) = UINT32_MAX;
+        return;
+    }
+    if (start == 0 && len == 64*granularity)
+    {
+        *((uint64_t*)bitmap) = UINT64_MAX;
+        return;
+    }
+    uint8_t *bytes = (uint8_t*)bitmap;
+    unsigned last_bit = ((start + len) + granularity - 1) / granularity;
+    for (unsigned bit = start / granularity; bit < last_bit; )
+    {
+        if ((bit & 7) == 0 && last_bit >= bit+8)
+        {
+            // Byte-aligned and at least 8 bits left: fill the whole byte
+            bytes[bit / 8] = UINT8_MAX;
+            bit += 8;
+        }
+        else
+        {
+            // Head or tail of the range: set a single bit
+            bytes[bit / 8] |= 1 << (bit % 8);
+            bit++;
+        }
+    }
+}
--- a/src/blockstore/blockstore_flush.h
+++ b/src/blockstore/blockstore_flush.h
@ -1,22 +1,10 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-
-#define COPY_BUF_JOURNAL 1
-#define COPY_BUF_DATA 2
-#define COPY_BUF_ZERO 4
-#define COPY_BUF_CSUM_FILL 8
-#define COPY_BUF_COALESCED 16
-#define COPY_BUF_META_BLOCK 32
-#define COPY_BUF_JOURNALED_BIG 64
+// License: VNPL-1.0 (see README.md for details)

 struct copy_buffer_t
 {
-    int copy_flags;
-    uint64_t offset, len, disk_offset;
-    uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
+    uint64_t offset, len;
    void *buf;
-    uint8_t *csum_buf;
-    int *dyn_data;
 };

 struct meta_sector_t
@ -49,7 +37,7 @@ class journal_flusher_co
 {
    blockstore_impl_t *bs;
    journal_flusher_t *flusher;
-    int wait_state, wait_count, wait_journal_count;
+    int wait_state, wait_count;
    struct io_uring_sqe *sqe;
    struct ring_data_t *data;

@ -58,40 +46,28 @@ class journal_flusher_co
    obj_ver_id cur;
    std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
    std::map<object_id, uint64_t>::iterator repeat_it;
-    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
+    std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;

-    bool try_trim = false;
    bool skip_copy, has_delete, has_writes;
+    blockstore_clean_db_t::iterator clean_it;
    std::vector<copy_buffer_t> v;
    std::vector<copy_buffer_t>::iterator it;
-    int i;
-    bool fill_incomplete, cleared_incomplete;
-    int read_to_fill_incomplete;
    int copy_count;
-    uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
+    uint64_t clean_loc, old_clean_loc;
    flusher_meta_write_t meta_old, meta_new;
    bool clean_init_bitmap;
    uint64_t clean_bitmap_offset, clean_bitmap_len;
-    uint8_t *clean_init_dyn_ptr;
-    uint8_t *new_clean_bitmap;
+    void *new_clean_bitmap;

-    uint64_t new_trim_pos;
+    // local: scan_dirty()
+    uint64_t offset, end_offset, submit_offset, submit_len;

    friend class journal_flusher_t;
-    void scan_dirty();
-    bool read_dirty(int wait_base);
-    bool modify_meta_do_reads(int wait_base);
-    bool wait_meta_reads(int wait_base);
+    bool scan_dirty(int wait_base);
    bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
-    bool clear_incomplete_csum_block_bits(int wait_base);
-    void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
-    void update_metadata_entry();
-    bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
    void update_clean_db();
-    void free_data_blocks();
    bool fsync_batch(bool fsync_meta, int wait_base);
-    bool trim_journal(int wait_base);
-    void free_buffers();
+    void bitmap_set(void *bitmap, uint64_t start, uint64_t len);
 public:
    journal_flusher_co();
    bool loop();
@ -102,14 +78,13 @@ class journal_flusher_t
 {
    int trim_wanted = 0;
    bool dequeuing;
-    int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
+    int flusher_count;
    int flusher_start_threshold;
    journal_flusher_co *co;
    blockstore_impl_t *bs;
    friend class journal_flusher_co;

-    int journal_trim_counter;
-    bool trimming;
+    int journal_trim_counter, journal_trim_interval;
    void* journal_superblock;

    int active_flushers;
@ -119,23 +94,14 @@ class journal_flusher_t

    std::map<uint64_t, meta_sector_t> meta_sectors;
    std::deque<object_id> flush_queue;
-    std::map<object_id, uint64_t> flush_versions; // FIXME: consider unordered_map?
-
-    bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
-    bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
-
+    std::map<object_id, uint64_t> flush_versions;
 public:
-    journal_flusher_t(blockstore_impl_t *bs);
+    journal_flusher_t(int flusher_count, blockstore_impl_t *bs);
    ~journal_flusher_t();
    void loop();
-    bool is_trim_wanted() { return trim_wanted; }
    bool is_active();
-    void mark_trim_possible();
    void request_trim();
    void release_trim();
    void enqueue_flush(obj_ver_id oid);
-    void unshift_flush(obj_ver_id oid, bool force);
-    void remove_flush(object_id oid);
-    void dump_diagnostics();
-    bool is_mutated(uint64_t clean_loc);
+    void unshift_flush(obj_ver_id oid);
 };
--- a/blockstore_impl.cpp
+++ b/blockstore_impl.cpp
@ -0,0 +1,610 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Set up the blockstore: register with the ring loop, parse the configuration,
+// open the data/metadata/journal devices and create the allocator and the flusher.
+// Any std::exception raised on the way is rethrown after undoing everything done so far.
+blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop)
+{
+    assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
+    this->ringloop = ringloop;
+    ring_consumer.loop = [this]() { loop(); };
+    ringloop->register_consumer(&ring_consumer);
+    initialized = 0;
+    // is_safe_to_stop() reads this flag, so it must start from a known value
+    stop_sync_submitted = false;
+    zero_object = NULL;
+    flusher = NULL;
+    data_fd = meta_fd = journal.fd = -1;
+    try
+    {
+        // block_size has no default value in the class, so the buffer sized by it
+        // may only be allocated after the configuration has been parsed
+        parse_config(config);
+        zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
+        // NOTE(review): assumes zero_object is meant to be an all-zero block (per its name);
+        // aligned allocation by itself does not guarantee zeroed memory
+        memset(zero_object, 0, block_size);
+        open_data();
+        open_meta();
+        open_journal();
+        calc_lengths();
+        data_alloc = new allocator(block_count);
+        flusher = new journal_flusher_t(flusher_count, this);
+    }
+    catch (std::exception & e)
+    {
+        // The destructor does not run when the constructor throws, so clean up by hand.
+        // Most importantly, the ring loop must not keep a consumer whose callback captures `this`
+        ringloop->unregister_consumer(&ring_consumer);
+        delete data_alloc;
+        data_alloc = NULL;
+        free(zero_object);
+        zero_object = NULL;
+        if (data_fd >= 0)
+            close(data_fd);
+        if (meta_fd >= 0 && meta_fd != data_fd)
+            close(meta_fd);
+        if (journal.fd >= 0 && journal.fd != meta_fd)
+            close(journal.fd);
+        throw;
+    }
+}
+
+// Tear down the blockstore: release helper objects and buffers, detach from the ring loop
+// and close the opened device descriptors.
+blockstore_impl_t::~blockstore_impl_t()
+{
+    delete data_alloc;
+    delete flusher;
+    free(zero_object);
+    ringloop->unregister_consumer(&ring_consumer);
+    // A descriptor equal to the previous one is skipped so the same fd is not closed twice
+    const bool close_meta = (meta_fd >= 0 && meta_fd != data_fd);
+    const bool close_journal = (journal.fd >= 0 && journal.fd != meta_fd);
+    if (data_fd >= 0)
+    {
+        close(data_fd);
+    }
+    if (close_meta)
+    {
+        close(meta_fd);
+    }
+    if (close_journal)
+    {
+        close(journal.fd);
+    }
+    // free(NULL) is a no-op, so unset buffers need no explicit check
+    free(metadata_buffer);
+    free(clean_bitmap);
+}
+
+// True once the init state machine driven by loop() has reached its final state (10)
+bool blockstore_impl_t::is_started()
+{
+    const bool init_done = (initialized == 10);
+    return init_done;
+}
+
+// Reports the stall flag computed at the end of the most recent loop() pass
+bool blockstore_impl_t::is_stalled()
+{
+    const bool stalled = queue_stall;
+    return stalled;
+}
+
+// main event loop - produce requests
+// Registered as the ring consumer callback in the constructor.
+// Until initialization is complete it drives the metadata reader and then the journal reader.
+// Afterwards each call polls in-progress syncs, walks submit_queue trying to submit every
+// operation, runs the flusher (unless readonly) and finally submits the ring.
+// Throws std::runtime_error if ringloop->submit() returns a negative value.
+void blockstore_impl_t::loop()
+{
+    // FIXME: initialized == 10 is ugly
+    if (initialized != 10)
+    {
+        // read metadata, then journal
+        // States: 0 = nothing started, 1 = reading metadata, 2 = reading journal, 10 = ready.
+        // The checks are sequential (not else-if), so one call may advance through several states
+        if (initialized == 0)
+        {
+            metadata_init_reader = new blockstore_init_meta(this);
+            initialized = 1;
+        }
+        if (initialized == 1)
+        {
+            // A zero result means the reader has finished
+            int res = metadata_init_reader->loop();
+            if (!res)
+            {
+                delete metadata_init_reader;
+                metadata_init_reader = NULL;
+                journal_init_reader = new blockstore_init_journal(this);
+                initialized = 2;
+            }
+        }
+        if (initialized == 2)
+        {
+            int res = journal_init_reader->loop();
+            if (!res)
+            {
+                delete journal_init_reader;
+                journal_init_reader = NULL;
+                initialized = 10;
+                // Ask the ring loop for another iteration so queued operations get processed
+                ringloop->wakeup();
+            }
+        }
+    }
+    else
+    {
+        // try to submit ops
+        unsigned initial_ring_space = ringloop->space_left();
+        // FIXME: rework this "sync polling"
+        auto cur_sync = in_progress_syncs.begin();
+        while (cur_sync != in_progress_syncs.end())
+        {
+            // A result of 2 is treated as "the list was modified", so iteration restarts from the beginning
+            if (continue_sync(*cur_sync) != 2)
+            {
+                // List is unmodified
+                cur_sync++;
+            }
+            else
+            {
+                cur_sync = in_progress_syncs.begin();
+            }
+        }
+        auto cur = submit_queue.begin();
+        // 0 = no write seen in this pass, 1 = every write seen so far was submitted,
+        // 2 = some write/delete is still waiting or failed to submit, or a LIST is blocked
+        int has_writes = 0;
+        while (cur != submit_queue.end())
+        {
+            // Remember the current position and advance first, so that erasing op_ptr below keeps `cur` valid
+            auto op_ptr = cur;
+            auto op = *(cur++);
+            // FIXME: This needs some simplification
+            // Writes should not block reads if the ring is not full and reads don't depend on them
+            // In all other cases we should stop submission
+            if (PRIV(op)->wait_for)
+            {
+                // Re-evaluate the wait condition; check_wait() clears wait_for when it no longer holds
+                check_wait(op);
+                if (PRIV(op)->wait_for == WAIT_SQE)
+                {
+                    break;
+                }
+                else if (PRIV(op)->wait_for)
+                {
+                    if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
+                    {
+                        has_writes = 2;
+                    }
+                    continue;
+                }
+            }
+            // Remember the ring state so a failed dequeue attempt can be undone via restore() below
+            unsigned ring_space = ringloop->space_left();
+            unsigned prev_sqe_pos = ringloop->save();
+            bool dequeue_op = false;
+            if (op->opcode == BS_OP_READ)
+            {
+                dequeue_op = dequeue_read(op);
+            }
+            else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
+            {
+                if (has_writes == 2)
+                {
+                    // Some writes could not be submitted
+                    break;
+                }
+                dequeue_op = dequeue_write(op);
+                has_writes = dequeue_op ? 1 : 2;
+            }
+            else if (op->opcode == BS_OP_DELETE)
+            {
+                if (has_writes == 2)
+                {
+                    // Some writes could not be submitted
+                    break;
+                }
+                dequeue_op = dequeue_del(op);
+                has_writes = dequeue_op ? 1 : 2;
+            }
+            else if (op->opcode == BS_OP_SYNC)
+            {
+                // wait for all small writes to be submitted
+                // wait for all big writes to complete, submit data device fsync
+                // wait for the data device fsync to complete, then submit journal writes for big writes
+                // then submit an fsync operation
+                if (has_writes)
+                {
+                    // Can't submit SYNC before previous writes
+                    continue;
+                }
+                dequeue_op = dequeue_sync(op);
+            }
+            else if (op->opcode == BS_OP_STABLE)
+            {
+                if (has_writes == 2)
+                {
+                    // Don't submit additional flushes before completing previous LISTs
+                    break;
+                }
+                dequeue_op = dequeue_stable(op);
+            }
+            else if (op->opcode == BS_OP_ROLLBACK)
+            {
+                if (has_writes == 2)
+                {
+                    // Don't submit additional flushes before completing previous LISTs
+                    break;
+                }
+                dequeue_op = dequeue_rollback(op);
+            }
+            else if (op->opcode == BS_OP_LIST)
+            {
+                // Block LIST operation by previous modifications,
+                // so it always returns a consistent state snapshot
+                if (has_writes == 2 || inflight_writes > 0)
+                    has_writes = 2;
+                else
+                {
+                    process_list(op);
+                    dequeue_op = true;
+                }
+            }
+            if (dequeue_op)
+            {
+                submit_queue.erase(op_ptr);
+            }
+            else
+            {
+                ringloop->restore(prev_sqe_pos);
+                if (PRIV(op)->wait_for == WAIT_SQE)
+                {
+                    // Require at least one more free SQE than was available for this attempt
+                    PRIV(op)->wait_detail = 1 + ring_space;
+                    // ring is full, stop submission
+                    break;
+                }
+            }
+        }
+        if (!readonly)
+        {
+            flusher->loop();
+        }
+        int ret = ringloop->submit();
+        if (ret < 0)
+        {
+            throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
+        }
+        // Ring space consumed during this pass means something was actually submitted
+        if ((initial_ring_space - ringloop->space_left()) > 0)
+        {
+            live = true;
+        }
+        // Stalled = nothing was submitted in this pass and the ring loop reports no work
+        queue_stall = !live && !ringloop->has_work();
+        live = false;
+    }
+}
+
+// Tells the caller whether the blockstore may be shut down right now.
+// Side effect: when unsynced writes remain, one final SYNC operation is queued
+// (at most once, and never in readonly mode) and false is returned until it's done.
+bool blockstore_impl_t::is_safe_to_stop()
+{
+    // Queued operations or syncs in progress mean we are still busy
+    if (!submit_queue.empty() || !in_progress_syncs.empty())
+    {
+        return false;
+    }
+    // The flusher is only consulted when the store is writable
+    if (!readonly && flusher->is_active())
+    {
+        return false;
+    }
+    if (unsynced_big_writes.empty() && unsynced_small_writes.empty())
+    {
+        return true;
+    }
+    if (!readonly && !stop_sync_submitted)
+    {
+        // We should sync the blockstore before unmounting
+        blockstore_op_t *sync_op = new blockstore_op_t;
+        sync_op->opcode = BS_OP_SYNC;
+        sync_op->buf = NULL;
+        // The operation owns itself: it is freed from its own completion callback
+        sync_op->callback = [](blockstore_op_t *finished)
+        {
+            delete finished;
+        };
+        enqueue_op(sync_op);
+        stop_sync_submitted = true;
+    }
+    return false;
+}
+
+// Re-check the condition <op> is suspended on. If the condition still holds, the operation
+// is left untouched; otherwise its wait_for is reset to 0 so it can be submitted again.
+// Throws std::runtime_error if wait_for holds a value not handled here.
+void blockstore_impl_t::check_wait(blockstore_op_t *op)
+{
+    blockstore_op_private_t *priv = PRIV(op);
+    const int reason = priv->wait_for;
+    if (reason == WAIT_SQE)
+    {
+        // wait_detail is the number of free SQEs required
+        if (ringloop->space_left() < priv->wait_detail)
+        {
+            // stop submission if there's still no free space
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting for %lu SQE(s)\n", priv->wait_detail);
+#endif
+            return;
+        }
+    }
+    else if (reason == WAIT_JOURNAL)
+    {
+        // wait_detail is the journal start offset that has to move forward
+        if (journal.used_start == priv->wait_detail)
+        {
+            // do not submit
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting to flush journal offset %08lx\n", priv->wait_detail);
+#endif
+            return;
+        }
+        flusher->release_trim();
+    }
+    else if (reason == WAIT_JOURNAL_BUFFER)
+    {
+        // The sector following the current one must be neither in use nor dirty
+        const int next = ((journal.cur_sector + 1) % journal.sector_count);
+        const bool next_busy = journal.sector_info[next].usage_count > 0 ||
+            journal.sector_info[next].dirty;
+        if (next_busy)
+        {
+            // do not submit
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting for a journal buffer\n");
+#endif
+            return;
+        }
+    }
+    else if (reason == WAIT_FREE)
+    {
+        // Keep waiting only while there are no free blocks AND the flusher is idle
+        if (!data_alloc->get_free_count() && !flusher->is_active())
+        {
+#ifdef BLOCKSTORE_DEBUG
+            printf("Still waiting for free space on the data device\n");
+#endif
+            return;
+        }
+    }
+    else
+    {
+        throw std::runtime_error("BUG: op->wait_for value is unexpected");
+    }
+    // The condition is gone - let the operation proceed
+    priv->wait_for = 0;
+}
+
+// Validate an operation and put it into submit_queue.
+//   op    - the operation; its callback is invoked with op->retval = -EINVAL if validation fails
+//   first - when true, the operation is pushed to the front of the queue instead of the back
+//           (not allowed for BS_OP_WRITE / BS_OP_WRITE_STABLE)
+// Some operations complete right here without being queued: invalid ones, writes/deletes
+// rejected by enqueue_write(), and SYNC when immediate_commit == IMMEDIATE_ALL.
+// BS_OP_SYNC_STAB_ALL is rewritten into a SYNC followed by a STABLE of all unstable_writes.
+void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
+{
+    // Reject: unknown opcodes; reads/writes outside of the block or with a length
+    // not aligned to disk_alignment; anything but READ/LIST in readonly mode; priority writes
+    if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
+        ((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && (
+            op->offset >= block_size ||
+            op->len > block_size-op->offset ||
+            (op->len % disk_alignment)
+        )) ||
+        readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST ||
+        first && (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE))
+    {
+        // Basic verification not passed
+        op->retval = -EINVAL;
+        // A copy of the callback is invoked, so the callback may safely reassign op->callback
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        return;
+    }
+    if (op->opcode == BS_OP_SYNC_STAB_ALL)
+    {
+        // The user's callback is moved to the heap so that both chained lambdas below can share it;
+        // it is deleted right after it has been called
+        std::function<void(blockstore_op_t*)> *old_callback = new std::function<void(blockstore_op_t*)>(op->callback);
+        op->opcode = BS_OP_SYNC;
+        op->callback = [this, old_callback](blockstore_op_t *op)
+        {
+            if (op->retval >= 0 && unstable_writes.size() > 0)
+            {
+                // Sync succeeded: reuse the same op to stabilize everything that is currently unstable
+                op->opcode = BS_OP_STABLE;
+                op->len = unstable_writes.size();
+                obj_ver_id *vers = new obj_ver_id[op->len];
+                op->buf = vers;
+                int i = 0;
+                for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++, i++)
+                {
+                    vers[i] = {
+                        .oid = it->first,
+                        .version = it->second,
+                    };
+                }
+                unstable_writes.clear();
+                op->callback = [this, old_callback](blockstore_op_t *op)
+                {
+                    // Free the temporary version list before handing the op back to the user
+                    obj_ver_id *vers = (obj_ver_id*)op->buf;
+                    delete[] vers;
+                    op->buf = NULL;
+                    (*old_callback)(op);
+                    delete old_callback;
+                };
+                this->enqueue_op(op);
+            }
+            else
+            {
+                // Sync failed or there is nothing to stabilize: finish with the sync result
+                (*old_callback)(op);
+                delete old_callback;
+            }
+        };
+    }
+    if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
+    {
+        // enqueue_write() refused the operation; it is completed immediately
+        // NOTE(review): presumably enqueue_write() has set op->retval in this case - confirm
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        return;
+    }
+    if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
+    {
+        // With immediate_commit == IMMEDIATE_ALL a SYNC has nothing to do and succeeds at once
+        op->retval = 0;
+        std::function<void (blockstore_op_t*)>(op->callback)(op);
+        return;
+    }
+    // Call constructor without allocating memory. We'll call destructor before returning op back
+    new ((void*)op->private_data) blockstore_op_private_t;
+    PRIV(op)->wait_for = 0;
+    PRIV(op)->op_state = 0;
+    PRIV(op)->pending_ops = 0;
+    if (!first)
+    {
+        submit_queue.push_back(op);
+    }
+    else
+    {
+        submit_queue.push_front(op);
+    }
+    // Make the ring loop run so the new operation gets picked up by loop()
+    ringloop->wakeup();
+}
+
+// Binary search for <oid> in the half-open range list[search_start, search_end),
+// which must be sorted by object id. If an entry with that id is found,
+// its version is overwritten with <version> and true is returned; otherwise false.
+static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
+{
+    int lo = search_start, hi = search_end;
+    while (lo < hi)
+    {
+        // Midpoint computed without risking overflow of lo+hi
+        const int mid = lo + (hi - lo) / 2;
+        if (oid < list[mid].oid)
+        {
+            hi = mid;
+            continue;
+        }
+        if (list[mid].oid < oid)
+        {
+            lo = mid + 1;
+            continue;
+        }
+        // Neither less nor greater: this is the entry we are looking for
+        list[mid].version = version;
+        return true;
+    }
+    return false;
+}
+
+// Resize a malloc()'ed obj_ver_id array to <new_alloc> entries.
+// On failure returns false and leaves *list and *alloc untouched, so the caller still owns the old array.
+static bool grow_ver_list(obj_ver_id **list, int *alloc, int new_alloc)
+{
+    obj_ver_id *bigger = (obj_ver_id*)realloc(*list, sizeof(obj_ver_id) * new_alloc);
+    if (!bigger)
+    {
+        return false;
+    }
+    *list = bigger;
+    *alloc = new_alloc;
+    return true;
+}
+
+// Execute a LIST operation: build one malloc()'ed array of object versions, stable ones first,
+// followed by unstable ones.
+// Input (taken from <op>):
+//   op->offset     - PG number to list (list_pg)
+//   op->len        - total PG count; 0 disables PG filtering
+//   op->oid.stripe - PG stripe size
+//   op->oid.inode  - minimum inode, op->version - maximum inode; (0, 0) disables inode filtering
+// Output (stored into <op> before it is finished):
+//   op->buf     - the resulting array
+//   op->version - number of stable entries
+//   op->retval  - total number of entries, -EINVAL for bad PG parameters, -ENOMEM if allocation fails
+void blockstore_impl_t::process_list(blockstore_op_t *op)
+{
+    uint32_t list_pg = op->offset;
+    uint32_t pg_count = op->len;
+    uint64_t pg_stripe_size = op->oid.stripe;
+    uint64_t min_inode = op->oid.inode;
+    uint64_t max_inode = op->version;
+    // Check PG
+    if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
+    {
+        op->retval = -EINVAL;
+        FINISH_OP(op);
+        return;
+    }
+    obj_ver_id *stable = NULL, *unstable = NULL;
+    // Single out-of-memory exit: releases both lists, so no path can leak either of them
+    auto fail_nomem = [&]()
+    {
+        free(stable);
+        free(unstable);
+        op->retval = -ENOMEM;
+        FINISH_OP(op);
+    };
+    // Copy clean_db entries (sorted)
+    int stable_count = 0, stable_alloc = clean_db.size() / (pg_count ? pg_count : 1);
+    if (stable_alloc < 1)
+    {
+        // malloc(0) is allowed to return NULL, which must not be mistaken for an out-of-memory condition
+        stable_alloc = 1;
+    }
+    stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
+    if (!stable)
+    {
+        fail_nomem();
+        return;
+    }
+    {
+        auto clean_it = clean_db.begin(), clean_end = clean_db.end();
+        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
+        {
+            clean_it = clean_db.lower_bound({
+                .inode = min_inode,
+                .stripe = 0,
+            });
+            clean_end = clean_db.upper_bound({
+                .inode = max_inode,
+                .stripe = UINT64_MAX,
+            });
+        }
+        for (; clean_it != clean_end; clean_it++)
+        {
+            if (!pg_count || ((clean_it->first.inode + clean_it->first.stripe / pg_stripe_size) % pg_count) == list_pg)
+            {
+                if (stable_count >= stable_alloc && !grow_ver_list(&stable, &stable_alloc, stable_alloc + 32768))
+                {
+                    fail_nomem();
+                    return;
+                }
+                stable[stable_count++] = {
+                    .oid = clean_it->first,
+                    .version = clean_it->second.version,
+                };
+            }
+        }
+    }
+    // Entries [0, clean_stable_count) come from clean_db, entries after them come from dirty_db
+    int clean_stable_count = stable_count;
+    // Copy dirty_db entries (sorted, too)
+    int unstable_count = 0, unstable_alloc = 0;
+    {
+        auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
+        if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
+        {
+            dirty_it = dirty_db.lower_bound({
+                .oid = {
+                    .inode = min_inode,
+                    .stripe = 0,
+                },
+                .version = 0,
+            });
+            dirty_end = dirty_db.upper_bound({
+                .oid = {
+                    .inode = max_inode,
+                    .stripe = UINT64_MAX,
+                },
+                .version = UINT64_MAX,
+            });
+        }
+        for (; dirty_it != dirty_end; dirty_it++)
+        {
+            if (!pg_count || ((dirty_it->first.oid.inode + dirty_it->first.oid.stripe / pg_stripe_size) % pg_count) == list_pg)
+            {
+                if (IS_DELETE(dirty_it->second.state))
+                {
+                    // Deletions are always stable, so try to zero out two possible entries
+                    if (!replace_stable(dirty_it->first.oid, 0, 0, clean_stable_count, stable))
+                    {
+                        replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
+                    }
+                }
+                else if (IS_STABLE(dirty_it->second.state))
+                {
+                    // First try to replace a clean stable version in the first part of the list
+                    if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
+                    {
+                        // Then try to replace the last dirty stable version in the second part of the list.
+                        // The second part may be empty (or the whole list may be), in which case
+                        // there is no "last dirty entry" to look at
+                        if (stable_count > clean_stable_count &&
+                            stable[stable_count-1].oid == dirty_it->first.oid)
+                        {
+                            stable[stable_count-1].version = dirty_it->first.version;
+                        }
+                        else
+                        {
+                            if (stable_count >= stable_alloc && !grow_ver_list(&stable, &stable_alloc, stable_alloc + 32768))
+                            {
+                                fail_nomem();
+                                return;
+                            }
+                            stable[stable_count++] = dirty_it->first;
+                        }
+                    }
+                }
+                else
+                {
+                    if (unstable_count >= unstable_alloc && !grow_ver_list(&unstable, &unstable_alloc, unstable_alloc + 32768))
+                    {
+                        fail_nomem();
+                        return;
+                    }
+                    unstable[unstable_count++] = dirty_it->first;
+                }
+            }
+        }
+    }
+    // Remove zeroed out stable entries
+    int j = 0;
+    for (int i = 0; i < stable_count; i++)
+    {
+        if (stable[i].version != 0)
+        {
+            stable[j++] = stable[i];
+        }
+    }
+    stable_count = j;
+    if (stable_count+unstable_count > stable_alloc &&
+        !grow_ver_list(&stable, &stable_alloc, stable_count+unstable_count))
+    {
+        fail_nomem();
+        return;
+    }
+    // Copy unstable entries
+    for (int i = 0; i < unstable_count; i++)
+    {
+        stable[j++] = unstable[i];
+    }
+    free(unstable);
+    op->version = stable_count;
+    op->retval = stable_count+unstable_count;
+    op->buf = stable;
+    FINISH_OP(op);
+}
--- a/src/blockstore/blockstore_impl.h
+++ b/src/blockstore/blockstore_impl.h
@ -1,16 +1,14 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

 #include "blockstore.h"
-#include "blockstore_disk.h"

 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <fcntl.h>
-#include <time.h>
 #include <unistd.h>
 #include <linux/fs.h>

@ -18,7 +16,6 @@
 #include <list>
 #include <deque>
 #include <new>
-#include <unordered_map>

 #include "cpp-btree/btree_map.h"

@ -33,13 +30,12 @@
 #define BS_ST_BIG_WRITE 0x02
 #define BS_ST_DELETE 0x03

-#define BS_ST_WAIT_DEL 0x10
-#define BS_ST_WAIT_BIG 0x20
-#define BS_ST_IN_FLIGHT 0x30
-#define BS_ST_SUBMITTED 0x40
-#define BS_ST_WRITTEN 0x50
-#define BS_ST_SYNCED 0x60
-#define BS_ST_STABLE 0x70
+#define BS_ST_WAIT_BIG 0x10
+#define BS_ST_IN_FLIGHT 0x20
+#define BS_ST_SUBMITTED 0x30
+#define BS_ST_WRITTEN 0x40
+#define BS_ST_SYNCED 0x50
+#define BS_ST_STABLE 0x60

 #define BS_ST_INSTANT 0x100

@ -55,16 +51,6 @@
 #define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE)
 #define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
 #define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
-#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT))
-
-#define BS_SUBMIT_CHECK_SQES(n) \
-    if (ringloop->sqes_left() < (n))\
-    {\
-        /* Pause until there are more requests available */\
-        PRIV(op)->wait_detail = (n);\
-        PRIV(op)->wait_for = WAIT_SQE;\
-        return 0;\
-    }

 #define BS_SUBMIT_GET_SQE(sqe, data) \
    BS_SUBMIT_GET_ONLY_SQE(sqe); \
@ -75,7 +61,6 @@
    if (!sqe)\
    {\
        /* Pause until there are more requests available */\
-        PRIV(op)->wait_detail = 1;\
        PRIV(op)->wait_for = WAIT_SQE;\
        return 0;\
    }
@ -85,52 +70,19 @@
    if (!sqe)\
    {\
        /* Pause until there are more requests available */\
-        PRIV(op)->wait_detail = 1;\
        PRIV(op)->wait_for = WAIT_SQE;\
        return 0;\
    }

 #include "blockstore_journal.h"

-// "VITAstor"
-#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
-#define BLOCKSTORE_META_FORMAT_V1 1
-#define BLOCKSTORE_META_FORMAT_V2 2
-
-// metadata header (superblock)
-struct __attribute__((__packed__)) blockstore_meta_header_v1_t
-{
-    uint64_t zero;
-    uint64_t magic;
-    uint64_t version;
-    uint32_t meta_block_size;
-    uint32_t data_block_size;
-    uint32_t bitmap_granularity;
-};
-
-struct __attribute__((__packed__)) blockstore_meta_header_v2_t
-{
-    uint64_t zero;
-    uint64_t magic;
-    uint64_t version;
-    uint32_t meta_block_size;
-    uint32_t data_block_size;
-    uint32_t bitmap_granularity;
-    uint32_t data_csum_type;
-    uint32_t csum_block_size;
-    uint32_t header_csum;
-};
-
-// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
-// per "clean" entry on disk with fixed metadata tables
+// 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables
+// FIXME: maybe add crc32's to metadata
 struct __attribute__((__packed__)) clean_disk_entry
 {
    object_id oid;
    uint64_t version;
    uint8_t bitmap[];
-    // Two more fields come after bitmap in metadata version 2:
-    // uint32_t data_csum[];
-    // uint32_t entry_csum;
 };

 // 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
@ -140,7 +92,7 @@ struct __attribute__((__packed__)) clean_entry
    uint64_t location;
 };

-// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
+// 56 = 24 + 32 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
 struct __attribute__((__packed__)) dirty_entry
 {
    uint32_t state;
@ -149,7 +101,6 @@ struct __attribute__((__packed__)) dirty_entry
    uint32_t offset;   // data offset within object (stripe)
    uint32_t len;      // data length
    uint64_t journal_sector; // journal sector used for this entry
-    void* dyn_data;    // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
 };

 // - Sync must be submitted after previous writes/deletes (not before!)
@ -178,11 +129,36 @@ struct __attribute__((__packed__)) dirty_entry
 // Suspend operation until there is some free space on the data device
 #define WAIT_FREE 5

-struct used_clean_obj_t
+struct fulfill_read_t
 {
-    int refs;
-    bool was_freed; // was freed by a parallel flush?
-    bool was_changed; // was changed by a parallel flush?
+    uint64_t offset, len;
+};
+
+#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
+#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)
+
+// Per-operation private state of the blockstore implementation.
+// It is constructed with placement new inside op->private_data by enqueue_op()
+// and destroyed by FINISH_OP() right before the operation's callback is invoked.
+struct blockstore_op_private_t
+{
+    // Wait status
+    // One of the WAIT_* reasons, or 0 when the operation is not suspended
+    int wait_for;
+    // Argument of the wait condition; its meaning depends on wait_for
+    // (a number of SQEs for WAIT_SQE, a journal offset for WAIT_JOURNAL)
+    uint64_t wait_detail;
+    // Both are reset to 0 when the operation is enqueued
+    int pending_ops;
+    int op_state;
+
+    // Read
+    std::vector<fulfill_read_t> read_vec;
+
+    // Sync, write
+    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
+
+    // Write
+    struct iovec iov_zerofill[3];
+
+    // Sync
+    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
+    int sync_small_checked, sync_big_checked;
+    // NOTE(review): presumably the position of this operation in in_progress_syncs - confirm
+    std::list<blockstore_op_t*>::iterator in_progress_ptr;
+    int prev_sync_count;
+};

 // https://github.com/algorithm-ninja/cpp-btree
@ -195,108 +171,62 @@ typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;

 #include "blockstore_flush.h"

-#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
-#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)
-
-struct blockstore_op_private_t
-{
-    // Wait status
-    int wait_for;
-    uint64_t wait_detail, wait_detail2;
-    int pending_ops;
-    int op_state;
-
-    // Read
-    uint64_t clean_block_used;
-    std::vector<copy_buffer_t> read_vec;
-
-    // Sync, write
-    uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
-
-    // Write
-    struct iovec iov_zerofill[3];
-    // Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
-    uint64_t real_version;
-    timespec tv_begin;
-
-    // Sync
-    std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
-};
-
-typedef uint32_t pool_id_t;
-typedef uint64_t pool_pg_id_t;
-
-#define POOL_ID_BITS 16
-
-struct pool_shard_settings_t
-{
-    uint32_t pg_count;
-    uint32_t pg_stripe_size;
-};
-
-#define STAB_SPLIT_DONE 1
-#define STAB_SPLIT_WAIT 2
-#define STAB_SPLIT_SYNC 3
-#define STAB_SPLIT_TODO 4
-
 class blockstore_impl_t
 {
-    blockstore_disk_t dsk;
-
    /******* OPTIONS *******/
+    std::string data_device, meta_device, journal_device;
+    uint32_t block_size;
+    uint64_t meta_offset;
+    uint64_t data_offset;
+    uint64_t cfg_journal_size, cfg_data_size;
+    // Required write alignment and journal/metadata/data areas' location alignment
+    uint32_t disk_alignment = 4096;
+    // Journal block size - minimum_io_size of the journal device is the best choice
+    uint64_t journal_block_size = 4096;
+    // Metadata block size - minimum_io_size of the metadata device is the best choice
+    uint64_t meta_block_size = 4096;
+    // Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
+    uint64_t bitmap_granularity = 4096;
    bool readonly = false;
+    // By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
+    bool disable_flock = false;
    // It is safe to disable fsync() if drive write cache is writethrough
    bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
    // Enable if you want every operation to be executed with an "implicit fsync"
    // Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
    int immediate_commit = IMMEDIATE_NONE;
    bool inmemory_meta = false;
-    // Maximum and minimum flusher count
-    unsigned max_flusher_count, min_flusher_count;
-    unsigned journal_trim_interval;
-    // Maximum queue depth
-    unsigned max_write_iodepth = 128;
-    // Enable small (journaled) write throttling, useful for the SSD+HDD case
-    bool throttle_small_writes = false;
-    // Target data device iops, bandwidth and parallelism for throttling (100/100/1 is the default for HDD)
-    int throttle_target_iops = 100;
-    int throttle_target_mbs = 100;
-    int throttle_target_parallelism = 1;
-    // Minimum difference in microseconds between target and real execution times to throttle the response
-    int throttle_threshold_us = 50;
-    // Maximum writes between automatically added fsync operations
-    uint64_t autosync_writes = 128;
+    int flusher_count;
    /******* END OF OPTIONS *******/

    struct ring_consumer_t ring_consumer;

-    std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
-    std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
-    std::map<uint64_t, int> no_inode_stats;
-    uint8_t *clean_bitmaps = NULL;
+    blockstore_clean_db_t clean_db;
+    uint8_t *clean_bitmap = NULL;
    blockstore_dirty_db_t dirty_db;
-    std::vector<blockstore_op_t*> submit_queue;
+    std::list<blockstore_op_t*> submit_queue; // FIXME: funny thing is that vector is better here
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-    int unsynced_big_write_count = 0, unstable_unsynced = 0;
-    int unsynced_queued_ops = 0;
+    std::list<blockstore_op_t*> in_progress_syncs; // ...and probably here, too
    allocator *data_alloc = NULL;
-    uint64_t used_blocks = 0;
    uint8_t *zero_object;

+    uint32_t block_order;
+    uint64_t block_count;
+    uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
+
+    int meta_fd;
+    int data_fd;
+    uint64_t meta_size, meta_area, meta_len;
+    uint64_t data_size, data_len;
+
    void *metadata_buffer = NULL;

    struct journal_t journal;
    journal_flusher_t *flusher;
-    int big_to_flush = 0;
-    int write_iodepth = 0;
-    bool alloc_dyn_data = false;
-
-    // clean data blocks referenced by read operations
-    std::map<uint64_t, used_clean_obj_t> used_clean_objects;

    bool live = false, queue_stall = false;
    ring_loop_t *ringloop;
-    timerfd_manager_t *tfd;
+    int inflight_writes = 0;

    bool stop_sync_submitted;

@ -307,24 +237,15 @@ class blockstore_impl_t

    friend class blockstore_init_meta;
    friend class blockstore_init_journal;
-    friend struct blockstore_journal_check_t;
+    friend class blockstore_journal_check_t;
    friend class journal_flusher_t;
    friend class journal_flusher_co;

+    void parse_config(blockstore_config_t & config);
    void calc_lengths();
    void open_data();
    void open_meta();
    void open_journal();
-    uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
-
-    blockstore_clean_db_t& clean_db_shard(object_id oid);
-    void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
-    void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
-
-    // Journaling
-    void prepare_journal_sector_write(int sector, blockstore_op_t *op);
-    void handle_journal_write(ring_data_t *data, uint64_t flush_id);
-    void disk_error_abort(const char *op, int retval, int expected);

    // Asynchronous init
    int initialized;
@ -333,41 +254,17 @@ class blockstore_impl_t
    blockstore_init_journal* journal_init_reader;

    void check_wait(blockstore_op_t *op);
-    void init_op(blockstore_op_t *op);

    // Read
    int dequeue_read(blockstore_op_t *read_op);
-    void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
-        std::function<int(int, bool, uint32_t, uint32_t)> callback);
-    int fulfill_read(blockstore_op_t *read_op,
-        uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
-        uint32_t item_state, uint64_t item_version, uint64_t item_location,
-        uint64_t journal_sector, uint8_t *csum, int *dyn_data);
-    bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
-        uint8_t *clean_entry_bitmap, int *dyn_data,
-        uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
-    int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
-        uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
-    int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
-        uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
-        uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
-    bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
-        uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
-    bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
-    uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
-    bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
-        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
-    bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
-        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
-    bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
-        iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
+    int fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
+        uint32_t item_state, uint64_t item_version, uint64_t item_location);
    int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
        uint32_t item_state, uint64_t item_version);
    void handle_read_event(ring_data_t *data, blockstore_op_t *op);

    // Write
    bool enqueue_write(blockstore_op_t *op);
-    void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
    int dequeue_write(blockstore_op_t *op);
    int dequeue_del(blockstore_op_t *op);
    int continue_write(blockstore_op_t *op);
@ -375,34 +272,34 @@ class blockstore_impl_t
    void handle_write_event(ring_data_t *data, blockstore_op_t *op);

    // Sync
+    int dequeue_sync(blockstore_op_t *op);
+    void handle_sync_event(ring_data_t *data, blockstore_op_t *op);
    int continue_sync(blockstore_op_t *op);
-    void ack_sync(blockstore_op_t *op);
+    void ack_one_sync(blockstore_op_t *op);
+    int ack_sync(blockstore_op_t *op);

    // Stabilize
    int dequeue_stable(blockstore_op_t *op);
    int continue_stable(blockstore_op_t *op);
-    void mark_stable(obj_ver_id ov, bool forget_dirty = false);
+    void mark_stable(const obj_ver_id & ov);
+    void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
    void stabilize_object(object_id oid, uint64_t max_ver);
-    blockstore_op_t* selective_sync(blockstore_op_t *op);
-    int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);

    // Rollback
    int dequeue_rollback(blockstore_op_t *op);
    int continue_rollback(blockstore_op_t *op);
    void mark_rolled_back(const obj_ver_id & ov);
+    void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
    void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
-    void free_dirty_dyn_data(dirty_entry & e);

    // List
    void process_list(blockstore_op_t *op);

 public:

-    blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
+    blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop);
    ~blockstore_impl_t();

-    void parse_config(blockstore_config_t & config, bool init);
-
    // Event loop
    void loop();

@ -419,26 +316,13 @@ public:
    bool is_stalled();

    // Submission
-    void enqueue_op(blockstore_op_t *op);
-
-    // Simplified synchronous operation: get object bitmap & current version
-    int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
+    void enqueue_op(blockstore_op_t *op, bool first = false);

    // Unstable writes are added here (map of object_id -> version)
    std::unordered_map<object_id, uint64_t> unstable_writes;

-    // Space usage statistics
-    std::map<uint64_t, uint64_t> inode_space_stats;
-
-    // Set per-pool no_inode_stats
-    void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
-
-    // Print diagnostics to stdout
-    void dump_diagnostics();
-
-    inline uint32_t get_block_size() { return dsk.data_block_size; }
-    inline uint64_t get_block_count() { return dsk.block_count; }
-    inline uint64_t get_free_block_count() { return dsk.block_count - used_blocks; }
-    inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
-    inline uint64_t get_journal_size() { return dsk.journal_len; }
+    // Returns the blockstore's block_size member (presumably the data block size in bytes - confirm)
+    inline uint32_t get_block_size() { return block_size; }
+    // Returns the block_count member (the same value init code prints as the total in "free blocks: X / Y")
+    inline uint64_t get_block_count() { return block_count; }
+    // Returns the free count reported by the data allocator; dereferences data_alloc,
+    // so it must only be called once data_alloc has been set (it is initialised to NULL)
+    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
+    // Returns the disk_alignment member
+    inline uint32_t get_disk_alignment() { return disk_alignment; }
 };
--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@ -0,0 +1,656 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Binds the metadata loader to the blockstore it will fill; no I/O is started here.
+blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs):
+    bs(bs)
+{
+}
+
+// Completion callback for a metadata read submitted by loop().
+// Throws std::runtime_error when the read failed (negative result) or returned no data.
+// Otherwise it remembers which submission just finished (prev_done), how many bytes
+// arrived (done_len) and at which metadata offset (done_pos), advances metadata_read
+// and clears <submitted> so that loop() may issue the next read.
+void blockstore_init_meta::handle_event(ring_data_t *data)
+{
+    if (data->res <= 0)
+    {
+        // A zero-length read is not an errno: strerror(0) would misleadingly print "Success"
+        throw std::runtime_error(
+            std::string("read metadata failed at offset ") + std::to_string(metadata_read) +
+            std::string(": ") + (data->res < 0 ? strerror(-data->res) : "unexpected end of file")
+        );
+    }
+    // data->res is strictly positive past the guard above, so no further check is needed
+    prev_done = submitted;
+    done_len = data->res;
+    done_pos = metadata_read;
+    metadata_read += data->res;
+    submitted = 0;
+}
+
+// Resumable metadata loader.
+// Returns 1 while a read is still in flight (the caller is expected to call loop() again
+// after the completion callback has run) and 0 once the whole metadata area was processed.
+// Throws std::runtime_error if the read buffer cannot be allocated or no SQE is available.
+int blockstore_init_meta::loop()
+{
+    // wait_state == 1 means the previous call returned while waiting for a read
+    if (wait_state == 1)
+        goto resume_1;
+    printf("Reading blockstore metadata\n");
+    // With in-memory metadata reads go straight into the blockstore's buffer; otherwise
+    // a temporary buffer of two metadata_buf_size halves is allocated and used alternately
+    if (bs->inmemory_meta)
+        metadata_buffer = bs->metadata_buffer;
+    else
+        metadata_buffer = memalign(MEM_ALIGNMENT, 2*bs->metadata_buf_size);
+    if (!metadata_buffer)
+        throw std::runtime_error("Failed to allocate metadata read buffer");
+    while (1)
+    {
+    resume_1:
+        // <submitted> is reset to 0 by handle_event() when the read completes
+        if (submitted)
+        {
+            wait_state = 1;
+            return 1;
+        }
+        if (metadata_read < bs->meta_len)
+        {
+            sqe = bs->get_sqe();
+            if (!sqe)
+            {
+                throw std::runtime_error("io_uring is full while trying to read metadata");
+            }
+            data = ((ring_data_t*)sqe->user_data);
+            // Destination: the final position for in-memory metadata, else the half that was
+            // not used by the previous submission. Length: at most metadata_buf_size bytes.
+            data->iov = {
+                metadata_buffer + (bs->inmemory_meta
+                    ? metadata_read
+                    : (prev == 1 ? bs->metadata_buf_size : 0)),
+                bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
+            };
+            data->callback = [this](ring_data_t *data) { handle_event(data); };
+            my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
+            bs->ringloop->submit();
+            // Alternate between 1 and 2 so handle_event() can tell which half was filled
+            submitted = (prev == 1 ? 2 : 1);
+            prev = submitted;
+        }
+        // Parse the previously completed chunk while the read submitted above is in flight
+        if (prev_done)
+        {
+            void *done_buf = bs->inmemory_meta
+                ? (metadata_buffer + done_pos)
+                : (metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
+            // Number of clean entries stored in one metadata block
+            unsigned count = bs->meta_block_size / bs->clean_entry_size;
+            for (int sector = 0; sector < done_len; sector += bs->meta_block_size)
+            {
+                // handle <count> entries
+                handle_entries(done_buf + sector, count, bs->block_order);
+                done_cnt += count;
+            }
+            prev_done = 0;
+            done_len = 0;
+        }
+        // Nothing was submitted in this iteration => everything has been read and parsed
+        if (!submitted)
+        {
+            break;
+        }
+    }
+    // metadata read finished
+    printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count);
+    // The temporary buffer is only owned by this object when metadata is not kept in memory
+    if (!bs->inmemory_meta)
+    {
+        free(metadata_buffer);
+        metadata_buffer = NULL;
+    }
+    return 0;
+}
+
+// Processes <count> clean metadata entries stored back to back at <entries>.
+// Entry number i corresponds to data block number (done_cnt+i): the block is marked as
+// allocated and the entry is recorded in bs->clean_db with location (done_cnt+i) << block_order.
+// When the same object was already seen with a lower version, the older block is released
+// first; entries that are not newer than the known one are ignored.
+// Entries with oid.inode == 0 are treated as empty slots.
+// When metadata is not kept in memory, each entry's bitmap is copied into bs->clean_bitmap.
+void blockstore_init_meta::handle_entries(void* entries, unsigned count, int block_order)
+{
+    for (unsigned i = 0; i < count; i++)
+    {
+        // Byte-wise pointer arithmetic (arithmetic on void* is a compiler extension)
+        clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->clean_entry_size);
+        if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
+        {
+            memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size);
+        }
+        if (entry->oid.inode > 0)
+        {
+            auto clean_it = bs->clean_db.find(entry->oid);
+            if (clean_it == bs->clean_db.end() || clean_it->second.version < entry->version)
+            {
+                if (clean_it != bs->clean_db.end())
+                {
+                    // free the previous block
+#ifdef BLOCKSTORE_DEBUG
+                    // done_cnt+i already is a block number, it must not be shifted by block_order
+                    printf("Free block %lu (new location is %lu)\n", clean_it->second.location >> block_order, done_cnt+i);
+#endif
+                    bs->data_alloc->set(clean_it->second.location >> block_order, false);
+                }
+                entries_loaded++;
+#ifdef BLOCKSTORE_DEBUG
+                printf("Allocate block (clean entry) %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
+#endif
+                bs->data_alloc->set(done_cnt+i, true);
+                bs->clean_db[entry->oid] = (struct clean_entry){
+                    .version = entry->version,
+                    .location = (done_cnt+i) << block_order,
+                };
+            }
+            else
+            {
+#ifdef BLOCKSTORE_DEBUG
+                printf("Old clean entry %lu: %lx:%lx v%lu\n", done_cnt+i, entry->oid.inode, entry->oid.stripe, entry->version);
+#endif
+            }
+        }
+    }
+}
+
+// Prepares the journal loader for <bs>: scanning starts at offset journal.block_size, and
+// simple_callback becomes the shared completion handler for the one-shot I/O requests that
+// loop() tracks through wait_count.
+blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
+{
+    this->bs = bs;
+    this->next_free = bs->journal.block_size;
+    this->simple_callback = [this](ring_data_t *data1)
+    {
+        // The request succeeded only if the result equals the requested length
+        if (data1->res == data1->iov.iov_len)
+        {
+            wait_count--;
+            return;
+        }
+        throw std::runtime_error(std::string("I/O operation failed while reading journal: ") + strerror(-data1->res));
+    };
+}
+
+// Tells whether the first <len> 64-bit words of <buf> are all zero.
+// Returns true when len <= 0, because there is nothing to inspect.
+bool iszero(uint64_t *buf, int len)
+{
+    const uint64_t *word = buf;
+    for (int remaining = len; remaining > 0; remaining--, word++)
+    {
+        if (*word != 0)
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Completion callback for the journal reads submitted from loop().
+// A non-positive result aborts initialisation with std::runtime_error. Otherwise the chunk
+// that was just read is appended to <done>, the read position advances (restarting at
+// offset journal.block_size and setting <wrapped> once the end of the journal is reached)
+// and <submitted_buf> is cleared to signal that no read is in flight.
+void blockstore_init_journal::handle_event(ring_data_t *data1)
+{
+    const auto bytes_read = data1->res;
+    if (bytes_read <= 0)
+    {
+        throw std::runtime_error(
+            std::string("read journal failed at offset ") + std::to_string(journal_pos) +
+            std::string(": ") + strerror(-bytes_read)
+        );
+    }
+    bs_init_journal_done chunk = {
+        .buf = submitted_buf,
+        .pos = journal_pos,
+        .len = (uint64_t)bytes_read,
+    };
+    done.push_back(chunk);
+    journal_pos += bytes_read;
+    if (journal_pos >= bs->journal.len)
+    {
+        // Continue from the beginning
+        wrapped = true;
+        journal_pos = bs->journal.block_size;
+    }
+    submitted_buf = NULL;
+}
+
+// Fetches a submission queue entry from the blockstore into <sqe> and the ring_data_t attached
+// to it into <data>; throws std::runtime_error when bs->get_sqe() returns NULL.
+// Relies on <bs>, <sqe> and <data> being in scope. Expands to several statements, so it
+// must not be used as the sole body of an unbraced if/else.
+#define GET_SQE() \
+    sqe = bs->get_sqe();\
+    if (!sqe)\
+        throw std::runtime_error("io_uring is full while trying to read journal");\
+    data = ((ring_data_t*)sqe->user_data)
+
+// Resumable journal loader.
+// Returns 1 whenever it has to wait for outstanding I/O; wait_state then records the label
+// at which the next call continues. Returns 0 when the journal has been fully processed.
+// Throws std::runtime_error when no SQE is available or the first journal entry is invalid.
+int blockstore_init_journal::loop()
+{
+    // Resume dispatch: jump back to the point where the previous call returned 1
+    if (wait_state == 1)
+        goto resume_1;
+    else if (wait_state == 2)
+        goto resume_2;
+    else if (wait_state == 3)
+        goto resume_3;
+    else if (wait_state == 4)
+        goto resume_4;
+    else if (wait_state == 5)
+        goto resume_5;
+    else if (wait_state == 6)
+        goto resume_6;
+    else if (wait_state == 7)
+        goto resume_7;
+    printf("Reading blockstore journal\n");
+    // Without an in-memory journal a temporary two-block buffer is used for the first block
+    if (!bs->journal.inmemory)
+        submitted_buf = memalign_or_die(MEM_ALIGNMENT, 2*bs->journal.block_size);
+    else
+        submitted_buf = bs->journal.buffer;
+    // Read first block of the journal
+    sqe = bs->get_sqe();
+    if (!sqe)
+        throw std::runtime_error("io_uring is full while trying to read journal");
+    data = ((ring_data_t*)sqe->user_data);
+    data->iov = { submitted_buf, bs->journal.block_size };
+    data->callback = simple_callback;
+    my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
+    bs->ringloop->submit();
+    wait_count = 1;
+resume_1:
+    // simple_callback decrements wait_count on completion
+    if (wait_count > 0)
+    {
+        wait_state = 1;
+        return 1;
+    }
+    // Only the first three 64-bit words of the first block are checked for emptiness
+    if (iszero((uint64_t*)submitted_buf, 3))
+    {
+        // Journal is empty
+        // FIXME handle this wrapping to journal_block_size better (maybe)
+        bs->journal.used_start = bs->journal.block_size;
+        bs->journal.next_free = bs->journal.block_size;
+        // Initialize journal "superblock" and the first block
+        memset(submitted_buf, 0, 2*bs->journal.block_size);
+        *((journal_entry_start*)submitted_buf) = {
+            .crc32 = 0,
+            .magic = JOURNAL_MAGIC,
+            .type = JE_START,
+            .size = sizeof(journal_entry_start),
+            .reserved = 0,
+            .journal_start = bs->journal.block_size,
+        };
+        // The checksum is computed after the entry is filled in with crc32 == 0
+        ((journal_entry_start*)submitted_buf)->crc32 = je_crc32((journal_entry*)submitted_buf);
+        if (bs->readonly)
+        {
+            printf("Skipping journal initialization because blockstore is readonly\n");
+        }
+        else
+        {
+            // Cool effect. Same operations result in journal replay.
+            // FIXME: Randomize initial crc32. Track crc32 when trimming.
+            printf("Resetting journal\n");
+            // Write the start entry plus one zeroed block to the beginning of the journal
+            GET_SQE();
+            data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
+            data->callback = simple_callback;
+            my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
+            wait_count++;
+            bs->ringloop->submit();
+        resume_6:
+            if (wait_count > 0)
+            {
+                wait_state = 6;
+                return 1;
+            }
+            // Flush the write unless journal fsync is disabled by configuration
+            if (!bs->disable_journal_fsync)
+            {
+                GET_SQE();
+                my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
+                // Zero iov_len makes simple_callback accept a result of 0
+                data->iov = { 0 };
+                data->callback = simple_callback;
+                wait_count++;
+                bs->ringloop->submit();
+            }
+        resume_4:
+            if (wait_count > 0)
+            {
+                wait_state = 4;
+                return 1;
+            }
+        }
+        if (!bs->journal.inmemory)
+        {
+            free(submitted_buf);
+        }
+    }
+    else
+    {
+        // First block always contains a single JE_START entry
+        je_start = (journal_entry_start*)submitted_buf;
+        if (je_start->magic != JOURNAL_MAGIC ||
+            je_start->type != JE_START ||
+            je_start->size != sizeof(journal_entry_start) ||
+            je_crc32((journal_entry*)je_start) != je_start->crc32)
+        {
+            // Entry is corrupt
+            throw std::runtime_error("first entry of the journal is corrupt");
+        }
+        // Scanning starts at the position stored in the start entry
+        next_free = journal_pos = bs->journal.used_start = je_start->journal_start;
+        if (!bs->journal.inmemory)
+            free(submitted_buf);
+        // From here on a non-NULL submitted_buf means "a journal read is in flight"
+        submitted_buf = NULL;
+        crc32_last = 0;
+        // Read journal
+        while (1)
+        {
+        resume_2:
+            // handle_event() resets submitted_buf to NULL when the read completes
+            if (submitted_buf)
+            {
+                wait_state = 2;
+                return 1;
+            }
+            // Keep reading until the position has wrapped around and reached used_start again
+            if (!wrapped || journal_pos < bs->journal.used_start)
+            {
+                GET_SQE();
+                // Read up to the end of the journal, or up to used_start after wrapping
+                uint64_t end = bs->journal.len;
+                if (journal_pos < bs->journal.used_start)
+                    end = bs->journal.used_start;
+                if (!bs->journal.inmemory)
+                    submitted_buf = memalign_or_die(MEM_ALIGNMENT, JOURNAL_BUFFER_SIZE);
+                else
+                    submitted_buf = bs->journal.buffer + journal_pos;
+                data->iov = {
+                    submitted_buf,
+                    end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
+                };
+                data->callback = [this](ring_data_t *data1) { handle_event(data1); };
+                my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos);
+                bs->ringloop->submit();
+            }
+            // Parse completed chunks (oldest first) while the read submitted above is in flight
+            while (done.size() > 0)
+            {
+                handle_res = handle_journal_part(done[0].buf, done[0].pos, done[0].len);
+                if (handle_res == 0)
+                {
+                    // journal ended
+                    // zero out corrupted entry, if required
+                    if (init_write_buf && !bs->readonly)
+                    {
+                        // init_write_buf / init_write_sector are set by handle_journal_part()
+                        GET_SQE();
+                        data->iov = { init_write_buf, bs->journal.block_size };
+                        data->callback = simple_callback;
+                        my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector);
+                        wait_count++;
+                        bs->ringloop->submit();
+                    resume_7:
+                        if (wait_count > 0)
+                        {
+                            wait_state = 7;
+                            return 1;
+                        }
+                        if (!bs->disable_journal_fsync)
+                        {
+                            GET_SQE();
+                            data->iov = { 0 };
+                            data->callback = simple_callback;
+                            my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
+                            wait_count++;
+                            bs->ringloop->submit();
+                        }
+                    resume_5:
+                        if (wait_count > 0)
+                        {
+                            wait_state = 5;
+                            return 1;
+                        }
+                    }
+                    // wait for the next read to complete, then stop
+                resume_3:
+                    if (submitted_buf)
+                    {
+                        wait_state = 3;
+                        return 1;
+                    }
+                    // free buffers
+                    if (!bs->journal.inmemory)
+                        for (auto & e: done)
+                            free(e.buf);
+                    done.clear();
+                    break;
+                }
+                else if (handle_res == 1)
+                {
+                    // OK, remove it
+                    if (!bs->journal.inmemory)
+                    {
+                        free(done[0].buf);
+                    }
+                    done.erase(done.begin());
+                }
+                else if (handle_res == 2)
+                {
+                    // Need to wait for more reads
+                    break;
+                }
+            }
+            // No read in flight at this point means there is nothing more to wait for
+            if (!submitted_buf)
+            {
+                break;
+            }
+        }
+    }
+    // Trim journal on start so we don't stall when all entries are older
+    bs->journal.trim();
+    bs->journal.dirty_start = bs->journal.next_free;
+    // Free space is computed differently depending on whether the used area wraps around
+    printf(
+        "Journal entries loaded: %lu, free journal space: %lu bytes (%08lx..%08lx is used), free blocks: %lu / %lu\n",
+        entries_loaded,
+        (bs->journal.next_free >= bs->journal.used_start
+            ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
+            : bs->journal.used_start - bs->journal.next_free),
+        bs->journal.used_start, bs->journal.next_free,
+        bs->data_alloc->get_free_count(), bs->block_count
+    );
+    // Hand the checksum of the last accepted entry over to the journal
+    bs->journal.crc32_last = crc32_last;
+    return 0;
+}
+
+// Parses journal entries contained in one completed read.
+//   buf      - data of the chunk
+//   done_pos - journal offset at which the chunk starts
+//   len      - chunk length in bytes
+// Returns:
+//   0 - the end of the journal was found (invalid entry at the start of a block, or a small
+//       write whose data checksum does not match); bs->journal.next_free is set accordingly
+//   1 - the chunk was processed completely
+//   2 - data of a small write is not fully covered by the chunks in <done> yet; continue_pos
+//       remembers the entry so that the next call resumes from it
+// Throws std::runtime_error if the computed data offset of a small write differs from the stored one.
+int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, uint64_t len)
+{
+    uint64_t proc_pos, pos;
+    // Resume at the entry that previously made this function return 2
+    if (continue_pos != 0)
+    {
+        proc_pos = (continue_pos / bs->journal.block_size) * bs->journal.block_size;
+        pos = continue_pos % bs->journal.block_size;
+        continue_pos = 0;
+        goto resume;
+    }
+    // Process journal blocks for as long as the next expected block lies inside this chunk
+    while (next_free >= done_pos && next_free < done_pos+len)
+    {
+        proc_pos = next_free;
+        pos = 0;
+        next_free += bs->journal.block_size;
+        if (next_free >= bs->journal.len)
+        {
+            next_free = bs->journal.block_size;
+        }
+    resume:
+        // Walk the entries packed into the block at proc_pos
+        while (pos < bs->journal.block_size)
+        {
+            journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
+            // An entry is accepted only with a valid magic, checksum and type, and (after the
+            // first accepted entry) a crc32_prev equal to the checksum of the preceding entry
+            if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
+                je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
+            {
+                if (pos == 0)
+                {
+                    // invalid entry in the beginning, this is definitely the end of the journal
+                    bs->journal.next_free = proc_pos;
+                    return 0;
+                }
+                else
+                {
+                    // allow partially filled sectors
+                    break;
+                }
+            }
+            if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
+            {
+#ifdef BLOCKSTORE_DEBUG
+                printf(
+                    "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u\n",
+                    je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
+                    je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
+                    je->small_write.offset, je->small_write.len
+                );
+#endif
+                // oid, version, offset, len
+                // Remember next_free so it can be restored if this entry cannot be completed now
+                uint64_t prev_free = next_free;
+                if (next_free + je->small_write.len > bs->journal.len)
+                {
+                    // data continues from the beginning of the journal
+                    next_free = bs->journal.block_size;
+                }
+                // Small write data is stored in the journal itself, at the current next_free
+                uint64_t location = next_free;
+                next_free += je->small_write.len;
+                if (next_free >= bs->journal.len)
+                {
+                    next_free = bs->journal.block_size;
+                }
+                if (location != je->small_write.data_offset)
+                {
+                    char err[1024];
+                    snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
+                    throw std::runtime_error(err);
+                }
+                uint32_t data_crc32 = 0;
+                if (location >= done_pos && location+je->small_write.len <= done_pos+len)
+                {
+                    // data is within this buffer
+                    data_crc32 = crc32c(0, buf + location - done_pos, je->small_write.len);
+                }
+                else
+                {
+                    // this case is even more interesting because we must carry data crc32 check to next buffer(s)
+                    uint64_t covered = 0;
+                    // Accumulate the checksum over every completed chunk that overlaps the data range
+                    for (int i = 0; i < done.size(); i++)
+                    {
+                        if (location+je->small_write.len > done[i].pos &&
+                            location < done[i].pos+done[i].len)
+                        {
+                            uint64_t part_end = (location+je->small_write.len < done[i].pos+done[i].len
+                                ? location+je->small_write.len : done[i].pos+done[i].len);
+                            uint64_t part_begin = (location < done[i].pos ? done[i].pos : location);
+                            covered += part_end - part_begin;
+                            data_crc32 = crc32c(data_crc32, done[i].buf + part_begin - done[i].pos, part_end - part_begin);
+                        }
+                    }
+                    // Part of the data has not been read yet: undo and ask the caller for more
+                    if (covered < je->small_write.len)
+                    {
+                        continue_pos = proc_pos+pos;
+                        next_free = prev_free;
+                        return 2;
+                    }
+                }
+                if (data_crc32 != je->small_write.crc32_data)
+                {
+                    // journal entry is corrupt, stop here
+                    // interesting thing is that we must clear the corrupt entry if we're not readonly,
+                    // because we don't write next entries in the same journal block
+                    printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
+                    memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
+                    bs->journal.next_free = prev_free;
+                    // loop() writes this block back to offset init_write_sector of the journal
+                    init_write_buf = buf + proc_pos - done_pos;
+                    init_write_sector = proc_pos;
+                    return 0;
+                }
+                // Entries not newer than the clean version of the object are skipped
+                auto clean_it = bs->clean_db.find(je->small_write.oid);
+                if (clean_it == bs->clean_db.end() ||
+                    clean_it->second.version < je->small_write.version)
+                {
+                    obj_ver_id ov = {
+                        .oid = je->small_write.oid,
+                        .version = je->small_write.version,
+                    };
+                    bs->dirty_db.emplace(ov, (dirty_entry){
+                        .state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED),
+                        .flags = 0,
+                        .location = location,
+                        .offset = je->small_write.offset,
+                        .len = je->small_write.len,
+                        .journal_sector = proc_pos,
+                    });
+                    // Count one more reference to the journal block holding this entry
+                    bs->journal.used_sectors[proc_pos]++;
+#ifdef BLOCKSTORE_DEBUG
+                    printf(
+                        "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+                        proc_pos, ov.oid.inode, ov.oid.stripe, ov.version, bs->journal.used_sectors[proc_pos]
+                    );
+#endif
+                    // Track the highest version seen for the object in unstable_writes
+                    auto & unstab = bs->unstable_writes[ov.oid];
+                    unstab = unstab < ov.version ? ov.version : unstab;
+                    if (je->type == JE_SMALL_WRITE_INSTANT)
+                    {
+                        bs->mark_stable(ov);
+                    }
+                }
+            }
+            else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
+            {
+#ifdef BLOCKSTORE_DEBUG
+                printf(
+                    "je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
+                    je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
+                    je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
+                );
+#endif
+                auto clean_it = bs->clean_db.find(je->big_write.oid);
+                if (clean_it == bs->clean_db.end() ||
+                    clean_it->second.version < je->big_write.version)
+                {
+                    // oid, version, block
+                    obj_ver_id ov = {
+                        .oid = je->big_write.oid,
+                        .version = je->big_write.version,
+                    };
+                    bs->dirty_db.emplace(ov, (dirty_entry){
+                        .state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
+                        .flags = 0,
+                        .location = je->big_write.location,
+                        .offset = je->big_write.offset,
+                        .len = je->big_write.len,
+                        .journal_sector = proc_pos,
+                    });
+#ifdef BLOCKSTORE_DEBUG
+                    printf("Allocate block %lu\n", je->big_write.location >> bs->block_order);
+#endif
+                    // The entry stores a location; the allocator is indexed by block number
+                    bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
+                    bs->journal.used_sectors[proc_pos]++;
+                    auto & unstab = bs->unstable_writes[ov.oid];
+                    unstab = unstab < ov.version ? ov.version : unstab;
+                    if (je->type == JE_BIG_WRITE_INSTANT)
+                    {
+                        bs->mark_stable(ov);
+                    }
+                }
+            }
+            else if (je->type == JE_STABLE)
+            {
+#ifdef BLOCKSTORE_DEBUG
+                printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
+#endif
+                // oid, version
+                obj_ver_id ov = {
+                    .oid = je->stable.oid,
+                    .version = je->stable.version,
+                };
+                bs->mark_stable(ov);
+            }
+            else if (je->type == JE_ROLLBACK)
+            {
+#ifdef BLOCKSTORE_DEBUG
+                printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
+#endif
+                // rollback dirty writes of <oid> up to <version>
+                obj_ver_id ov = {
+                    .oid = je->rollback.oid,
+                    .version = je->rollback.version,
+                };
+                bs->mark_rolled_back(ov);
+            }
+            else if (je->type == JE_DELETE)
+            {
+#ifdef BLOCKSTORE_DEBUG
+                printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
+#endif
+                auto clean_it = bs->clean_db.find(je->del.oid);
+                if (clean_it == bs->clean_db.end() ||
+                    clean_it->second.version < je->del.version)
+                {
+                    // oid, version
+                    obj_ver_id ov = {
+                        .oid = je->del.oid,
+                        .version = je->del.version,
+                    };
+                    bs->dirty_db.emplace(ov, (dirty_entry){
+                        .state = (BS_ST_DELETE | BS_ST_SYNCED),
+                        .flags = 0,
+                        .location = 0,
+                        .offset = 0,
+                        .len = 0,
+                        .journal_sector = proc_pos,
+                    });
+                    bs->journal.used_sectors[proc_pos]++;
+                    // Deletions are treated as immediately stable, because
+                    // "2-phase commit" (write->stabilize) isn't sufficient for them anyway
+                    bs->mark_stable(ov);
+                }
+            }
+            // The entry was accepted: advance within the block and remember its checksum
+            // for the crc32_prev check of the following entry
+            started = true;
+            pos += je->size;
+            crc32_last = je->crc32;
+            entries_loaded++;
+        }
+    }
+    bs->journal.next_free = next_free;
+    return 1;
+}
--- a/src/blockstore/blockstore_init.h
+++ b/src/blockstore/blockstore_init.h
@ -1,35 +1,21 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

-struct blockstore_init_meta_buf
-{
-    uint8_t *buf = NULL;
-    uint64_t size = 0;
-    uint64_t offset = 0;
-    int state = 0;
-};
-
 class blockstore_init_meta
 {
    blockstore_impl_t *bs;
-    int wait_state = 0;
-    bool zero_on_init = false;
+    int wait_state = 0, wait_count = 0;
    void *metadata_buffer = NULL;
-    blockstore_init_meta_buf bufs[2] = {};
-    int submitted = 0;
+    uint64_t metadata_read = 0;
+    int prev = 0, prev_done = 0, done_len = 0, submitted = 0;
+    uint64_t done_cnt = 0, done_pos = 0;
+    uint64_t entries_loaded = 0;
    struct io_uring_sqe *sqe;
    struct ring_data_t *data;
-    uint64_t md_offset = 0;
-    uint64_t next_offset = 0;
-    uint64_t last_read_offset = 0;
-    uint64_t entries_loaded = 0;
-    unsigned entries_per_block = 0;
-    int i = 0, j = 0;
-    std::vector<uint64_t> entries_to_zero;
-    bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
-    void handle_event(ring_data_t *data, int buf_num);
+    void handle_entries(void *entries, unsigned count, int block_order);
+    void handle_event(ring_data_t *data);
 public:
    blockstore_init_meta(blockstore_impl_t *bs);
    int loop();
@ -50,8 +36,6 @@ class blockstore_init_journal
    bool started = false;
    uint64_t next_free;
    std::vector<bs_init_journal_done> done;
-    std::vector<obj_ver_id> double_allocs;
-    std::vector<iovec> small_write_data;
    uint64_t journal_pos = 0;
    uint64_t continue_pos = 0;
    void *init_write_buf = NULL;
@ -64,7 +48,6 @@ class blockstore_init_journal
    std::function<void(ring_data_t*)> simple_callback;
    int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
    void handle_event(ring_data_t *data);
-    void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
 public:
    blockstore_init_journal(blockstore_impl_t* bs);
    int loop();
--- a/blockstore_journal.cpp
+++ b/blockstore_journal.cpp
@ -0,0 +1,226 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Begin a capacity check at the journal's current write position:
+// copy the position/sector cursors from <bs>->journal and reset the counters.
+blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
+{
+    auto & jrn = bs->journal;
+    this->bs = bs;
+    // Nothing is reserved yet
+    first_sector = -1;
+    sectors_required = 0;
+    // Cursors start where the journal would write next
+    next_sector = jrn.cur_sector;
+    next_in_pos = jrn.in_sector_pos;
+    next_pos = jrn.next_free;
+    // "right direction" means the write position is at or after the start of the used area
+    right_dir = jrn.next_free >= jrn.used_start;
+}
+
+// Check if we can write <entries_required> entries of <size> bytes and <data_after> data bytes after them to the journal.
+// Advances the internal cursors (next_pos, next_sector, next_in_pos, sectors_required) as if the entries were written.
+// Returns 1 when everything fits.
+// Returns 0 when the operation has to wait; PRIV(op)->wait_for is then set to
+// WAIT_JOURNAL_BUFFER (no free in-memory sector buffer) or WAIT_JOURNAL (no free journal space).
+// Throws std::runtime_error when the batch can't fit into all sector buffers at once.
+int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
+{
+    int required = entries_required;
+    while (1)
+    {
+        // An already written sector can't take more entries when same-sector overwrites are forbidden
+        int fits = bs->journal.no_same_sector_overwrites && bs->journal.sector_info[next_sector].written
+            ? 0
+            : (bs->journal.block_size - next_in_pos) / size;
+        if (fits > 0)
+        {
+            if (first_sector == -1)
+            {
+                first_sector = next_sector;
+            }
+            required -= fits;
+            next_in_pos += fits * size;
+            sectors_required++;
+        }
+        else if (bs->journal.sector_info[next_sector].dirty)
+        {
+            // sectors_required is more like "sectors to write"
+            sectors_required++;
+        }
+        if (required <= 0)
+        {
+            break;
+        }
+        // Move to the next journal block, wrapping around to the first block after the start block
+        next_pos = next_pos + bs->journal.block_size;
+        if (next_pos >= bs->journal.len)
+        {
+            next_pos = bs->journal.block_size;
+            right_dir = false;
+        }
+        next_in_pos = 0;
+        next_sector = ((next_sector + 1) % bs->journal.sector_count);
+        if (next_sector == first_sector)
+        {
+            // next_sector may wrap when all sectors are flushed and the incoming batch is too big
+            // This is an error condition, we can't wait for anything in this case
+            throw std::runtime_error(
+                "Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
+                " is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
+            );
+        }
+        if (bs->journal.sector_info[next_sector].usage_count > 0 ||
+            bs->journal.sector_info[next_sector].dirty)
+        {
+            // No memory buffer available. Wait for it.
+            int used = 0, dirty = 0;
+            for (int i = 0; i < bs->journal.sector_count; i++)
+            {
+                const bool is_dirty = bs->journal.sector_info[i].dirty;
+                if (is_dirty)
+                {
+                    dirty++;
+                }
+                // Count every buffer once, even if it is both dirty and still being written,
+                // so that <used> never exceeds the total buffer count
+                if (is_dirty || bs->journal.sector_info[i].usage_count > 0)
+                {
+                    used++;
+                }
+            }
+            // In fact, it's even more rare than "ran out of journal space", so print a warning
+            printf(
+                "Ran out of journal sector buffers: %d/%lu buffers used (%d dirty), next buffer (%lu) is %s and flushed %lu times\n",
+                used, bs->journal.sector_count, dirty, next_sector,
+                bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
+                bs->journal.sector_info[next_sector].usage_count
+            );
+            PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
+            return 0;
+        }
+    }
+    if (data_after > 0)
+    {
+        // Data goes right after the entries; if it doesn't fit before the end, it starts from the first block
+        next_pos = next_pos + data_after;
+        if (next_pos > bs->journal.len)
+        {
+            next_pos = bs->journal.block_size + data_after;
+            right_dir = false;
+        }
+    }
+    if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
+    {
+        // No space in the journal. Wait until used_start changes.
+        printf(
+            "Ran out of journal space (free space: %lu bytes)\n",
+            (bs->journal.next_free >= bs->journal.used_start
+                ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
+                : bs->journal.used_start - bs->journal.next_free)
+        );
+        PRIV(op)->wait_for = WAIT_JOURNAL;
+        bs->flusher->request_trim();
+        PRIV(op)->wait_detail = bs->journal.used_start;
+        return 0;
+    }
+    return 1;
+}
+
+// Reserve <size> bytes for a new journal entry of type <type> in the current journal sector,
+// moving on to the next sector first when the entry doesn't fit (or the sector must not be overwritten).
+// Returns the entry with magic, type, size and crc32_prev filled in; the sector is marked dirty.
+journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
+{
+    // Memory backing the current sector: either a slice of the in-memory journal copy
+    // or one of the dedicated sector buffers
+    auto sector_mem = [&journal]() -> void*
+    {
+        return journal.inmemory
+            ? journal.buffer + journal.sector_info[journal.cur_sector].offset
+            : journal.sector_buf + journal.block_size*journal.cur_sector;
+    };
+    const bool no_room = journal.block_size - journal.in_sector_pos < size;
+    const bool overwrite_forbidden = journal.no_same_sector_overwrites && journal.sector_info[journal.cur_sector].written;
+    if (no_room || overwrite_forbidden)
+    {
+        auto & left_sector = journal.sector_info[journal.cur_sector];
+        assert(!left_sector.dirty);
+        // Move to the next journal sector
+        left_sector.written = false;
+        if (left_sector.usage_count == 0)
+        {
+            // The same memory buffer is reused for the next sector
+            journal.dirty_start = journal.next_free;
+        }
+        else
+        {
+            // Also select next sector buffer in memory
+            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
+            assert(!journal.sector_info[journal.cur_sector].usage_count);
+        }
+        journal.sector_info[journal.cur_sector].offset = journal.next_free;
+        journal.in_sector_pos = 0;
+        // Advance the free pointer by one block, wrapping to the block after the start block
+        auto following = journal.next_free + journal.block_size;
+        journal.next_free = following < journal.len ? following : journal.block_size;
+        memset(sector_mem(), 0, journal.block_size);
+    }
+    journal_entry *je = (struct journal_entry*)(sector_mem() + journal.in_sector_pos);
+    journal.in_sector_pos += size;
+    je->magic = JOURNAL_MAGIC;
+    je->type = type;
+    je->size = size;
+    je->crc32_prev = journal.crc32_last;
+    journal.sector_info[journal.cur_sector].dirty = true;
+    return je;
+}
+
+// Prepare <sqe> to write the whole journal sector buffer number <cur_sector> to the journal device.
+// The sector is marked written and not dirty, its usage counter is incremented,
+// and <cb> is installed as the completion callback of the request.
+void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb)
+{
+    auto & sector = journal.sector_info[cur_sector];
+    sector.dirty = false;
+    sector.written = true;
+    sector.usage_count++;
+    // Source memory: slice of the in-memory journal or the dedicated sector buffer
+    void *sector_mem = journal.inmemory
+        ? journal.buffer + sector.offset
+        : journal.sector_buf + journal.block_size*cur_sector;
+    ring_data_t *data = ((ring_data_t*)sqe->user_data);
+    data->iov = (struct iovec){ sector_mem, journal.block_size };
+    data->callback = cb;
+    my_uring_prep_writev(sqe, journal.fd, &data->iov, 1, journal.offset + sector.offset);
+}
+
+// Release the sector buffers, the sector info array and the in-memory journal copy
+journal_t::~journal_t()
+{
+    // free(NULL) is a no-op, so the pointers don't need to be checked first
+    free(sector_buf);
+    sector_buf = NULL;
+    free(sector_info);
+    sector_info = NULL;
+    free(buffer);
+    buffer = NULL;
+}
+
+// Move used_start forward to the first journal sector that is still referenced in used_sectors.
+// Returns true when used_start was updated, false when nothing can be trimmed.
+bool journal_t::trim()
+{
+    auto first_used = used_sectors.lower_bound(used_start);
+#ifdef BLOCKSTORE_DEBUG
+    printf(
+        "Trimming journal (used_start=%08lx, next_free=%08lx, dirty_start=%08lx, new_start=%08lx, new_refcount=%ld)\n",
+        used_start, next_free, dirty_start,
+        first_used == used_sectors.end() ? 0 : first_used->first,
+        first_used == used_sectors.end() ? 0 : first_used->second
+    );
+#endif
+    if (first_used != used_sectors.end())
+    {
+        if (first_used->first <= used_start)
+        {
+            // The sector at used_start itself is still referenced - can't trim journal
+            return false;
+        }
+        // Journal is cleared up to <first_used>
+        used_start = first_used->first;
+    }
+    else if (used_sectors.empty())
+    {
+        // Journal is empty
+        used_start = next_free;
+    }
+    else
+    {
+        // Journal is cleared to its end, restart from the beginning
+        // next_free does not need updating here
+        used_start = used_sectors.begin()->first;
+    }
+#ifdef BLOCKSTORE_DEBUG
+    printf("Journal trimmed to %08lx (next_free=%08lx)\n", used_start, next_free);
+#endif
+    return true;
+}
--- a/src/blockstore/blockstore_journal.h
+++ b/src/blockstore/blockstore_journal.h
@ -1,17 +1,16 @@
 // Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
+// License: VNPL-1.0 (see README.md for details)

 #pragma once

 #include "crc32c.h"
-#include <set>

 #define MIN_JOURNAL_SIZE 4*1024*1024
 #define JOURNAL_MAGIC 0x4A33
-#define JOURNAL_VERSION_V1 1
-#define JOURNAL_VERSION_V2 2
 #define JOURNAL_BUFFER_SIZE 4*1024*1024
-#define JOURNAL_ENTRY_HEADER_SIZE 16
+
+// We reserve some extra space for future stabilize requests during writes
+#define JOURNAL_STABILIZE_RESERVATION 65536

 // Journal entries
 // Journal entries are linked to each other by their crc32 value
@ -27,7 +26,7 @@
 #define JE_BIG_WRITE_INSTANT   0x08
 #define JE_MAX         0x08

-// crc32c comes first to ease calculation
+// crc32c comes first to ease calculation and is equal to crc32()
 struct __attribute__((__packed__)) journal_entry_start
 {
    uint32_t crc32;
@ -36,13 +35,7 @@ struct __attribute__((__packed__)) journal_entry_start
    uint32_t size;
    uint32_t reserved;
    uint64_t journal_start;
-    uint64_t version;
-    uint32_t data_csum_type;
-    uint32_t csum_block_size;
 };
-#define JE_START_V0_SIZE 24
-#define JE_START_V1_SIZE 32
-#define JE_START_V2_SIZE 40

 struct __attribute__((__packed__)) journal_entry_small_write
 {
@ -58,12 +51,7 @@ struct __attribute__((__packed__)) journal_entry_small_write
    // small_write entries contain <len> bytes of data which is stored in next sectors
    // data_offset is its offset within journal
    uint64_t data_offset;
-    uint32_t crc32_data; // zero when data_csum_type != 0
-    // small_write and big_write entries are followed by the "external" bitmap
-    // its size is dynamic and included in journal entry's <size> field
-    uint8_t bitmap[];
-    // and then data checksums if data_csum_type != 0
-    // uint32_t data_crc32c[];
+    uint32_t crc32_data;
 };

 struct __attribute__((__packed__)) journal_entry_big_write
@ -78,11 +66,6 @@ struct __attribute__((__packed__)) journal_entry_big_write
    uint32_t offset;
    uint32_t len;
    uint64_t location;
-    // small_write and big_write entries are followed by the "external" bitmap
-    // its size is dynamic and included in journal entry's <size> field
-    uint8_t bitmap[];
-    // and then data checksums if data_csum_type != 0
-    // uint32_t data_crc32c[];
 };

 struct __attribute__((__packed__)) journal_entry_stable
@ -148,24 +131,16 @@ inline uint32_t je_crc32(journal_entry *je)
 struct journal_sector_info_t
 {
    uint64_t offset;
-    uint64_t flush_count;
+    uint64_t usage_count;
    bool written;
    bool dirty;
-    uint64_t submit_id;
-};
-
-struct pending_journaling_t
-{
-    int pending;
-    int sector;
-    blockstore_op_t *op;
 };

 struct journal_t
 {
    int fd;
+    uint64_t device_size;
    bool inmemory = false;
-    bool flush_journal = false;
    void *buffer = NULL;

    uint64_t block_size;
@ -185,9 +160,6 @@ struct journal_t
    bool no_same_sector_overwrites = false;
    int cur_sector = 0;
    int in_sector_pos = 0;
-    std::vector<int> submitting_sectors;
-    std::multimap<uint64_t, pending_journaling_t> flushing_ops;
-    uint64_t submit_id = 0;

    // Used sector map
    // May use ~ 80 MB per 1 GB of used journal space in the worst case
@ -195,20 +167,13 @@ struct journal_t

    ~journal_t();
    bool trim();
-    uint64_t get_trim_pos();
-    void dump_diagnostics();
-    inline bool entry_fits(int size)
-    {
-        return !(block_size - in_sector_pos < size ||
-            no_same_sector_overwrites && sector_info[cur_sector].written);
-    }
 };

 struct blockstore_journal_check_t
 {
    blockstore_impl_t *bs;
    uint64_t next_pos, next_sector, next_in_pos;
-    int sectors_to_write, first_sector;
+    int sectors_required, first_sector;
    bool right_dir; // writing to the end or the beginning of the ring buffer

    blockstore_journal_check_t(blockstore_impl_t *bs);
@ -217,5 +182,4 @@ struct blockstore_journal_check_t

 journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);

-uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
-uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
+void prepare_journal_sector_write(journal_t & journal, int sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb);
--- a/blockstore_open.cpp
+++ b/blockstore_open.cpp
@ -0,0 +1,371 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include <sys/file.h>
+#include "blockstore_impl.h"
+
+// Returns the binary logarithm of <value> when it is a power of two, and 64 otherwise.
+// 0 and 1 both yield 0.
+static uint32_t is_power_of_two(uint64_t value)
+{
+    uint32_t order = 0;
+    for (; value > 1; value >>= 1, order++)
+    {
+        // A set low bit while higher bits remain means more than one bit is set
+        if (value & 1)
+        {
+            return 64;
+        }
+    }
+    return order;
+}
+
+// Read blockstore settings from <config>, substitute defaults for unset (zero) values
+// and validate them against each other.
+// Throws std::runtime_error when a setting is invalid or inconsistent.
+void blockstore_impl_t::parse_config(blockstore_config_t & config)
+{
+    // Boolean options are switched on by "true", "1" or "yes"
+    auto is_true = [&config](const char *key)
+    {
+        const auto & value = config[key];
+        return value == "true" || value == "1" || value == "yes";
+    };
+    // Numeric options are decimal; strtoull() returns 0 when no digits can be parsed
+    auto get_uint = [&config](const char *key)
+    {
+        return strtoull(config[key].c_str(), NULL, 10);
+    };
+    // Parse
+    if (is_true("readonly"))
+    {
+        readonly = true;
+    }
+    if (is_true("disable_data_fsync"))
+    {
+        disable_data_fsync = true;
+    }
+    if (is_true("disable_meta_fsync"))
+    {
+        disable_meta_fsync = true;
+    }
+    if (is_true("disable_journal_fsync"))
+    {
+        disable_journal_fsync = true;
+    }
+    if (is_true("disable_device_lock"))
+    {
+        disable_flock = true;
+    }
+    if (config["immediate_commit"] == "all")
+    {
+        immediate_commit = IMMEDIATE_ALL;
+    }
+    else if (config["immediate_commit"] == "small")
+    {
+        immediate_commit = IMMEDIATE_SMALL;
+    }
+    metadata_buf_size = get_uint("meta_buf_size");
+    cfg_journal_size = get_uint("journal_size");
+    data_device = config["data_device"];
+    data_offset = get_uint("data_offset");
+    cfg_data_size = get_uint("data_size");
+    meta_device = config["meta_device"];
+    meta_offset = get_uint("meta_offset");
+    block_size = get_uint("block_size");
+    // In-memory metadata and journal are enabled unless explicitly set to "false"
+    inmemory_meta = config["inmemory_metadata"] != "false";
+    journal_device = config["journal_device"];
+    journal.offset = get_uint("journal_offset");
+    journal.sector_count = get_uint("journal_sector_buffer_count");
+    journal.no_same_sector_overwrites = is_true("journal_no_same_sector_overwrites");
+    journal.inmemory = config["inmemory_journal"] != "false";
+    disk_alignment = get_uint("disk_alignment");
+    journal_block_size = get_uint("journal_block_size");
+    meta_block_size = get_uint("meta_block_size");
+    bitmap_granularity = get_uint("bitmap_granularity");
+    flusher_count = get_uint("flusher_count");
+    // Validate
+    if (!block_size)
+    {
+        block_size = (1 << DEFAULT_ORDER);
+    }
+    // is_power_of_two() returns 64 for values which are not powers of two
+    if ((block_order = is_power_of_two(block_size)) >= 64 || block_size < MIN_BLOCK_SIZE || block_size >= MAX_BLOCK_SIZE)
+    {
+        throw std::runtime_error("Bad block size");
+    }
+    if (!flusher_count)
+    {
+        flusher_count = 32;
+    }
+    if (!disk_alignment)
+    {
+        disk_alignment = 4096;
+    }
+    else if (disk_alignment % MEM_ALIGNMENT)
+    {
+        throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
+    }
+    if (!journal_block_size)
+    {
+        journal_block_size = 4096;
+    }
+    else if (journal_block_size % MEM_ALIGNMENT)
+    {
+        throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
+    }
+    if (!meta_block_size)
+    {
+        meta_block_size = 4096;
+    }
+    else if (meta_block_size % MEM_ALIGNMENT)
+    {
+        throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
+    }
+    if (data_offset % disk_alignment)
+    {
+        throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
+    }
+    if (!bitmap_granularity)
+    {
+        bitmap_granularity = 4096;
+    }
+    else if (bitmap_granularity % disk_alignment)
+    {
+        throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
+    }
+    if (block_size % bitmap_granularity)
+    {
+        throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
+    }
+    // An empty device name means "shares the device of the previous area":
+    // journal shares with metadata, metadata shares with data
+    if (journal_device == meta_device || (meta_device == "" && journal_device == data_device))
+    {
+        journal_device = "";
+    }
+    if (meta_device == data_device)
+    {
+        meta_device = "";
+    }
+    if (meta_offset % meta_block_size)
+    {
+        throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
+    }
+    if (journal.offset % journal_block_size)
+    {
+        throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
+    }
+    if (journal.sector_count < 2)
+    {
+        journal.sector_count = 32;
+    }
+    if (metadata_buf_size < 65536)
+    {
+        metadata_buf_size = 4*1024*1024;
+    }
+    // Areas sharing a device also share its fsync setting
+    if (meta_device == "")
+    {
+        disable_meta_fsync = disable_data_fsync;
+    }
+    if (journal_device == "")
+    {
+        disable_journal_fsync = disable_meta_fsync;
+    }
+    if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
+    {
+        throw std::runtime_error("immediate_commit requires disable_journal_fsync");
+    }
+    if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
+    {
+        throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
+    }
+    // init some fields
+    clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
+    clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
+    journal.block_size = journal_block_size;
+    journal.next_free = journal_block_size;
+    journal.used_start = journal_block_size;
+    // no free space because sector is initially unmapped
+    journal.in_sector_pos = journal_block_size;
+}
+
+// Derive the lengths of the data, metadata and journal areas from device sizes and offsets,
+// check that they are large enough and allocate the in-memory metadata/journal buffers.
+// Throws std::runtime_error when an area is too small or an allocation fails.
+void blockstore_impl_t::calc_lengths()
+{
+    // data: from data_offset up to the end of the device or to the next area on the same device
+    data_len = data_size - data_offset;
+    if (data_fd == meta_fd && data_offset < meta_offset)
+    {
+        data_len = meta_offset - data_offset;
+    }
+    if (data_fd == journal.fd && data_offset < journal.offset)
+    {
+        auto till_journal = journal.offset - data_offset;
+        if (till_journal < data_len)
+        {
+            data_len = till_journal;
+        }
+    }
+    if (cfg_data_size != 0)
+    {
+        if (data_len < cfg_data_size)
+        {
+            throw std::runtime_error("Data area ("+std::to_string(data_len)+
+                " bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
+        }
+        data_len = cfg_data_size;
+    }
+    // meta: same rule, bounded by the data and journal areas when they follow on the same device
+    auto meta_dev_size = meta_fd == data_fd ? data_size : meta_size;
+    meta_area = meta_dev_size - meta_offset;
+    if (meta_fd == data_fd && meta_offset <= data_offset)
+    {
+        meta_area = data_offset - meta_offset;
+    }
+    if (meta_fd == journal.fd && meta_offset <= journal.offset)
+    {
+        auto till_journal = journal.offset - meta_offset;
+        if (till_journal < meta_area)
+        {
+            meta_area = till_journal;
+        }
+    }
+    // journal: same rule, bounded by the data and metadata areas
+    auto journal_dev_size = journal.fd == data_fd
+        ? data_size
+        : (journal.fd == meta_fd ? meta_size : journal.device_size);
+    journal.len = journal_dev_size - journal.offset;
+    if (journal.fd == data_fd && journal.offset <= data_offset)
+    {
+        journal.len = data_offset - journal.offset;
+    }
+    if (journal.fd == meta_fd && journal.offset <= meta_offset)
+    {
+        auto till_meta = meta_offset - journal.offset;
+        if (till_meta < journal.len)
+        {
+            journal.len = till_meta;
+        }
+    }
+    // required metadata size: one clean entry per data block, rounded up to whole metadata blocks
+    block_count = data_len / block_size;
+    auto entries_per_block = meta_block_size / clean_entry_size;
+    meta_len = ((block_count - 1 + entries_per_block) / entries_per_block) * meta_block_size;
+    if (meta_area < meta_len)
+    {
+        throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
+    }
+    if (inmemory_meta)
+    {
+        metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
+        if (!metadata_buffer)
+        {
+            throw std::runtime_error("Failed to allocate memory for the metadata");
+        }
+    }
+    else if (clean_entry_bitmap_size)
+    {
+        // Without in-memory metadata only the per-block bitmaps are kept in memory
+        clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size);
+        if (!clean_bitmap)
+        {
+            throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
+        }
+    }
+    // requested journal size
+    if (cfg_journal_size > journal.len)
+    {
+        throw std::runtime_error("Requested journal_size is too large");
+    }
+    if (cfg_journal_size > 0)
+    {
+        journal.len = cfg_journal_size;
+    }
+    if (journal.len < MIN_JOURNAL_SIZE)
+    {
+        throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
+    }
+    if (journal.inmemory)
+    {
+        journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
+        if (!journal.buffer)
+        {
+            throw std::runtime_error("Failed to allocate memory for journal");
+        }
+    }
+}
+
+// Store the size in bytes of the opened regular file or block device <fd> into <*size>.
+// <name> is a human-readable device name used in error messages.
+// Throws std::runtime_error when the size can't be determined, when a block device
+// reports a logical sector size other than 512 bytes, or when <fd> is neither
+// a regular file nor a block device.
+void check_size(int fd, uint64_t *size, std::string name)
+{
+    struct stat st;
+    if (fstat(fd, &st) < 0)
+    {
+        // Capture the reason before any other call can overwrite errno
+        std::string reason = strerror(errno);
+        throw std::runtime_error("Failed to stat "+name+": "+reason);
+    }
+    if (S_ISREG(st.st_mode))
+    {
+        *size = st.st_size;
+    }
+    else if (S_ISBLK(st.st_mode))
+    {
+        int sectsize = 0;
+        if (ioctl(fd, BLKSSZGET, &sectsize) < 0 ||
+            ioctl(fd, BLKGETSIZE64, size) < 0)
+        {
+            // A failed ioctl is a different problem than an unsupported sector size - report it as such
+            std::string reason = strerror(errno);
+            throw std::runtime_error("Failed to get sector size or device size of "+name+": "+reason);
+        }
+        if (sectsize != 512)
+        {
+            throw std::runtime_error(name+" sector is not equal to 512 bytes");
+        }
+    }
+    else
+    {
+        throw std::runtime_error(name+" is neither a file nor a block device");
+    }
+}
+
+// Open the data device with O_DIRECT, determine its size, check that data_offset
+// lies inside it and take an exclusive lock unless locking is disabled.
+// Throws std::runtime_error on any failure.
+void blockstore_impl_t::open_data()
+{
+    data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
+    if (data_fd == -1)
+    {
+        throw std::runtime_error("Failed to open data device");
+    }
+    check_size(data_fd, &data_size, "data device");
+    if (data_size <= data_offset)
+    {
+        throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
+    }
+    if (!disable_flock)
+    {
+        // Non-blocking exclusive lock: fails immediately if the device is already locked
+        if (flock(data_fd, LOCK_EX|LOCK_NB) != 0)
+        {
+            throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
+        }
+    }
+}
+
+// Open the metadata device, or reuse the data device when no separate one is configured.
+// Throws std::runtime_error when the device can't be opened or locked
+// or when meta_offset lies outside of it.
+void blockstore_impl_t::open_meta()
+{
+    if (meta_device == "")
+    {
+        // Metadata shares the data device; meta_size stays 0 in this case
+        meta_fd = data_fd;
+        meta_size = 0;
+        if (data_size <= meta_offset)
+        {
+            throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_size));
+        }
+        return;
+    }
+    // Separate metadata device
+    // NOTE(review): this discards any configured meta_offset for a separate metadata device - confirm it is intended
+    meta_offset = 0;
+    meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
+    if (meta_fd == -1)
+    {
+        throw std::runtime_error("Failed to open metadata device");
+    }
+    check_size(meta_fd, &meta_size, "metadata device");
+    if (meta_size <= meta_offset)
+    {
+        throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
+    }
+    if (!disable_flock)
+    {
+        if (flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
+        {
+            throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
+        }
+    }
+}
+
+// Open the journal device, or reuse the metadata device when no separate one is configured,
+// then allocate the journal sector info array and, for a non-in-memory journal, the sector buffers.
+// Throws std::runtime_error when the device can't be opened or locked or when journal_offset
+// lies outside of the device holding the journal; throws std::bad_alloc when an allocation fails.
+void blockstore_impl_t::open_journal()
+{
+    if (journal_device != "")
+    {
+        journal.fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
+        if (journal.fd == -1)
+        {
+            throw std::runtime_error("Failed to open journal device");
+        }
+        check_size(journal.fd, &journal.device_size, "journal device");
+        // Same sanity check as for the data and metadata devices
+        if (journal.offset >= journal.device_size)
+        {
+            throw std::runtime_error("journal_offset exceeds device size = "+std::to_string(journal.device_size));
+        }
+        if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
+        {
+            throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
+        }
+    }
+    else
+    {
+        journal.fd = meta_fd;
+        journal.device_size = 0;
+        // The journal lives on the metadata device, which may itself be the data device.
+        // meta_size is 0 when metadata shares the data device, so pick the size of the real host device.
+        uint64_t host_size = meta_fd == data_fd ? data_size : meta_size;
+        if (journal.offset >= host_size)
+        {
+            throw std::runtime_error("journal_offset exceeds device size");
+        }
+    }
+    journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
+    if (!journal.sector_info)
+    {
+        throw std::bad_alloc();
+    }
+    if (!journal.inmemory)
+    {
+        // Sector buffers are only needed when the journal is not fully kept in memory
+        journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * journal_block_size);
+        if (!journal.sector_buf)
+            throw std::bad_alloc();
+    }
+}
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@ -0,0 +1,256 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Produces <len> bytes for one fragment of a read into <buf>.
+// Depending on <item_state> the fragment is skipped, zero-filled, copied from the
+// in-memory journal, or scheduled as an io_uring read from the journal or data fd.
+// Returns 1 when the fragment was handled; BS_SUBMIT_GET_SQE may leave the function
+// early on its own when no submission queue entry can be obtained.
+int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
+    uint32_t item_state, uint64_t item_version)
+{
+    if (!len || IS_IN_FLIGHT(item_state))
+    {
+        // Zero-length version, or a write that is not finished yet - skip
+        return 1;
+    }
+    if (IS_DELETE(item_state))
+    {
+        // Unallocated item - the caller gets zeroes
+        memset(buf, 0, len);
+        return 1;
+    }
+    const bool from_journal = IS_JOURNAL(item_state);
+    if (from_journal && journal.inmemory)
+    {
+        // Journal contents are available in memory - no I/O required
+        memcpy(buf, journal.buffer + offset, len);
+        return 1;
+    }
+    BS_SUBMIT_GET_SQE(sqe, data);
+    data->iov = (struct iovec){ buf, len };
+    PRIV(op)->pending_ops++;
+    auto src_fd = from_journal ? journal.fd : data_fd;
+    auto src_base = from_journal ? journal.offset : data_offset;
+    my_uring_prep_readv(sqe, src_fd, &data->iov, 1, src_base + offset);
+    data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
+    return 1;
+}
+
+// FIXME I've seen a bug here so I want some tests
+// Fulfills the parts of <read_op> that intersect the item range [item_start, item_end)
+// and are not yet covered by an element of PRIV(read_op)->read_vec.
+// Every newly covered gap is inserted into read_vec, passed to fulfill_read_push()
+// and its length is added to <fulfilled>.
+// Returns 0 as soon as fulfill_read_push() returns 0, otherwise 1.
+int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
+    uint32_t item_state, uint64_t item_version, uint64_t item_location)
+{
+    uint32_t cur_start = item_start;
+    // Only do anything when the item range overlaps the requested range
+    if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset)
+    {
+        // Clamp the item range to the requested range
+        cur_start = cur_start < read_op->offset ? read_op->offset : cur_start;
+        item_end = item_end > read_op->offset + read_op->len ? read_op->offset + read_op->len : item_end;
+        auto it = PRIV(read_op)->read_vec.begin();
+        while (1)
+        {
+            // Advance to the first element starting at or after cur_start, moving cur_start
+            // past any element that already covers it.
+            // NOTE(review): this scan appears to assume read_vec is ordered by offset - confirm
+            for (; it != PRIV(read_op)->read_vec.end(); it++)
+            {
+                if (it->offset >= cur_start)
+                {
+                    break;
+                }
+                else if (it->offset + it->len > cur_start)
+                {
+                    cur_start = it->offset + it->len;
+                    if (cur_start >= item_end)
+                    {
+                        goto endwhile;
+                    }
+                }
+            }
+            if (it == PRIV(read_op)->read_vec.end() || it->offset > cur_start)
+            {
+                // There is a gap starting at cur_start: it ends at the next element or at item_end
+                fulfill_read_t el = {
+                    .offset = cur_start,
+                    .len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
+                };
+                // insert() returns an iterator to the new element, so <it> stays valid below
+                it = PRIV(read_op)->read_vec.insert(it, el);
+                if (!fulfill_read_push(read_op,
+                    read_op->buf + el.offset - read_op->offset,
+                    item_location + el.offset - item_start,
+                    el.len, item_state, item_version))
+                {
+                    return 0;
+                }
+                fulfilled += el.len;
+            }
+            // Continue after the element at <it> (either just inserted or already present)
+            cur_start = it->offset + it->len;
+            if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end)
+            {
+                break;
+            }
+        }
+    }
+endwhile:
+    return 1;
+}
+
+// Executes a read operation.
+// The requested range is assembled from the dirty entries of the object (walked from the
+// newest version down) and then from the clean entry, if any; ranges present in neither
+// are returned as zeroes.
+// Returns 1 when the operation was dequeued (finished from memory, or reads were queued and
+// completion is reported through handle_read_event()), and 0 when fulfill_read() asked to wait.
+int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
+{
+    auto clean_it = clean_db.find(read_op->oid);
+    // Position dirty_it on the last dirty entry whose key is <= (oid, UINT64_MAX)
+    auto dirty_it = dirty_db.upper_bound((obj_ver_id){
+        .oid = read_op->oid,
+        .version = UINT64_MAX,
+    });
+    if (dirty_it != dirty_db.begin())
+        dirty_it--;
+    bool clean_found = clean_it != clean_db.end();
+    bool dirty_found = (dirty_it != dirty_db.end() && dirty_it->first.oid == read_op->oid);
+    if (!clean_found && !dirty_found)
+    {
+        // region is not allocated - return zeroes
+        memset(read_op->buf, 0, read_op->len);
+        read_op->version = 0;
+        read_op->retval = read_op->len;
+        FINISH_OP(read_op);
+        return 1;
+    }
+    uint64_t fulfilled = 0;
+    PRIV(read_op)->pending_ops = 0;
+    uint64_t result_version = 0;
+    if (dirty_found)
+    {
+        while (dirty_it->first.oid == read_op->oid)
+        {
+            dirty_entry& dirty = dirty_it->second;
+            bool version_ok = read_op->version >= dirty_it->first.version;
+            if (IS_SYNCED(dirty.state))
+            {
+                // Synced entries are always used; a non-zero requested version is lowered to them
+                if (!version_ok && read_op->version != 0)
+                    read_op->version = dirty_it->first.version;
+                version_ok = true;
+            }
+            if (version_ok)
+            {
+                if (!result_version)
+                {
+                    // The first accepted version is the one reported back
+                    result_version = dirty_it->first.version;
+                }
+                if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
+                    dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
+                {
+                    // need to wait. undo added requests, don't dequeue op
+                    PRIV(read_op)->read_vec.clear();
+                    return 0;
+                }
+            }
+            if (fulfilled == read_op->len || dirty_it == dirty_db.begin())
+            {
+                break;
+            }
+            dirty_it--;
+        }
+    }
+    if (clean_it != clean_db.end())
+    {
+        if (!result_version)
+        {
+            result_version = clean_it->second.version;
+        }
+        if (fulfilled < read_op->len)
+        {
+            if (!clean_entry_bitmap_size)
+            {
+                // No allocation bitmap: the whole block is read from the data device
+                if (!fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
+                {
+                    // need to wait. undo added requests, don't dequeue op
+                    PRIV(read_op)->read_vec.clear();
+                    return 0;
+                }
+            }
+            else
+            {
+                uint64_t meta_loc = clean_it->second.location >> block_order;
+                uint8_t *clean_entry_bitmap;
+                if (inmemory_meta)
+                {
+                    // The bitmap follows the clean_disk_entry header inside the metadata buffer
+                    uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
+                    uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
+                    clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
+                }
+                else
+                {
+                    clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
+                }
+                uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
+                while (bmp_start < bmp_size)
+                {
+                    // Run of cleared bits. The bounds check must come first: testing the bit
+                    // at bmp_end == bmp_size would read past the end of the bitmap.
+                    while (bmp_end < bmp_size && !(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))))
+                    {
+                        bmp_end++;
+                    }
+                    if (bmp_end > bmp_start)
+                    {
+                        // fill with zeroes
+                        fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                            bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
+                    }
+                    bmp_start = bmp_end;
+                    // Run of set bits, again with the bounds check before the bitmap access
+                    while (bmp_end < bmp_size && (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))))
+                    {
+                        bmp_end++;
+                    }
+                    if (bmp_end > bmp_start)
+                    {
+                        if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
+                            bmp_end * bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
+                            clean_it->second.location + bmp_start * bitmap_granularity))
+                        {
+                            // need to wait. undo added requests, don't dequeue op
+                            PRIV(read_op)->read_vec.clear();
+                            return 0;
+                        }
+                        bmp_start = bmp_end;
+                    }
+                }
+            }
+        }
+    }
+    else if (fulfilled < read_op->len)
+    {
+        // fill remaining parts with zeroes
+        fulfill_read(read_op, fulfilled, 0, block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0);
+    }
+    assert(fulfilled == read_op->len);
+    read_op->version = result_version;
+    if (!PRIV(read_op)->pending_ops)
+    {
+        // everything is fulfilled from memory
+        if (!PRIV(read_op)->read_vec.size())
+        {
+            // region is not allocated - return zeroes
+            memset(read_op->buf, 0, read_op->len);
+        }
+        read_op->retval = read_op->len;
+        FINISH_OP(read_op);
+        return 1;
+    }
+    // Reads are in flight: retval stays 0 until handle_read_event() finishes the op
+    read_op->retval = 0;
+    return 1;
+}
+
+// Completion callback for one io_uring read queued by fulfill_read_push().
+// Records a failure in op->retval and finishes the operation once the last
+// pending read has completed.
+void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)
+{
+    live = true;
+    PRIV(op)->pending_ops--;
+    if (data->res != data->iov.iov_len)
+    {
+        // read error. A short read (res >= 0) is reported as -EIO: storing the raw
+        // result would make a 0-byte read look like "no error" in the check below
+        // and the operation would be completed as fully successful.
+        op->retval = data->res < 0 ? data->res : -EIO;
+    }
+    if (PRIV(op)->pending_ops == 0)
+    {
+        // retval is still 0 only when none of the reads failed
+        if (op->retval == 0)
+            op->retval = op->len;
+        FINISH_OP(op);
+    }
+}
--- a/blockstore_rollback.cpp
+++ b/blockstore_rollback.cpp
@ -0,0 +1,246 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Starts (or resumes) a rollback operation.
+// op->buf holds op->len obj_ver_id items; every item names an object and the version to
+// roll back to. After validation one JE_ROLLBACK journal entry is written per item.
+// Returns 1 when the op was dequeued (finished with an error, or journal writes were
+// submitted) and 0 when there is not enough journal space; the SQE macro may also
+// leave the function early.
+int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state)
+    {
+        // Already started - continue the state machine
+        return continue_rollback(op);
+    }
+    obj_ver_id* v;
+    int i, todo = op->len;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        // Check that there are some versions greater than v->version (which may be zero),
+        // check that they're unstable, synced, and not currently written to
+        auto dirty_it = dirty_db.lower_bound((obj_ver_id){
+            .oid = v->oid,
+            .version = UINT64_MAX,
+        });
+        if (dirty_it == dirty_db.begin())
+        {
+            if (v->version == 0)
+            {
+                // Already rolled back
+                // FIXME Skip this object version
+            }
+            // Note: until the FIXME above is resolved, the v->version == 0 case
+            // also falls through to the -ENOENT path below
+        bad_op:
+            op->retval = -ENOENT;
+            FINISH_OP(op);
+            return 1;
+        }
+        else
+        {
+            dirty_it--;
+            // The newest dirty entry must belong to this object and be at least v->version
+            if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
+            {
+                goto bad_op;
+            }
+            // Every version above v->version must be synced and not stable
+            while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
+            {
+                if (!IS_SYNCED(dirty_it->second.state) ||
+                    IS_STABLE(dirty_it->second.state))
+                {
+                    op->retval = -EBUSY;
+                    FINISH_OP(op);
+                    return 1;
+                }
+                if (dirty_it == dirty_db.begin())
+                {
+                    break;
+                }
+                dirty_it--;
+            }
+        }
+    }
+    // Check journal space
+    blockstore_journal_check_t space_check(this);
+    if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
+    {
+        return 0;
+    }
+    // There is sufficient space. Get SQEs
+    struct io_uring_sqe *sqe[space_check.sectors_required];
+    for (i = 0; i < space_check.sectors_required; i++)
+    {
+        BS_SUBMIT_GET_SQE_DECL(sqe[i]);
+    }
+    // Prepare and submit journal entries
+    auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
+    int s = 0, cur_sector = -1;
+    // If the next entry does not fit into the current sector and that sector is dirty,
+    // write the current sector out first
+    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_rollback) &&
+        journal.sector_info[journal.cur_sector].dirty)
+    {
+        if (cur_sector == -1)
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+        cur_sector = journal.cur_sector;
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    }
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        journal_entry_rollback *je = (journal_entry_rollback*)
+            prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
+        journal.sector_info[journal.cur_sector].dirty = false;
+        je->oid = v->oid;
+        je->version = v->version;
+        je->crc32 = je_crc32((journal_entry*)je);
+        journal.crc32_last = je->crc32;
+        // Queue one write per journal sector touched by this op
+        if (cur_sector != journal.cur_sector)
+        {
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            cur_sector = journal.cur_sector;
+            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+        }
+    }
+    // Flushed sector numbers are stored 1-based
+    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->pending_ops = s;
+    PRIV(op)->op_state = 1;
+    inflight_writes++;
+    return 1;
+}
+
+// Advances a rollback operation after its journal writes have been submitted.
+// States handled here:
+//   2 - journal sector writes completed: release the sectors, then go on as in state 3
+//   3 - sectors released: submit the journal fsync (skipped when disable_journal_fsync is set)
+//   5 - fsync completed: apply the rollback in memory and acknowledge the op
+// States 1 and 4 mean I/O is still in flight; handle_rollback_event() increments op_state.
+// Returns 0 when no SQE is available for the fsync (the caller must retry), otherwise 1.
+int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state == 2)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 3)
+        goto resume_3;
+    else if (PRIV(op)->op_state == 5)
+        goto resume_5;
+    else
+        return 1;
+resume_2:
+    // Release used journal sectors
+    release_journal_sectors(op);
+    // Remember that the sectors are released: if get_sqe() fails below, the retry
+    // must resume at the fsync step instead of releasing the same sectors again
+    PRIV(op)->op_state = 3;
+resume_3:
+    if (!disable_journal_fsync)
+    {
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            return 0;
+        }
+        ring_data_t *data = ((ring_data_t*)sqe->user_data);
+        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+        // Zero iov: handle_rollback_event() compares the result against iov_len
+        data->iov = { 0 };
+        data->callback = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        PRIV(op)->pending_ops = 1;
+        PRIV(op)->op_state = 4;
+        return 1;
+    }
+resume_5:
+    // Apply the rollback to the in-memory state
+    obj_ver_id* v;
+    int i;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        mark_rolled_back(*v);
+    }
+    journal.trim();
+    inflight_writes--;
+    // Acknowledge op
+    op->retval = 0;
+    FINISH_OP(op);
+    return 1;
+}
+
+// Applies a rollback to the in-memory state: removes dirty entries of object ov.oid
+// with versions greater than ov.version (stopping at the first in-flight or stable one)
+// and updates the unstable_writes entry of the object.
+void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
+{
+    auto it = dirty_db.lower_bound((obj_ver_id){
+        .oid = ov.oid,
+        .version = UINT64_MAX,
+    });
+    if (it != dirty_db.begin())
+    {
+        // Version of the newest remaining unstable entry, 0 if there is none
+        uint64_t max_unstable = 0;
+        // [rm_start, rm_end) is the range of entries to remove; empty at first
+        auto rm_start = it;
+        auto rm_end = it;
+        it--;
+        // The stop conditions are checked inside the loop: with all of them in the
+        // loop condition, the branch that records max_unstable could never run and
+        // the unstable_writes entry was always erased.
+        while (1)
+        {
+            if (it->first.oid != ov.oid)
+                break;
+            else if (it->first.version <= ov.version)
+            {
+                // First entry that survives the rollback
+                if (!IS_STABLE(it->second.state))
+                    max_unstable = it->first.version;
+                break;
+            }
+            else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
+                break;
+            // Remove entry
+            rm_start = it;
+            if (it == dirty_db.begin())
+                break;
+            it--;
+        }
+        if (rm_start != rm_end)
+        {
+            erase_dirty(rm_start, rm_end, UINT64_MAX);
+        }
+        auto unstab_it = unstable_writes.find(ov.oid);
+        if (unstab_it != unstable_writes.end())
+        {
+            if (max_unstable == 0)
+                unstable_writes.erase(unstab_it);
+            else
+                unstab_it->second = max_unstable;
+        }
+    }
+}
+
+// io_uring completion callback for the journal writes and the fsync of a rollback op.
+// Throws std::runtime_error when the result differs from the expected length;
+// otherwise moves the op to its next state once all pending requests have completed.
+void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
+{
+    live = true;
+    const bool io_ok = (data->res == data->iov.iov_len);
+    if (!io_ok)
+    {
+        inflight_writes--;
+        std::string msg = "write operation failed (";
+        msg += std::to_string(data->res);
+        msg += " != ";
+        msg += std::to_string(data->iov.iov_len);
+        msg += "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111";
+        throw std::runtime_error(msg);
+    }
+    PRIV(op)->pending_ops--;
+    if (PRIV(op)->pending_ops != 0)
+    {
+        // Other requests of this op are still in flight
+        return;
+    }
+    PRIV(op)->op_state++;
+    const int done = continue_rollback(op);
+    if (!done)
+    {
+        // Could not proceed right now - put the op back at the head of the queue
+        submit_queue.push_front(op);
+    }
+}
+
+// Removes the dirty_db entries in [dirty_start, dirty_end).
+// Walking from the end of the range backwards, it frees the data block of every big write
+// whose location differs from <clean_loc> and drops one journal sector reference per entry.
+void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
+{
+    for (auto cur = dirty_end; cur != dirty_start; )
+    {
+        cur--;
+        auto & entry = cur->second;
+        const bool owns_block = IS_BIG_WRITE(entry.state) && entry.location != clean_loc;
+        if (owns_block)
+        {
+#ifdef BLOCKSTORE_DEBUG
+            printf("Free block %lu\n", entry.location >> block_order);
+#endif
+            data_alloc->set(entry.location >> block_order, false);
+        }
+        // Drop this entry's reference to its journal sector
+        int used = --journal.used_sectors[entry.journal_sector];
+#ifdef BLOCKSTORE_DEBUG
+        printf(
+            "remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", entry.journal_sector,
+            cur->first.oid.inode, cur->first.oid.stripe, cur->first.version, used
+        );
+#endif
+        if (used == 0)
+        {
+            // Last reference gone - forget the sector
+            journal.used_sectors.erase(entry.journal_sector);
+        }
+    }
+    dirty_db.erase(dirty_start, dirty_end);
+}
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@ -0,0 +1,241 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Stabilize small write:
+// 1) Copy data from the journal to the data device
+// 2) Increase version on the metadata device and sync it
+// 3) Advance clean_db entry's version, clear previous journal entries
+//
+// This makes 1 4K small write+sync look like:
+// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
+// WA = 2.375. It's not the best, SSD FTL-like redirect-write could probably be lower
+// even with defragmentation. But it's fixed and it's still better than in Ceph. :)
+// except for HDD-only clusters, because each write results in 3 seeks.
+
+// Stabilize big write:
+// 1) Copy metadata from the journal to the metadata device
+// 2) Move dirty_db entry to clean_db and clear previous journal entries
+//
+// This makes 1 128K big write+sync look like:
+// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
+// WA = 1.012. Very good :)
+
+// Stabilize delete:
+// 1) Remove metadata entry and sync it
+// 2) Remove dirty_db entry and clear previous journal entries
+// We have 2 problems here:
+// - In the cluster environment, we must store the "tombstones" of deleted objects until
+//   all replicas (not just a quorum) agree about their deletion. That is, "stabilize" is
+//   not possible for deletes in degraded placement groups
+// - With simple "fixed" metadata tables we can't just clear the metadata entry of the latest
+//   object version. We must clear all previous entries, too.
+// FIXME Fix both problems - probably, by switching from "fixed" metadata tables to "dynamic"
+
+// AND We must do it in batches, for the sake of reduced fsync call count
+// AND We must know what we stabilize. Basic workflow is like:
+// 1) primary OSD receives sync request
+// 2) it submits syncs to blockstore and peers
+// 3) after everyone acks sync it acks sync to the client
+// 4) after a while it takes its synced object list and sends stabilize requests
+//    to peers and to its own blockstore, thus freeing the old version
+
+// Starts (or resumes) a stabilize operation.
+// op->buf holds op->len obj_ver_id items. After validation one JE_STABLE journal entry
+// is written per item.
+// Returns 1 when the op was dequeued (finished immediately, or journal writes were
+// submitted) and 0 when there is not enough journal space; the SQE macro may also
+// leave the function early.
+int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state)
+    {
+        // Already started - continue the state machine
+        return continue_stable(op);
+    }
+    obj_ver_id* v;
+    // todo counts the versions that are synced but not stable yet
+    int i, todo = 0;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        auto dirty_it = dirty_db.find(*v);
+        if (dirty_it == dirty_db.end())
+        {
+            auto clean_it = clean_db.find(v->oid);
+            if (clean_it == clean_db.end() || clean_it->second.version < v->version)
+            {
+                // No such object version
+                op->retval = -ENOENT;
+                FINISH_OP(op);
+                return 1;
+            }
+            else
+            {
+                // Already stable
+            }
+        }
+        else if (!IS_SYNCED(dirty_it->second.state))
+        {
+            // Object not synced yet. Caller must sync it first
+            op->retval = -EBUSY;
+            FINISH_OP(op);
+            return 1;
+        }
+        else if (!IS_STABLE(dirty_it->second.state))
+        {
+            todo++;
+        }
+    }
+    if (!todo)
+    {
+        // Already stable
+        op->retval = 0;
+        FINISH_OP(op);
+        return 1;
+    }
+    // Check journal space.
+    // The loop below writes an entry for every one of the op->len versions (see the FIXME),
+    // not only for the <todo> unstable ones, so space and SQEs are reserved for op->len
+    // entries. Reserving for <todo> only could overrun sqe[] and the checked journal space.
+    blockstore_journal_check_t space_check(this);
+    if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
+    {
+        return 0;
+    }
+    // There is sufficient space. Get SQEs
+    struct io_uring_sqe *sqe[space_check.sectors_required];
+    for (i = 0; i < space_check.sectors_required; i++)
+    {
+        BS_SUBMIT_GET_SQE_DECL(sqe[i]);
+    }
+    // Prepare and submit journal entries
+    auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
+    int s = 0, cur_sector = -1;
+    // If the next entry does not fit into the current sector and that sector is dirty,
+    // write the current sector out first
+    if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_stable) &&
+        journal.sector_info[journal.cur_sector].dirty)
+    {
+        if (cur_sector == -1)
+            PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+        cur_sector = journal.cur_sector;
+        prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+    }
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        // FIXME: Only stabilize versions that aren't stable yet
+        journal_entry_stable *je = (journal_entry_stable*)
+            prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
+        journal.sector_info[journal.cur_sector].dirty = false;
+        je->oid = v->oid;
+        je->version = v->version;
+        je->crc32 = je_crc32((journal_entry*)je);
+        journal.crc32_last = je->crc32;
+        // Queue one write per journal sector touched by this op
+        if (cur_sector != journal.cur_sector)
+        {
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            cur_sector = journal.cur_sector;
+            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+        }
+    }
+    // Flushed sector numbers are stored 1-based
+    PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->pending_ops = s;
+    PRIV(op)->op_state = 1;
+    inflight_writes++;
+    return 1;
+}
+
+// Advances a stabilize operation after its journal writes have been submitted.
+// States handled here:
+//   2 - journal sector writes completed: release the sectors, then go on as in state 3
+//   3 - sectors released: submit the journal fsync (skipped when disable_journal_fsync is set)
+//   5 - fsync completed: mark the versions stable in memory and acknowledge the op
+// States 1 and 4 mean I/O is still in flight; handle_stable_event() increments op_state.
+// Returns 0 when no SQE is available for the fsync (the caller must retry), otherwise 1.
+int blockstore_impl_t::continue_stable(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state == 2)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 3)
+        goto resume_3;
+    else if (PRIV(op)->op_state == 5)
+        goto resume_5;
+    else
+        return 1;
+resume_2:
+    // Release used journal sectors
+    release_journal_sectors(op);
+    // Remember that the sectors are released: if get_sqe() fails below, the retry
+    // must resume at the fsync step instead of releasing the same sectors again
+    PRIV(op)->op_state = 3;
+resume_3:
+    if (!disable_journal_fsync)
+    {
+        io_uring_sqe *sqe = get_sqe();
+        if (!sqe)
+        {
+            return 0;
+        }
+        ring_data_t *data = ((ring_data_t*)sqe->user_data);
+        my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+        // Zero iov: handle_stable_event() compares the result against iov_len
+        data->iov = { 0 };
+        data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        PRIV(op)->pending_ops = 1;
+        PRIV(op)->op_state = 4;
+        return 1;
+    }
+resume_5:
+    // Mark dirty_db entries as stable, acknowledge op completion
+    obj_ver_id* v;
+    int i;
+    for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
+    {
+        // Mark all dirty_db entries up to op->version as stable
+        mark_stable(*v);
+    }
+    inflight_writes--;
+    // Acknowledge op
+    op->retval = 0;
+    FINISH_OP(op);
+    return 1;
+}
+
+// Marks version <v> and the older synced versions of the same object as stable
+// in dirty_db, hands <v> to the flusher, and forgets the object's unstable_writes
+// entry when it does not exceed v.version.
+void blockstore_impl_t::mark_stable(const obj_ver_id & v)
+{
+    auto cur = dirty_db.find(v);
+    if (cur != dirty_db.end())
+    {
+        // Walk towards older versions until a stable entry, another object,
+        // or the beginning of dirty_db is reached
+        do
+        {
+            auto & st = cur->second.state;
+            if ((st & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
+            {
+                st = (st & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
+            }
+            else if (IS_STABLE(st))
+            {
+                break;
+            }
+            if (cur == dirty_db.begin())
+            {
+                break;
+            }
+            cur--;
+        } while (cur->first.oid == v.oid);
+#ifdef BLOCKSTORE_DEBUG
+        printf("enqueue_flush %lx:%lx v%lu\n", v.oid.inode, v.oid.stripe, v.version);
+#endif
+        flusher->enqueue_flush(v);
+    }
+    auto unstable = unstable_writes.find(v.oid);
+    if (unstable == unstable_writes.end())
+    {
+        return;
+    }
+    if (unstable->second <= v.version)
+    {
+        unstable_writes.erase(unstable);
+    }
+}
+
+// io_uring completion callback for the journal writes and the fsync of a stabilize op.
+// Throws std::runtime_error when the result differs from the expected length;
+// otherwise moves the op to its next state once all pending requests have completed.
+void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
+{
+    live = true;
+    const bool io_ok = (data->res == data->iov.iov_len);
+    if (!io_ok)
+    {
+        inflight_writes--;
+        std::string msg = "write operation failed (";
+        msg += std::to_string(data->res);
+        msg += " != ";
+        msg += std::to_string(data->iov.iov_len);
+        msg += "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111";
+        throw std::runtime_error(msg);
+    }
+    PRIV(op)->pending_ops--;
+    if (PRIV(op)->pending_ops != 0)
+    {
+        // Other requests of this op are still in flight
+        return;
+    }
+    PRIV(op)->op_state++;
+    const int done = continue_stable(op);
+    if (!done)
+    {
+        // Could not proceed right now - put the op back at the head of the queue
+        submit_queue.push_front(op);
+    }
+}
--- a/blockstore_sync.cpp
+++ b/blockstore_sync.cpp
@ -0,0 +1,305 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Values of PRIV(op)->op_state for sync operations (0 means "not started yet")
+// The sync covers only small (journaled) writes
+#define SYNC_HAS_SMALL 1
+// The sync covers at least one big write
+#define SYNC_HAS_BIG 2
+// fsync of the data device is submitted
+#define SYNC_DATA_SYNC_SENT 3
+// fsync of the data device is completed (or disabled by disable_data_fsync)
+#define SYNC_DATA_SYNC_DONE 4
+// Journal sector write(s) are submitted
+#define SYNC_JOURNAL_WRITE_SENT 5
+// Journal sector write(s) are completed (or there was nothing to write)
+#define SYNC_JOURNAL_WRITE_DONE 6
+// fsync of the journal is submitted
+#define SYNC_JOURNAL_SYNC_SENT 7
+// Everything is synced, the operation only waits to be acknowledged
+#define SYNC_DONE 8
+
+// Start (or resume) a sync operation taken from the submit queue.
+// On the first call the sync takes over all currently unsynced writes, selects its initial
+// state and registers itself in in_progress_syncs. Always returns 1 (dequeue).
+int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state == 0)
+    {
+        stop_sync_submitted = false;
+        PRIV(op)->sync_small_checked = 0;
+        PRIV(op)->sync_big_checked = 0;
+        // Take over the lists of unsynced writes and leave the global lists empty
+        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
+        unsynced_big_writes.clear();
+        PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
+        unsynced_small_writes.clear();
+        // Select the initial state depending on what has to be synced
+        if (!PRIV(op)->sync_big_writes.empty())
+        {
+            PRIV(op)->op_state = SYNC_HAS_BIG;
+        }
+        else if (!PRIV(op)->sync_small_writes.empty())
+        {
+            PRIV(op)->op_state = SYNC_HAS_SMALL;
+        }
+        else
+        {
+            PRIV(op)->op_state = SYNC_DONE;
+        }
+        // The sync is registered in in_progress_syncs unconditionally because
+        // the lists of unsynced writes were cleared above
+        PRIV(op)->prev_sync_count = in_progress_syncs.size();
+        PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
+    }
+    continue_sync(op);
+    // The operation is always removed from the queue because it lives in in_progress_syncs
+    return 1;
+}
+
+// Advance the state machine of a sync operation as far as currently possible.
+// Returns 0 when the sync has to wait (in-flight writes, no journal space, or ack_sync()
+// refusing because of an unfinished preceding sync), 1 when requests were submitted
+// or nothing could be done in the current state, 2 (from ack_sync()) when acknowledged.
+// NOTE(review): the BS_SUBMIT_GET_* macros are defined elsewhere; presumably they
+// return from this function when no SQE is available - confirm.
+int blockstore_impl_t::continue_sync(blockstore_op_t *op)
+{
+    // All requests submitted here complete through handle_sync_event()
+    auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
+    if (PRIV(op)->op_state == SYNC_HAS_SMALL)
+    {
+        // No big writes, just fsync the journal
+        // sync_small_checked persists between calls so already checked writes are not rechecked
+        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
+        {
+            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
+            {
+                // Wait for small inflight writes to complete
+                return 0;
+            }
+        }
+        if (journal.sector_info[journal.cur_sector].dirty)
+        {
+            // Write out the last journal sector if it happens to be dirty
+            BS_SUBMIT_GET_ONLY_SQE(sqe);
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
+            // Flushed sector numbers are stored 1-based, 0 means "none"
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops = 1;
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
+            return 1;
+        }
+        else
+        {
+            // Nothing to write, fall through to the journal fsync step below
+            PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
+        }
+    }
+    if (PRIV(op)->op_state == SYNC_HAS_BIG)
+    {
+        for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
+        {
+            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_big_writes[PRIV(op)->sync_big_checked]].state))
+            {
+                // Wait for big inflight writes to complete
+                return 0;
+            }
+        }
+        // 1st step: fsync data
+        if (!disable_data_fsync)
+        {
+            BS_SUBMIT_GET_SQE(sqe, data);
+            my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
+            // Zero iov so that handle_sync_event() expects a result of 0
+            data->iov = { 0 };
+            data->callback = cb;
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+            PRIV(op)->pending_ops = 1;
+            PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
+            return 1;
+        }
+        else
+        {
+            // Data fsync is disabled, fall through to the journal write step below
+            PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
+        }
+    }
+    if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
+    {
+        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
+        {
+            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
+            {
+                // Wait for small inflight writes to complete
+                return 0;
+            }
+        }
+        // 2nd step: Data device is synced, prepare & write journal entries
+        // Check space in the journal and journal memory buffers
+        blockstore_journal_check_t space_check(this);
+        if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), sizeof(journal_entry_big_write), 0))
+        {
+            return 0;
+        }
+        // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
+        // NOTE(review): the array size is a runtime value, i.e. this is a variable-length array,
+        // which is a compiler extension and not standard C++
+        struct io_uring_sqe *sqe[space_check.sectors_required];
+        for (int i = 0; i < space_check.sectors_required; i++)
+        {
+            BS_SUBMIT_GET_SQE_DECL(sqe[i]);
+        }
+        // Prepare and submit journal entries
+        auto it = PRIV(op)->sync_big_writes.begin();
+        // s counts the SQEs used so far, cur_sector is the last sector a write was prepared for
+        int s = 0, cur_sector = -1;
+        // If the current sector is dirty and has no room for one more entry, write it out first
+        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
+            journal.sector_info[journal.cur_sector].dirty)
+        {
+            // cur_sector is always -1 at this point
+            if (cur_sector == -1)
+                PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+            cur_sector = journal.cur_sector;
+            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+        }
+        while (it != PRIV(op)->sync_big_writes.end())
+        {
+            journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
+                journal, (dirty_db[*it].state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
+                sizeof(journal_entry_big_write)
+            );
+            // Remember which journal sector holds the entry and count the reference to it
+            dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
+            journal.sector_info[journal.cur_sector].dirty = false;
+            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
+#ifdef BLOCKSTORE_DEBUG
+            printf(
+                "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+                dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version,
+                journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+            );
+#endif
+            je->oid = it->oid;
+            je->version = it->version;
+            je->offset = dirty_db[*it].offset;
+            je->len = dirty_db[*it].len;
+            je->location = dirty_db[*it].location;
+            je->crc32 = je_crc32((journal_entry*)je);
+            journal.crc32_last = je->crc32;
+            it++;
+            // Prepare one write per journal sector: only when the entry landed in a new sector
+            if (cur_sector != journal.cur_sector)
+            {
+                if (cur_sector == -1)
+                    PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
+                cur_sector = journal.cur_sector;
+                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
+            }
+        }
+        PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+        PRIV(op)->pending_ops = s;
+        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
+        return 1;
+    }
+    if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
+    {
+        // Last step: fsync the journal
+        if (!disable_journal_fsync)
+        {
+            BS_SUBMIT_GET_SQE(sqe, data);
+            my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+            // Zero iov so that handle_sync_event() expects a result of 0
+            data->iov = { 0 };
+            data->callback = cb;
+            PRIV(op)->pending_ops = 1;
+            PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
+            return 1;
+        }
+        else
+        {
+            PRIV(op)->op_state = SYNC_DONE;
+        }
+    }
+    if (PRIV(op)->op_state == SYNC_DONE)
+    {
+        return ack_sync(op);
+    }
+    // One of the *_SENT states: requests are in progress, nothing to do here
+    return 1;
+}
+
+// Completion callback for a request submitted by continue_sync().
+// Throws std::runtime_error when the result differs from the expected length.
+// When the last pending request completes, releases the flushed journal sectors and
+// moves the sync from a *_SENT state to the matching *_DONE state; a completed
+// journal fsync finishes the sync and tries to acknowledge it.
+void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op)
+{
+    live = true;
+    if (data->res != data->iov.iov_len)
+    {
+        throw std::runtime_error(
+            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
+            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
+        );
+    }
+    PRIV(op)->pending_ops--;
+    if (PRIV(op)->pending_ops != 0)
+    {
+        // Other requests of this sync are still in progress
+        return;
+    }
+    // Release used journal sectors
+    release_journal_sectors(op);
+    // Switch to the next state
+    switch (PRIV(op)->op_state)
+    {
+    case SYNC_DATA_SYNC_SENT:
+        PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
+        break;
+    case SYNC_JOURNAL_WRITE_SENT:
+        PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
+        break;
+    case SYNC_JOURNAL_SYNC_SENT:
+        PRIV(op)->op_state = SYNC_DONE;
+        ack_sync(op);
+        break;
+    default:
+        throw std::runtime_error("BUG: unexpected sync op state");
+    }
+}
+
+// Acknowledge a finished sync if no earlier sync is still in progress, then acknowledge
+// the following syncs that are finished and were only waiting for their predecessors.
+// Returns 2 when <op> was acknowledged, 0 when it has to wait.
+int blockstore_impl_t::ack_sync(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state != SYNC_DONE || PRIV(op)->prev_sync_count != 0)
+    {
+        return 0;
+    }
+    // Remember the successor before <op> is removed from in_progress_syncs
+    auto next_it = PRIV(op)->in_progress_ptr;
+    ++next_it;
+    // Number of syncs acknowledged so far, including <op>
+    int acked = 1;
+    ack_one_sync(op);
+    while (next_it != in_progress_syncs.end())
+    {
+        // Advance first: acknowledging the follower removes it from the list
+        auto follower = *next_it;
+        ++next_it;
+        // Remove dependency on the syncs acknowledged above
+        PRIV(follower)->prev_sync_count -= acked;
+        if (PRIV(follower)->prev_sync_count == 0 && PRIV(follower)->op_state == SYNC_DONE)
+        {
+            acked++;
+            ack_one_sync(follower);
+        }
+    }
+    return 2;
+}
+
+// Finish a single sync operation: mark all writes covered by it as synced (and as stable
+// where applicable), record them in unstable_writes, remove the sync from
+// in_progress_syncs and complete the operation with retval 0.
+void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
+{
+    // Big writes
+    for (auto & big: PRIV(op)->sync_big_writes)
+    {
+#ifdef BLOCKSTORE_DEBUG
+        printf("Ack sync big %lx:%lx v%lu\n", big.oid.inode, big.oid.stripe, big.version);
+#endif
+        // Remember the maximum synced version of the object
+        auto & unstab = unstable_writes[big.oid];
+        if (unstab < big.version)
+        {
+            unstab = big.version;
+        }
+        auto dirty_it = dirty_db.find(big);
+        dirty_it->second.state = ((dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED);
+        if (dirty_it->second.state & BS_ST_INSTANT)
+        {
+            mark_stable(dirty_it->first);
+        }
+        // Newer versions of the same object that waited for this big write may proceed now
+        for (dirty_it++; dirty_it != dirty_db.end() && dirty_it->first.oid == big.oid; dirty_it++)
+        {
+            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
+            {
+                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT;
+            }
+        }
+    }
+    // Small writes and deletes
+    for (auto & small: PRIV(op)->sync_small_writes)
+    {
+#ifdef BLOCKSTORE_DEBUG
+        printf("Ack sync small %lx:%lx v%lu\n", small.oid.inode, small.oid.stripe, small.version);
+#endif
+        auto & unstab = unstable_writes[small.oid];
+        if (unstab < small.version)
+        {
+            unstab = small.version;
+        }
+        auto & entry = dirty_db[small];
+        if (entry.state == (BS_ST_DELETE | BS_ST_WRITTEN))
+        {
+            entry.state = (BS_ST_DELETE | BS_ST_SYNCED);
+            // Deletions are treated as immediately stable
+            mark_stable(small);
+        }
+        else /* (BS_ST_INSTANT?) | BS_ST_SMALL_WRITE | BS_ST_WRITTEN */
+        {
+            entry.state = (entry.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED;
+            if (entry.state & BS_ST_INSTANT)
+            {
+                mark_stable(small);
+            }
+        }
+    }
+    in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
+    op->retval = 0;
+    FINISH_OP(op);
+}
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@ -0,0 +1,514 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 (see README.md for details)
+
+#include "blockstore_impl.h"
+
+// Validate a write or delete request, assign its version number and register it in dirty_db.
+// If op->version is 0, the next free version of the object is assigned to it.
+// Returns true when the operation is registered and should be queued.
+// Returns false with op->retval set when it should not be queued:
+//   -EEXIST - the requested version is lower than the next free version
+//   0       - a delete of an object that is already deleted or does not exist
+bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
+{
+    // Check or assign version number
+    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
+    bool is_inflight_big = false;
+    uint64_t version = 1;
+    // The newest dirty version of the object, if any, is the entry right before
+    // the first key greater than (oid, UINT64_MAX)
+    auto dirty_it = dirty_db.upper_bound((obj_ver_id){
+        .oid = op->oid,
+        .version = UINT64_MAX,
+    });
+    // Step back only when a preceding entry exists: decrementing begin() is undefined behaviour.
+    // This covers both an empty dirty_db and the case when all dirty objects sort after op->oid
+    if (dirty_it != dirty_db.begin())
+    {
+        dirty_it--;
+        if (dirty_it->first.oid == op->oid)
+        {
+            found = true;
+            version = dirty_it->first.version + 1;
+            deleted = IS_DELETE(dirty_it->second.state);
+            // The previous version is an unsynced big write, or a write that itself waits for one
+            is_inflight_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
+                ? !IS_SYNCED(dirty_it->second.state)
+                : ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
+        }
+    }
+    if (!found)
+    {
+        // No dirty versions, look at the clean one
+        auto clean_it = clean_db.find(op->oid);
+        if (clean_it != clean_db.end())
+        {
+            version = clean_it->second.version + 1;
+        }
+        else
+        {
+            // The object does not exist at all
+            deleted = true;
+        }
+    }
+    if (op->version == 0)
+    {
+        op->version = version;
+    }
+    else if (op->version < version)
+    {
+        // Invalid version requested
+        op->retval = -EEXIST;
+        return false;
+    }
+    if (deleted && is_del)
+    {
+        // Already deleted
+        op->retval = 0;
+        return false;
+    }
+    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
+        immediate_commit != IMMEDIATE_ALL)
+    {
+        // Issue an additional sync so that the previous big write can reach the journal
+        blockstore_op_t *sync_op = new blockstore_op_t;
+        sync_op->opcode = BS_OP_SYNC;
+        // The helper sync is owned by the blockstore and is destroyed on completion
+        sync_op->callback = [](blockstore_op_t *sync_op)
+        {
+            delete sync_op;
+        };
+        enqueue_op(sync_op);
+    }
+#ifdef BLOCKSTORE_DEBUG
+    if (is_del)
+        printf("Delete %lx:%lx v%lu\n", op->oid.inode, op->oid.stripe, op->version);
+    else
+        printf("Write %lx:%lx v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
+#endif
+    // No strict need to add it into dirty_db here, it's just left
+    // from the previous implementation where reads waited for writes
+    // Initial state: a full-block write or a write to a deleted/missing object is a big write,
+    // everything else is a small write which waits (WAIT_BIG) if the previous big write is unsynced
+    dirty_db.emplace((obj_ver_id){
+        .oid = op->oid,
+        .version = op->version,
+    }, (dirty_entry){
+        .state = (uint32_t)(
+            is_del
+                ? (BS_ST_DELETE | BS_ST_IN_FLIGHT)
+                : (op->opcode == BS_OP_WRITE_STABLE ? BS_ST_INSTANT : 0) | (op->len == block_size || deleted
+                    ? (BS_ST_BIG_WRITE | BS_ST_IN_FLIGHT)
+                    : (is_inflight_big ? (BS_ST_SMALL_WRITE | BS_ST_WAIT_BIG) : (BS_ST_SMALL_WRITE | BS_ST_IN_FLIGHT)))
+        ),
+        .flags = 0,
+        .location = 0,
+        .offset = is_del ? 0 : op->offset,
+        .len = is_del ? 0 : op->len,
+        .journal_sector = 0,
+    });
+    return true;
+}
+
+// First step of the write algorithm: dequeue operation and submit initial write(s)
+// Returns 0 when the operation can't be started yet (it waits for a preceding big write,
+// for journal space or for a free data block) and 1 when it was submitted or finished.
+// op_state set here is advanced by 1 in handle_write_event() when all requests complete:
+//   1 - big write data submitted in immediate_commit == IMMEDIATE_ALL mode,
+//       continue_write() then writes the journal entry (state 2)
+//   3 - the last set of requests is submitted
+//   4 - everything is written, continue_write() acknowledges the operation
+// NOTE(review): the BS_SUBMIT_GET_* macros are defined elsewhere; presumably they
+// return from this function when no SQE is available - confirm.
+int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
+{
+    if (PRIV(op)->op_state)
+    {
+        // Already started, this is a resubmission from handle_write_event()
+        return continue_write(op);
+    }
+    auto dirty_it = dirty_db.find((obj_ver_id){
+        .oid = op->oid,
+        .version = op->version,
+    });
+    assert(dirty_it != dirty_db.end());
+    if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
+    {
+        // Don't dequeue
+        return 0;
+    }
+    else if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
+    {
+        // Reserve journal space for the entries of all unsynced big writes plus this one
+        blockstore_journal_check_t space_check(this);
+        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
+        {
+            return 0;
+        }
+        // Big (redirect) write
+        uint64_t loc = data_alloc->find_free();
+        if (loc == UINT64_MAX)
+        {
+            // no space
+            if (flusher->is_active())
+            {
+                // hope that some space will be available after flush
+                PRIV(op)->wait_for = WAIT_FREE;
+                return 0;
+            }
+            op->retval = -ENOSPC;
+            FINISH_OP(op);
+            return 1;
+        }
+        BS_SUBMIT_GET_SQE(sqe, data);
+        // loc is a block number, location is a byte offset
+        dirty_it->second.location = loc << block_order;
+        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
+#ifdef BLOCKSTORE_DEBUG
+        printf("Allocate block %lu\n", loc);
+#endif
+        data_alloc->set(loc, true);
+        // Unaligned head and tail of the write relative to bitmap_granularity
+        uint64_t stripe_offset = (op->offset % bitmap_granularity);
+        uint64_t stripe_end = (op->offset + op->len) % bitmap_granularity;
+        // Zero fill up to bitmap_granularity
+        int vcnt = 0;
+        if (stripe_offset)
+        {
+            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
+        }
+        PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
+        if (stripe_end)
+        {
+            // Convert the tail position into the number of bytes to pad
+            stripe_end = bitmap_granularity - stripe_end;
+            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
+        }
+        data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
+        data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
+        my_uring_prep_writev(
+            sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
+        );
+        PRIV(op)->pending_ops = 1;
+        // No journal sectors are flushed by this step
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        if (immediate_commit != IMMEDIATE_ALL)
+        {
+            // Remember big write as unsynced
+            unsynced_big_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+            PRIV(op)->op_state = 3;
+        }
+        else
+        {
+            // The journal entry is written by continue_write() right after the data
+            PRIV(op)->op_state = 1;
+        }
+    }
+    else /* if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_SMALL_WRITE) */
+    {
+        // Small (journaled) write
+        // First check if the journal has sufficient space
+        // (&& binds tighter than ||: either the entries of unsynced big writes
+        // or the entry and data of this write don't fit)
+        blockstore_journal_check_t space_check(this);
+        if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
+            || !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
+        {
+            return 0;
+        }
+        // There is sufficient space. Get SQE(s)
+        // sqe1 is for the journal sector write, sqe2 is for the journal data write
+        struct io_uring_sqe *sqe1 = NULL;
+        if (immediate_commit != IMMEDIATE_NONE ||
+            (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
+            journal.sector_info[journal.cur_sector].dirty)
+        {
+            // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
+            BS_SUBMIT_GET_SQE_DECL(sqe1);
+        }
+        struct io_uring_sqe *sqe2 = NULL;
+        if (op->len > 0)
+        {
+            BS_SUBMIT_GET_SQE_DECL(sqe2);
+        }
+        // Got SQEs. Prepare previous journal sector write if required
+        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
+        if (immediate_commit == IMMEDIATE_NONE)
+        {
+            if (sqe1)
+            {
+                // The full sector is written out before the new entry is placed into the next one
+                prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
+                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+                PRIV(op)->pending_ops++;
+            }
+            else
+            {
+                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+            }
+        }
+        // Then pre-fill journal entry
+        journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
+            journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
+            sizeof(journal_entry_small_write)
+        );
+        // Remember which journal sector holds the entry and count the reference to it
+        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
+#ifdef BLOCKSTORE_DEBUG
+        printf(
+            "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+            dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+        );
+#endif
+        // Figure out where data will be
+        // (wrap to the position right after the first journal block if the data doesn't fit at the end)
+        journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
+        je->oid = op->oid;
+        je->version = op->version;
+        je->offset = op->offset;
+        je->len = op->len;
+        je->data_offset = journal.next_free;
+        je->crc32_data = crc32c(0, op->buf, op->len);
+        je->crc32 = je_crc32((journal_entry*)je);
+        journal.crc32_last = je->crc32;
+        if (immediate_commit != IMMEDIATE_NONE)
+        {
+            // In the immediate_commit mode the sector with the new entry is written right away
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops++;
+        }
+        if (op->len > 0)
+        {
+            // Prepare journal data write
+            if (journal.inmemory)
+            {
+                // Copy data
+                memcpy(journal.buffer + journal.next_free, op->buf, op->len);
+            }
+            ring_data_t *data2 = ((ring_data_t*)sqe2->user_data);
+            data2->iov = (struct iovec){ op->buf, op->len };
+            data2->callback = cb;
+            my_uring_prep_writev(
+                sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
+            );
+            PRIV(op)->pending_ops++;
+        }
+        else
+        {
+            // Zero-length overwrite. Allowed to bump object version in EC placement groups without actually writing data
+        }
+        dirty_it->second.location = journal.next_free;
+        dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
+        journal.next_free += op->len;
+        if (journal.next_free >= journal.len)
+        {
+            journal.next_free = journal_block_size;
+        }
+        if (immediate_commit == IMMEDIATE_NONE)
+        {
+            // Remember small write as unsynced
+            unsynced_small_writes.push_back((obj_ver_id){
+                .oid = op->oid,
+                .version = op->version,
+            });
+        }
+        if (!PRIV(op)->pending_ops)
+        {
+            // Nothing was submitted (zero-length write into a journal sector kept in memory),
+            // acknowledge right away
+            PRIV(op)->op_state = 4;
+            continue_write(op);
+        }
+        else
+        {
+            PRIV(op)->op_state = 3;
+        }
+    }
+    // Paired with inflight_writes-- in continue_write() and in the error path of handle_write_event()
+    inflight_writes++;
+    return 1;
+}
+
+// Resume a write or delete operation after its previously submitted requests are completed.
+// op_state 2: write the journal entry of a big write (immediate_commit mode only).
+// op_state 4: switch the object state and acknowledge the operation.
+// Any other state: nothing to do.
+// Returns 0 only when no SQE is available in state 2 (the caller requeues the operation), otherwise 1.
+int blockstore_impl_t::continue_write(blockstore_op_t *op)
+{
+    io_uring_sqe *sqe = NULL;
+    journal_entry_big_write *je;
+    auto dirty_it = dirty_db.find((obj_ver_id){
+        .oid = op->oid,
+        .version = op->version,
+    });
+    assert(dirty_it != dirty_db.end());
+    if (PRIV(op)->op_state == 2)
+        goto resume_2;
+    else if (PRIV(op)->op_state == 4)
+        goto resume_4;
+    else
+        return 1;
+resume_2:
+    // Only for the immediate_commit mode: prepare and submit big_write journal entry
+    sqe = get_sqe();
+    if (!sqe)
+    {
+        return 0;
+    }
+    je = (journal_entry_big_write*)prefill_single_journal_entry(
+        journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
+        sizeof(journal_entry_big_write)
+    );
+    // Remember which journal sector holds the entry and count the reference to it
+    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+    journal.sector_info[journal.cur_sector].dirty = false;
+    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
+#ifdef BLOCKSTORE_DEBUG
+    printf(
+        "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+        journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
+        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+    );
+#endif
+    je->oid = op->oid;
+    je->version = op->version;
+    je->offset = op->offset;
+    je->len = op->len;
+    je->location = dirty_it->second.location;
+    je->crc32 = je_crc32((journal_entry*)je);
+    journal.crc32_last = je->crc32;
+    prepare_journal_sector_write(journal, journal.cur_sector, sqe,
+        [this, op](ring_data_t *data) { handle_write_event(data, op); });
+    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+    PRIV(op)->pending_ops = 1;
+    // handle_write_event() moves the operation to state 4 when the sector write completes
+    PRIV(op)->op_state = 3;
+    return 1;
+resume_4:
+    // Switch object state
+#ifdef BLOCKSTORE_DEBUG
+    printf("Ack write %lx:%lx v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
+#endif
+    // Is the write already durable? Big writes are in IMMEDIATE_ALL mode only,
+    // small writes and deletes are in any immediate_commit mode
+    bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
+        ? (immediate_commit == IMMEDIATE_ALL)
+        : (immediate_commit != IMMEDIATE_NONE);
+    if (imm)
+    {
+        // Remember the maximum synced version of the object
+        auto & unstab = unstable_writes[op->oid];
+        unstab = unstab < op->version ? op->version : unstab;
+    }
+    dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
+        | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
+    if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
+    {
+        // Deletions are treated as immediately stable
+        mark_stable(dirty_it->first);
+    }
+    if (immediate_commit == IMMEDIATE_ALL)
+    {
+        // Newer versions of the same object that waited for this write may proceed now
+        dirty_it++;
+        while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
+        {
+            if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
+            {
+                dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT;
+            }
+            dirty_it++;
+        }
+    }
+    inflight_writes--;
+    // Acknowledge write
+    op->retval = op->len;
+    FINISH_OP(op);
+    return 1;
+}
+
+// Completion callback for a request submitted on behalf of a write or delete operation.
+// Throws std::runtime_error when the result differs from the expected length.
+// When the last pending request of <op> completes, releases the flushed journal sectors,
+// advances op_state and resumes the operation; if it can't proceed, puts it back
+// at the head of the submit queue.
+void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
+{
+    live = true;
+    if (data->res != data->iov.iov_len)
+    {
+        inflight_writes--;
+        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
+        throw std::runtime_error(
+            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
+            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
+        );
+    }
+    PRIV(op)->pending_ops--;
+    if (PRIV(op)->pending_ops != 0)
+    {
+        // Other requests of this operation are still in progress
+        return;
+    }
+    release_journal_sectors(op);
+    PRIV(op)->op_state++;
+    const bool proceeded = continue_write(op);
+    if (!proceeded)
+    {
+        submit_queue.push_front(op);
+    }
+}
+
+// Drop the references held by <op> on the journal sectors it has flushed
+// (the 1-based range min_flushed_journal_sector..max_flushed_journal_sector, wrapping
+// around sector_count) and move journal.dirty_start forward past the sectors
+// that are not used anymore. Resets the range of <op> to "none" (0) afterwards.
+void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
+{
+    if (PRIV(op)->min_flushed_journal_sector > 0 &&
+        PRIV(op)->max_flushed_journal_sector > 0)
+    {
+        const uint64_t last = PRIV(op)->max_flushed_journal_sector;
+        for (uint64_t s = PRIV(op)->min_flushed_journal_sector; ; s = 1 + s % journal.sector_count)
+        {
+            auto & sector = journal.sector_info[s-1];
+            sector.usage_count--;
+            if (s != (1+journal.cur_sector) && sector.usage_count == 0)
+            {
+                // We know for sure that we won't write into this sector anymore
+                uint64_t next_start = sector.offset + journal.block_size;
+                if (next_start >= journal.len)
+                {
+                    next_start = journal.block_size;
+                }
+                // Compare positions counted from used_start so that the wrap-around is handled
+                auto cur_pos = journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len);
+                auto next_pos = next_start + (next_start >= journal.used_start ? 0 : journal.len);
+                if (cur_pos < next_pos)
+                {
+                    journal.dirty_start = next_start;
+                }
+            }
+            if (s == last)
+            {
+                break;
+            }
+        }
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+    }
+}
+
+// Submit a delete operation: put a JE_DELETE entry into the journal.
+// Returns 0 when there is not enough journal space and 1 when the operation
+// was submitted or finished.
+// NOTE(review): BS_SUBMIT_GET_SQE_DECL is defined elsewhere; presumably it returns
+// from this function when no SQE is available - confirm.
+// NOTE(review): continue_write() decrements inflight_writes when it acknowledges the
+// operation, but unlike dequeue_write() this function never increments it - confirm
+// whether deletes are meant to be counted.
+int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
+{
+    auto dirty_it = dirty_db.find((obj_ver_id){
+        .oid = op->oid,
+        .version = op->version,
+    });
+    assert(dirty_it != dirty_db.end());
+    blockstore_journal_check_t space_check(this);
+    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
+    {
+        return 0;
+    }
+    io_uring_sqe *sqe = NULL;
+    if (immediate_commit != IMMEDIATE_NONE ||
+        ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+        journal.sector_info[journal.cur_sector].dirty))
+    {
+        // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
+        BS_SUBMIT_GET_SQE_DECL(sqe);
+    }
+    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
+    // Prepare journal sector write
+    if (immediate_commit == IMMEDIATE_NONE)
+    {
+        if (sqe)
+        {
+            // The full sector is written out before the new entry is placed into the next one
+            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+            PRIV(op)->pending_ops++;
+        }
+        else
+        {
+            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
+        }
+    }
+    // Pre-fill journal entry
+    journal_entry_del *je = (journal_entry_del*)prefill_single_journal_entry(
+        journal, JE_DELETE, sizeof(struct journal_entry_del)
+    );
+    // Remember which journal sector holds the entry and count the reference to it
+    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
+    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
+#ifdef BLOCKSTORE_DEBUG
+    printf(
+        "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
+        dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
+        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
+    );
+#endif
+    je->oid = op->oid;
+    je->version = op->version;
+    je->crc32 = je_crc32((journal_entry*)je);
+    journal.crc32_last = je->crc32;
+    dirty_it->second.state = BS_ST_DELETE | BS_ST_SUBMITTED;
+    if (immediate_commit != IMMEDIATE_NONE)
+    {
+        // In the immediate_commit mode the sector with the new entry is written right away
+        // and continue_write() marks the delete as synced, so it needs no separate sync
+        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
+        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
+        PRIV(op)->pending_ops++;
+    }
+    else
+    {
+        // Remember the delete as unsynced so that the next sync moves it from
+        // BS_ST_WRITTEN to BS_ST_SYNCED (see ack_one_sync), same as for small writes
+        unsynced_small_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
+    }
+    if (!PRIV(op)->pending_ops)
+    {
+        // Nothing was submitted, acknowledge right away
+        PRIV(op)->op_state = 4;
+        continue_write(op);
+    }
+    else
+    {
+        PRIV(op)->op_state = 3;
+    }
+    return 1;
+}
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@ -0,0 +1,745 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+
+#include "cluster_client.h"
+
+// Creates a cluster client bound to <ringloop> and <tfd>.
+// Installs messenger callbacks (repeer_pgs, exec_op) and etcd state client hooks,
+// reads "use_sync_send_recv" and "log_level" from <config>, passes <config> to
+// st_cli.parse_config() and calls st_cli.load_global_config().
+// When <ringloop> is non-NULL, a ring loop consumer that pumps the messenger is registered.
+cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
+{
+    this->ringloop = ringloop;
+    this->tfd = tfd;
+
+    // osd_num is set to 0 for the messenger of this client
+    msgr.osd_num = 0;
+    msgr.tfd = tfd;
+    msgr.ringloop = ringloop;
+    // Invoked by the messenger with the number of the affected peer OSD;
+    // whether it connected or dropped is derived from osd_peer_fds below
+    msgr.repeer_pgs = [this](osd_num_t peer_osd)
+    {
+        if (msgr.osd_peer_fds.find(peer_osd) != msgr.osd_peer_fds.end())
+        {
+            // peer_osd just connected
+            continue_ops();
+        }
+        // NOTE(review): syncing_writes are only rescanned when unsynced_writes is
+        // non-empty - confirm this is intended, continue_sync() swaps the two lists
+        else if (unsynced_writes.size())
+        {
+            // peer_osd just dropped connection
+            for (auto op: syncing_writes)
+            {
+                for (auto & part: op->parts)
+                {
+                    if (part.osd_num == peer_osd && part.done)
+                    {
+                        // repeat this operation
+                        part.osd_num = 0;
+                        part.done = false;
+                        assert(!part.sent);
+                        op->done_count--;
+                    }
+                }
+            }
+            for (auto op: unsynced_writes)
+            {
+                for (auto & part: op->parts)
+                {
+                    if (part.osd_num == peer_osd && part.done)
+                    {
+                        // repeat this operation
+                        part.osd_num = 0;
+                        part.done = false;
+                        assert(!part.sent);
+                        op->done_count--;
+                    }
+                }
+                // Put incomplete writes back into the active set so continue_ops() resends them
+                if (op->done_count < op->parts.size())
+                {
+                    cur_ops.insert(op);
+                }
+            }
+            continue_ops();
+        }
+    };
+    // The client does not serve incoming operations: log, drop the connection and free the op
+    msgr.exec_op = [this](osd_op_t *op)
+    {
+        // Garbage in
+        printf("Incoming garbage from peer %d\n", op->peer_fd);
+        msgr.stop_client(op->peer_fd);
+        delete op;
+    };
+    // Accept both a JSON boolean and a non-zero number as "enabled"
+    msgr.use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
+        config["use_sync_send_recv"].uint64_value();
+
+    st_cli.tfd = tfd;
+    // Forward etcd state client events to the corresponding member functions
+    st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
+    st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
+    st_cli.on_change_hook = [this](json11::Json::object & changes) { on_change_hook(changes); };
+    st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
+
+    log_level = config["log_level"].int64_value();
+    st_cli.parse_config(config);
+    st_cli.load_global_config();
+
+    if (ringloop)
+    {
+        // On every loop iteration: read requests, send replies, then submit to the ring
+        consumer.loop = [this]()
+        {
+            msgr.read_requests();
+            msgr.send_replies();
+            this->ringloop->submit();
+        };
+        ringloop->register_consumer(&consumer);
+    }
+}
+
+// Detaches the consumer from the ring loop; nothing to do when no loop was given
+cluster_client_t::~cluster_client_t()
+{
+    if (!ringloop)
+        return;
+    ringloop->unregister_consumer(&consumer);
+}
+
+// Stops every client known to the messenger, one at a time,
+// until the messenger's client list is empty
+void cluster_client_t::stop()
+{
+    for (;;)
+    {
+        if (msgr.clients.size() == 0)
+            break;
+        auto first_client = msgr.clients.begin();
+        msgr.stop_client(first_client->first);
+    }
+}
+
+// Re-drives every operation in cur_ops.
+// Operations flagged with up_wait are skipped unless <up_retry> is true,
+// in which case the flag is cleared and the operation is continued too.
+void cluster_client_t::continue_ops(bool up_retry)
+{
+    auto op_it = cur_ops.begin();
+    while (op_it != cur_ops.end())
+    {
+        // Step the iterator first: continue_rw() may erase this op from cur_ops
+        cluster_op_t *op = *op_it;
+        op_it++;
+        if (op->up_wait)
+        {
+            if (!up_retry)
+                continue;
+            op->up_wait = false;
+        }
+        continue_rw(op);
+    }
+}
+
+// Returns log2(value) when <value> is an exact power of two, otherwise 64
+// (an order no 64-bit value can have, used as the "not a power of two" marker).
+// Zero is not a power of two, so it yields 64 as well.
+static uint32_t is_power_of_two(uint64_t value)
+{
+    // A power of two has exactly one bit set
+    if (!value || (value & (value - 1)))
+    {
+        return 64;
+    }
+    uint32_t order = 0;
+    while (value > 1)
+    {
+        value = value >> 1;
+        order++;
+    }
+    return order;
+}
+
+// Applies the loaded global configuration: block geometry, commit mode,
+// dirty limit, retry interval and peer connection timings. Absent or zero
+// values fall back to defaults. Throws std::runtime_error on a bad block size.
+// Finishes by starting the etcd watcher and loading PGs.
+void cluster_client_t::on_load_config_hook(json11::Json::object & config)
+{
+    bs_block_size = config["block_size"].uint64_value();
+    if (!bs_block_size)
+        bs_block_size = DEFAULT_BLOCK_SIZE;
+    bs_disk_alignment = config["disk_alignment"].uint64_value();
+    if (!bs_disk_alignment)
+        bs_disk_alignment = DEFAULT_DISK_ALIGNMENT;
+    bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
+    if (!bs_bitmap_granularity)
+        bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
+    // Block size must be a power of two within [MIN_BLOCK_SIZE, MAX_BLOCK_SIZE)
+    uint32_t block_order = is_power_of_two(bs_block_size);
+    bool size_in_range = !(bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE);
+    if (block_order >= 64 || !size_in_range)
+    {
+        throw std::runtime_error("Bad block size");
+    }
+    if (config["immediate_commit"] == "all")
+    {
+        // Cluster-wide immediate_commit mode
+        immediate_commit = true;
+    }
+    else if (config.find("client_dirty_limit") != config.end())
+    {
+        client_dirty_limit = config["client_dirty_limit"].uint64_value();
+    }
+    if (!client_dirty_limit)
+        client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
+    // Retry interval: 500 when unset, never below 50
+    up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
+    if (up_wait_retry_interval == 0)
+        up_wait_retry_interval = 500;
+    else if (up_wait_retry_interval < 50)
+        up_wait_retry_interval = 50;
+    msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
+    if (!msgr.peer_connect_interval)
+        msgr.peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
+    msgr.peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
+    if (!msgr.peer_connect_timeout)
+        msgr.peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
+    st_cli.start_etcd_watcher();
+    st_cli.load_pgs();
+}
+
+// Called by the etcd state client after PGs are loaded.
+// Records the current PG count of every pool, submits the operations that were
+// queued while the client was offline and re-drives all operations in progress.
+// NOTE(review): <success> is not examined here - confirm a failed load needs no handling.
+void cluster_client_t::on_load_pgs_hook(bool success)
+{
+    // Iterate by reference: copying a pool config on every step is needless work
+    for (const auto & pool_item: st_cli.pool_config)
+    {
+        pg_counts[pool_item.first] = pool_item.second.real_pg_count;
+    }
+    // Detach the queue before replaying it: execute() may append to offline_ops,
+    // which must neither invalidate this loop nor be wiped out afterwards
+    std::vector<cluster_op_t*> queued_ops;
+    queued_ops.swap(offline_ops);
+    for (auto op: queued_ops)
+    {
+        execute(op);
+    }
+    continue_ops();
+}
+
+// Called by the etcd state client when the cluster state changes.
+// For every pool whose PG count differs from the recorded one, flags all of its
+// operations (in progress, unsynced and syncing) for reslicing and records the
+// new PG count. Then re-drives all operations in progress.
+// <changes> is not used here.
+void cluster_client_t::on_change_hook(json11::Json::object & changes)
+{
+    // Iterate by reference to avoid copying each pool config together with its PG map
+    for (const auto & pool_item: st_cli.pool_config)
+    {
+        const auto pool_id = pool_item.first;
+        if (pg_counts[pool_id] == pool_item.second.real_pg_count)
+        {
+            continue;
+        }
+        // At this point, all pool operations should have been suspended
+        // And now they have to be resliced!
+        auto mark_for_reslice = [pool_id](cluster_op_t *op)
+        {
+            if (INODE_POOL(op->inode) == pool_id)
+            {
+                op->needs_reslice = true;
+            }
+        };
+        for (auto op: cur_ops)
+        {
+            mark_for_reslice(op);
+        }
+        for (auto op: unsynced_writes)
+        {
+            mark_for_reslice(op);
+        }
+        for (auto op: syncing_writes)
+        {
+            mark_for_reslice(op);
+        }
+        pg_counts[pool_id] = pool_item.second.real_pg_count;
+    }
+    continue_ops();
+}
+
+// Called when the state of <peer_osd> changes.
+// Attempts a connection only when the messenger lists this OSD among its wanted peers.
+void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
+{
+    auto wanted_it = msgr.wanted_peers.find(peer_osd);
+    if (wanted_it == msgr.wanted_peers.end())
+        return;
+    msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
+}
+
+/**
+ * How writes are synced when immediate_commit is false
+ *
+ * 1) accept up to <client_dirty_limit> write operations for execution,
+ *    queue all subsequent writes into <next_writes>
+ * 2) accept exactly one SYNC, queue all subsequent SYNCs into <next_writes>, too
+ * 3) "continue" all accepted writes
+ *
+ * "Continue" WRITE:
+ * 1) if the operation is not a copy yet - copy it (required for replay)
+ * 2) if the operation is not sliced yet - slice it
+ * 3) if the operation doesn't require reslice - try to connect & send all remaining parts
+ * 4) if any of them fail due to disconnected peers or PGs not up, repeat after reconnecting or small timeout
+ * 5) if any of them fail due to other errors, fail the operation and forget it from the current "unsynced batch"
+ * 6) if PG count changes before all parts are done, wait for all in-progress parts to finish,
+ *    throw all results away, reslice and resubmit op
+ * 7) when all parts are done, try to "continue" the current SYNC
+ * 8) if the operation succeeds, but then some OSDs drop their connections, repeat
+ *    parts from the current "unsynced batch" previously sent to those OSDs in any order
+ *
+ * "Continue" current SYNC:
+ * 1) take all unsynced operations from the current batch
+ * 2) check if all affected OSDs are still alive
+ * 3) if yes, send all SYNCs. otherwise, leave current SYNC as is.
+ * 4) if any of them fail due to disconnected peers, repeat SYNC after repeating all writes
+ * 5) if any of them fail due to other errors, fail the SYNC operation
+ */
+
+// Entry point for READ, WRITE and SYNC operations.
+// Operations arriving before the configuration is loaded are queued in offline_ops.
+// Invalid operations complete immediately with -EINVAL through their callback.
+// When immediate_commit is off, writes are held back behind a pending SYNC and an
+// internal SYNC is injected once queued_bytes reaches client_dirty_limit.
+void cluster_client_t::execute(cluster_op_t *op)
+{
+    if (!bs_disk_alignment)
+    {
+        // We're offline
+        offline_ops.push_back(op);
+        return;
+    }
+    op->retval = 0;
+    const bool is_rw = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_WRITE);
+    const bool known_opcode = (is_rw || op->opcode == OSD_OP_SYNC);
+    // Reads and writes need an inode, a non-zero length and aligned offset/length
+    const bool bad_rw = is_rw && (!op->inode || !op->len ||
+        op->offset % bs_disk_alignment || op->len % bs_disk_alignment);
+    if (!known_opcode || bad_rw)
+    {
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
+    }
+    if (op->opcode == OSD_OP_SYNC)
+    {
+        execute_sync(op);
+        return;
+    }
+    const bool buffered_write = (op->opcode == OSD_OP_WRITE && !immediate_commit);
+    if (buffered_write)
+    {
+        if (next_writes.size() > 0)
+        {
+            // Something is already waiting behind the current SYNC, keep the order
+            assert(cur_sync);
+            next_writes.push_back(op);
+            return;
+        }
+        if (queued_bytes >= client_dirty_limit)
+        {
+            // Push an extra SYNC operation to flush previous writes
+            next_writes.push_back(op);
+            cluster_op_t *flush_op = new cluster_op_t;
+            flush_op->is_internal = true;
+            flush_op->opcode = OSD_OP_SYNC;
+            flush_op->callback = [](cluster_op_t* finished_op) {};
+            execute_sync(flush_op);
+            return;
+        }
+        queued_bytes += op->len;
+    }
+    cur_ops.insert(op);
+    continue_rw(op);
+}
+
+// Advances a READ or WRITE operation as far as currently possible.
+//
+// 1) Operations with a zero pool ID fail with -EINVAL and are forgotten.
+// 2) Operations to pools that are unknown or have no PGs are postponed.
+// 3) A user write in non-immediate_commit mode is replaced by an internal copy
+//    (own buffer) which is kept in unsynced_writes for replay; the original
+//    operation is acknowledged from the copy's callback.
+// 4) The operation is sliced if it has no parts, and unsent parts are sent
+//    unless a reslice is pending.
+// 5) When nothing is in flight: complete on success, fail on any error other
+//    than -EPIPE, otherwise clear the error and reslice if requested.
+void cluster_client_t::continue_rw(cluster_op_t *op)
+{
+    pool_id_t pool_id = INODE_POOL(op->inode);
+    if (!pool_id)
+    {
+        // Forget the operation before reporting the error: the caller owns it after
+        // the callback, so it must not stay in cur_ops to be continued again
+        cur_ops.erase(op);
+        op->retval = -EINVAL;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
+    }
+    if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
+        st_cli.pool_config[pool_id].real_pg_count == 0)
+    {
+        // Postpone operations to unknown pools
+        return;
+    }
+    if (op->opcode == OSD_OP_WRITE && !immediate_commit && !op->is_internal)
+    {
+        // Save operation for replay when PG goes out of sync
+        // (primary OSD drops our connection in this case)
+        cluster_op_t *op_copy = new cluster_op_t();
+        op_copy->is_internal = true;
+        op_copy->orig_op = op;
+        op_copy->opcode = op->opcode;
+        op_copy->inode = op->inode;
+        op_copy->offset = op->offset;
+        op_copy->len = op->len;
+        op_copy->buf = malloc_or_die(op->len);
+        op_copy->iov.push_back(op_copy->buf, op->len);
+        op_copy->callback = [](cluster_op_t* op_copy)
+        {
+            if (op_copy->orig_op)
+            {
+                // Acknowledge write and forget the original pointer
+                op_copy->orig_op->retval = op_copy->retval;
+                std::function<void(cluster_op_t*)>(op_copy->orig_op->callback)(op_copy->orig_op);
+                op_copy->orig_op = NULL;
+            }
+        };
+        // Flatten the caller's scattered buffers into the single private buffer
+        void *cur_buf = op_copy->buf;
+        for (int i = 0; i < op->iov.count; i++)
+        {
+            memcpy(cur_buf, op->iov.buf[i].iov_base, op->iov.buf[i].iov_len);
+            cur_buf += op->iov.buf[i].iov_len;
+        }
+        unsynced_writes.push_back(op_copy);
+        // From now on only the copy is tracked as "in progress"
+        cur_ops.erase(op);
+        cur_ops.insert(op_copy);
+        op = op_copy;
+    }
+    if (!op->parts.size())
+    {
+        // Slice the operation into parts
+        slice_rw(op);
+    }
+    if (!op->needs_reslice)
+    {
+        // Send unsent parts, if they're not subject to change
+        for (auto & op_part: op->parts)
+        {
+            if (!op_part.sent && !op_part.done)
+            {
+                try_send(op, &op_part);
+            }
+        }
+    }
+    if (!op->sent_count)
+    {
+        if (op->done_count >= op->parts.size())
+        {
+            // Finished successfully
+            // Even if the PG count has changed in meanwhile we treat it as success
+            // because if some operations were invalid for the new PG count we'd get errors
+            cur_ops.erase(op);
+            op->retval = op->len;
+            std::function<void(cluster_op_t*)>(op->callback)(op);
+            continue_sync();
+            return;
+        }
+        else if (op->retval != 0 && op->retval != -EPIPE)
+        {
+            // Fatal error (not -EPIPE)
+            cur_ops.erase(op);
+            if (!immediate_commit && op->opcode == OSD_OP_WRITE)
+            {
+                // A failed write leaves the current unsynced batch
+                for (int i = 0; i < unsynced_writes.size(); i++)
+                {
+                    if (unsynced_writes[i] == op)
+                    {
+                        unsynced_writes.erase(unsynced_writes.begin()+i, unsynced_writes.begin()+i+1);
+                        break;
+                    }
+                }
+            }
+            // Read the flag before the callback runs; internal copies are freed here
+            bool del = op->is_internal;
+            std::function<void(cluster_op_t*)>(op->callback)(op);
+            if (del)
+            {
+                if (op->buf)
+                    free(op->buf);
+                delete op;
+            }
+            continue_sync();
+            return;
+        }
+        else
+        {
+            // -EPIPE or no error - clear the error
+            op->retval = 0;
+            if (op->needs_reslice)
+            {
+                // Throw away all parts and start over with the new PG count
+                op->parts.clear();
+                op->done_count = 0;
+                op->needs_reslice = false;
+                continue_rw(op);
+            }
+        }
+    }
+}
+
+// Splits <op> into per-stripe parts stored in op->parts and resets op->retval to 0.
+// Each part receives its byte range, the PG number it maps to and the slices of
+// op->iov that cover that range. Asserts that op->iov covers the whole operation.
+void cluster_client_t::slice_rw(cluster_op_t *op)
+{
+    // Slice the request into individual object stripe requests
+    // Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
+    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
+    uint64_t pg_block_size = bs_block_size * (
+        pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_minsize
+    );
+    // Start offsets of the first and the last stripe touched by [offset, offset+len)
+    uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
+    uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
+    op->retval = 0;
+    op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
+    // Cursor into op->iov: current buffer index and byte position inside that buffer
+    int iov_idx = 0;
+    size_t iov_pos = 0;
+    int i = 0;
+    for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
+    {
+        // PG numbers are 1-based
+        pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
+        // Clamp the operation's range to the current stripe
+        uint64_t begin = (op->offset < stripe ? stripe : op->offset);
+        uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
+            ? (stripe + pg_block_size) : (op->offset + op->len);
+        op->parts[i] = {
+            .parent = op,
+            .offset = begin,
+            .len = (uint32_t)(end - begin),
+            .pg_num = pg_num,
+            .sent = false,
+            .done = false,
+        };
+        // Hand out <left> bytes of the caller's buffers to this part
+        int left = end-begin;
+        while (left > 0 && iov_idx < op->iov.count)
+        {
+            if (op->iov.buf[iov_idx].iov_len - iov_pos < left)
+            {
+                // The rest of the current buffer is not enough: take all of it and move on
+                op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, op->iov.buf[iov_idx].iov_len - iov_pos);
+                left -= (op->iov.buf[iov_idx].iov_len - iov_pos);
+                iov_pos = 0;
+                iov_idx++;
+            }
+            else
+            {
+                // The current buffer covers the remainder of this part
+                op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
+                iov_pos += left;
+                left = 0;
+            }
+        }
+        // Fails when the supplied buffers are shorter than the operation length
+        assert(left == 0);
+        i++;
+    }
+}
+
+// Tries to send one part of <op> to the current primary OSD of its PG.
+// Returns true when the part was queued for sending. Returns false when the PG is
+// unknown, paused or has no primary, or when the primary is not connected; in the
+// last case a connection attempt is started unless the OSD is already wanted.
+bool cluster_client_t::try_send(cluster_op_t *op, cluster_op_part_t *part)
+{
+    auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
+    auto pg_it = pool_cfg.pg_config.find(part->pg_num);
+    if (pg_it == pool_cfg.pg_config.end() ||
+        pg_it->second.pause || !pg_it->second.cur_primary)
+    {
+        return false;
+    }
+    osd_num_t primary_osd = pg_it->second.cur_primary;
+    auto peer_it = msgr.osd_peer_fds.find(primary_osd);
+    if (peer_it == msgr.osd_peer_fds.end())
+    {
+        // Not connected yet - request a connection unless one is already wanted
+        if (msgr.wanted_peers.find(primary_osd) == msgr.wanted_peers.end())
+        {
+            msgr.connect_peer(primary_osd, st_cli.peer_states[primary_osd]);
+        }
+        return false;
+    }
+    part->osd_num = primary_osd;
+    part->sent = true;
+    op->sent_count++;
+    part->op = {
+        .op_type = OSD_OP_OUT,
+        .peer_fd = peer_it->second,
+        .req = { .rw = {
+            .header = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = op_id++,
+                .opcode = op->opcode,
+            },
+            .inode = op->inode,
+            .offset = part->offset,
+            .len = part->len,
+        } },
+        .callback = [this, part](osd_op_t *op_part)
+        {
+            handle_op_part(part);
+        },
+    };
+    part->op.iov = part->iov;
+    msgr.outbox_push(&part->op);
+    return true;
+}
+
+// Accepts a SYNC operation.
+// In immediate_commit mode it completes at once with retval 0. Otherwise it
+// becomes the current SYNC, or is queued in next_writes when one is already active.
+void cluster_client_t::execute_sync(cluster_op_t *op)
+{
+    if (immediate_commit)
+    {
+        // Syncs are not required in the immediate_commit mode
+        op->retval = 0;
+        std::function<void(cluster_op_t*)>(op->callback)(op);
+        return;
+    }
+    if (cur_sync != NULL)
+    {
+        next_writes.push_back(op);
+        return;
+    }
+    cur_sync = op;
+    continue_sync();
+}
+
+// Tries to submit the current SYNC.
+// Does nothing when there is no current SYNC, when it is already submitted, when
+// some unsynced write is unfinished or when an affected OSD is not connected.
+// With no affected OSDs the SYNC is finished immediately. Otherwise the unsynced
+// batch becomes the syncing batch and one SYNC part is sent to every affected OSD.
+void cluster_client_t::continue_sync()
+{
+    if (cur_sync == NULL)
+    {
+        return;
+    }
+    if (cur_sync->parts.size() > 0)
+    {
+        // Already submitted
+        return;
+    }
+    cur_sync->retval = 0;
+    // Collect the OSDs that received parts of the unsynced writes
+    std::set<osd_num_t> dirty_osds;
+    for (auto write_op: unsynced_writes)
+    {
+        if (write_op->done_count < write_op->parts.size())
+        {
+            // Writes not finished yet
+            return;
+        }
+        for (auto & write_part: write_op->parts)
+        {
+            if (!write_part.osd_num)
+                continue;
+            dirty_osds.insert(write_part.osd_num);
+        }
+    }
+    if (dirty_osds.size() == 0)
+    {
+        // No dirty writes
+        finish_sync();
+        return;
+    }
+    // Check that all OSD connections are still alive
+    for (auto osd_num: dirty_osds)
+    {
+        if (msgr.osd_peer_fds.find(osd_num) == msgr.osd_peer_fds.end())
+        {
+            // SYNC is pointless to send to a non connected OSD
+            return;
+        }
+    }
+    unsynced_writes.swap(syncing_writes);
+    // Post sync to affected OSDs
+    cur_sync->parts.resize(dirty_osds.size());
+    int part_idx = 0;
+    for (auto osd_num: dirty_osds)
+    {
+        cluster_op_part_t *sync_part = &cur_sync->parts[part_idx];
+        *sync_part = {
+            .parent = cur_sync,
+            .osd_num = osd_num,
+            .sent = false,
+            .done = false,
+        };
+        send_sync(cur_sync, sync_part);
+        part_idx++;
+    }
+}
+
+// Completes the current SYNC once all of its parts have been answered
+// (or immediately when there was nothing to sync).
+//
+// - On any error the syncing batch is returned to the front of unsynced_writes and
+//   its unfinished writes are put back into cur_ops.
+// - On -EPIPE the SYNC itself is reset and kept as the current SYNC for a later retry.
+// - Otherwise the SYNC callback is invoked; on success the internal write copies of
+//   the synced batch are freed.
+// - Finally the SYNC is forgotten (and freed if it was created internally),
+//   queued_bytes is reset and the operations queued in next_writes are executed.
+void cluster_client_t::finish_sync()
+{
+    int retval = cur_sync->retval;
+    if (retval != 0)
+    {
+        for (auto op: syncing_writes)
+        {
+            if (op->done_count < op->parts.size())
+            {
+                cur_ops.insert(op);
+            }
+        }
+        unsynced_writes.insert(unsynced_writes.begin(), syncing_writes.begin(), syncing_writes.end());
+        syncing_writes.clear();
+    }
+    if (retval == -EPIPE)
+    {
+        // Retry later
+        cur_sync->parts.clear();
+        cur_sync->retval = 0;
+        cur_sync->sent_count = 0;
+        cur_sync->done_count = 0;
+        return;
+    }
+    // Remember ownership before the callback: a user-supplied operation belongs
+    // to the caller afterwards and must not be dereferenced any more
+    cluster_op_t *sync_op = cur_sync;
+    bool del_sync = sync_op->is_internal;
+    std::function<void(cluster_op_t*)>(sync_op->callback)(sync_op);
+    if (!retval)
+    {
+        for (auto op: syncing_writes)
+        {
+            assert(op->sent_count == 0);
+            if (op->is_internal)
+            {
+                if (op->buf)
+                    free(op->buf);
+                delete op;
+            }
+        }
+        syncing_writes.clear();
+    }
+    cur_sync = NULL;
+    if (del_sync)
+    {
+        // Internal SYNCs are allocated by execute() and have no other owner
+        delete sync_op;
+    }
+    queued_bytes = 0;
+    // Work on a detached copy: execute() may append to next_writes again
+    std::vector<cluster_op_t*> next_wr_copy;
+    next_wr_copy.swap(next_writes);
+    for (auto next_op: next_wr_copy)
+    {
+        execute(next_op);
+    }
+}
+
+// Sends one SYNC part of <op> to the OSD stored in part->osd_num.
+// The OSD must be connected (asserted). The reply is routed to handle_op_part().
+void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
+{
+    auto conn_it = msgr.osd_peer_fds.find(part->osd_num);
+    assert(conn_it != msgr.osd_peer_fds.end());
+    // Account for the part as "in flight" before handing it to the messenger
+    op->sent_count++;
+    part->sent = true;
+    part->op = {
+        .op_type = OSD_OP_OUT,
+        .peer_fd = conn_it->second,
+        .req = {
+            .hdr = {
+                .magic = SECONDARY_OSD_OP_MAGIC,
+                .id = op_id++,
+                .opcode = OSD_OP_SYNC,
+            },
+        },
+        .callback = [this, part](osd_op_t *op_part)
+        {
+            handle_op_part(part);
+        },
+    };
+    msgr.outbox_push(&part->op);
+}
+
+// Completion handler for a single part (of a READ, WRITE or SYNC).
+// Marks the part as done on success. On failure drops the connection to the OSD,
+// records the error in the parent operation and, for -EPIPE, schedules a retry.
+// When no more parts of the parent are in flight, continues the parent operation.
+void cluster_client_t::handle_op_part(cluster_op_part_t *part)
+{
+    cluster_op_t *op = part->parent;
+    part->sent = false;
+    op->sent_count--;
+    // SYNC is expected to return 0, reads and writes - the requested length
+    int expected = part->op.req.hdr.opcode == OSD_OP_SYNC ? 0 : part->op.req.rw.len;
+    if (part->op.reply.hdr.retval != expected)
+    {
+        // Operation failed, retry
+        printf(
+            "Operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
+            part->osd_num, part->op.reply.hdr.retval, expected
+        );
+        msgr.stop_client(part->op.peer_fd);
+        if (part->op.reply.hdr.retval == -EPIPE)
+        {
+            // Hold the operation back until the retry timer fires
+            op->up_wait = true;
+            // A single shared timer is enough: it retries all waiting operations
+            if (!retry_timeout_id)
+            {
+                retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
+                {
+                    retry_timeout_id = 0;
+                    continue_ops(true);
+                });
+            }
+        }
+        if (!op->retval || op->retval == -EPIPE)
+        {
+            // Don't overwrite other errors with -EPIPE
+            op->retval = part->op.reply.hdr.retval;
+        }
+    }
+    else
+    {
+        // OK
+        part->done = true;
+        op->done_count++;
+    }
+    // Last in-flight part answered: let the parent operation make progress
+    if (op->sent_count == 0)
+    {
+        if (op->opcode == OSD_OP_SYNC)
+        {
+            assert(op == cur_sync);
+            finish_sync();
+        }
+        else
+        {
+            continue_rw(op);
+        }
+    }
+}
--- a/cluster_client.h
+++ b/cluster_client.h
@ -0,0 +1,102 @@
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
+
+#pragma once
+
+#include "messenger.h"
+#include "etcd_state_client.h"
+
+// Size limits and defaults, in bytes.
+// Each expression is parenthesized so that it keeps its value when the macro is
+// used inside a larger expression (for example as a divisor or a modulus).
+#define MIN_BLOCK_SIZE (4*1024)
+#define MAX_BLOCK_SIZE (128*1024*1024)
+#define DEFAULT_BLOCK_SIZE (128*1024)
+#define DEFAULT_DISK_ALIGNMENT 4096
+#define DEFAULT_BITMAP_GRANULARITY 4096
+#define DEFAULT_CLIENT_DIRTY_LIMIT (32*1024*1024)
+
+struct cluster_op_t;
+
+// One piece of a cluster operation, addressed to a single OSD
+struct cluster_op_part_t
+{
+    // operation this part belongs to
+    cluster_op_t *parent;
+    // byte range of the part
+    uint64_t offset;
+    uint32_t len;
+    // PG the part maps to (set when the operation is sliced)
+    pg_num_t pg_num;
+    // OSD the part was sent to; reset to 0 when the part has to be repeated
+    osd_num_t osd_num;
+    // data buffers of the part
+    osd_op_buf_list_t iov;
+    // sent: request is in flight; done: reply with the expected result was received
+    bool sent;
+    bool done;
+    // messenger-level operation used to transmit the part
+    osd_op_t op;
+};
+
+// A client-level operation submitted through cluster_client_t::execute().
+// The public fields are filled by the caller; the protected ones are
+// bookkeeping managed by cluster_client_t.
+struct cluster_op_t
+{
+    uint64_t opcode; // OSD_OP_READ, OSD_OP_WRITE, OSD_OP_SYNC
+    uint64_t inode;
+    uint64_t offset;
+    uint64_t len;
+    // result: set to len on successful reads/writes, 0 on successful sync, negative errno on failure
+    int retval;
+    // data buffers
+    osd_op_buf_list_t iov;
+    // invoked when the operation completes
+    std::function<void(cluster_op_t*)> callback;
+protected:
+    // private data buffer of an internal write copy
+    void *buf = NULL;
+    // for an internal write copy: the user operation that still has to be acknowledged
+    cluster_op_t *orig_op = NULL;
+    // true for operations allocated by the client itself
+    bool is_internal = false;
+    // parts must be thrown away and recomputed (PG count of the pool has changed)
+    bool needs_reslice = false;
+    // operation waits for the retry timer before being continued
+    bool up_wait = false;
+    // number of parts in flight / parts completed successfully
+    int sent_count = 0, done_count = 0;
+    std::vector<cluster_op_part_t> parts;
+    friend class cluster_client_t;
+};
+
+// Client for the cluster: accepts READ, WRITE and SYNC operations, slices them
+// into per-PG parts, sends the parts to primary OSDs through the messenger and,
+// when immediate_commit is off, keeps copies of unsynced writes for replay.
+class cluster_client_t
+{
+    timerfd_manager_t *tfd;
+    ring_loop_t *ringloop;
+
+    // Block geometry from the cluster configuration.
+    // bs_disk_alignment == 0 means the configuration is not loaded yet ("offline")
+    uint64_t bs_block_size = 0;
+    uint64_t bs_disk_alignment = 0;
+    uint64_t bs_bitmap_granularity = 0;
+    // last seen PG count of every pool, used to detect PG count changes
+    std::map<pool_id_t, uint64_t> pg_counts;
+    bool immediate_commit = false;
+    // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
+    // threshold of queued_bytes at which an internal SYNC is issued
+    uint64_t client_dirty_limit = 0;
+    int log_level;
+    int up_wait_retry_interval = 500; // ms
+
+    // ID assigned to the next request sent to an OSD
+    uint64_t op_id = 1;
+    etcd_state_client_t st_cli;
+    osd_messenger_t msgr;
+    ring_consumer_t consumer;
+    // operations currently in progress
+    std::set<cluster_op_t*> cur_ops;
+    // ID of the pending retry timer, 0 when none is armed
+    int retry_timeout_id = 0;
+    // unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
+    // unsynced_writes are replayed in any order (because only the SYNC operation guarantees ordering)
+    std::vector<cluster_op_t*> unsynced_writes;
+    // writes covered by the SYNC which is currently submitted
+    std::vector<cluster_op_t*> syncing_writes;
+    // the SYNC being processed, NULL when there is none
+    cluster_op_t* cur_sync = NULL;
+    // operations queued until the current SYNC finishes
+    std::vector<cluster_op_t*> next_writes;
+    // operations received before the configuration was loaded
+    std::vector<cluster_op_t*> offline_ops;
+    // total length of writes accepted since the last finished SYNC
+    uint64_t queued_bytes = 0;
+
+public:
+    cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
+    ~cluster_client_t();
+    // Submit an operation; the result is reported through op->callback
+    void execute(cluster_op_t *op);
+    // Stop all messenger clients
+    void stop();
+
+protected:
+    void continue_ops(bool up_retry = false);
+    void on_load_config_hook(json11::Json::object & config);
+    void on_load_pgs_hook(bool success);
+    void on_change_hook(json11::Json::object & changes);
+    void on_change_osd_state_hook(uint64_t peer_osd);
+    void continue_rw(cluster_op_t *op);
+    void slice_rw(cluster_op_t *op);
+    bool try_send(cluster_op_t *op, cluster_op_part_t *part);
+    void execute_sync(cluster_op_t *op);
+    void continue_sync();
+    void finish_sync();
+    void send_sync(cluster_op_t *op, cluster_op_part_t *part);
+    void handle_op_part(cluster_op_part_t *part);
+};
--- a/copy-fio-includes.sh
+++ b/copy-fio-includes.sh
@ -1,13 +0,0 @@
-#!/bin/bash
-
-gcc -I. -E -o fio_headers.i src/util/fio_headers.h
-
-rm -rf fio-copy
-for i in `grep -Po 'fio/[^"]+' fio_headers.i | sort | uniq`; do
-    j=${i##fio/}
-    p=$(dirname $j)
-    mkdir -p fio-copy/$p
-    cp $i fio-copy/$j
-done
-
-rm fio_headers.i
--- a/copy-qemu-includes.sh
+++ b/copy-qemu-includes.sh
@ -1,18 +0,0 @@
-#!/bin/bash
-
-#cd qemu
-#debian/rules b/configure-stamp
-#cd b/qemu; make qapi
-
-gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-    -I qemu/include -E -o qemu_driver.i src/client/qemu_driver.c
-
-rm -rf qemu-copy
-for i in `grep -Po 'qemu/[^"]+' qemu_driver.i | sort | uniq`; do
-    j=${i##qemu/}
-    p=$(dirname $j)
-    mkdir -p qemu-copy/$p
-    cp $i qemu-copy/$j
-done
-
-rm qemu_driver.i
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 8de8b467acbca50cfd8835c20e0e379110f3b32b
+Subproject commit 5dc108754ad40d3b1d024f9bd7cca0595ef1a1db
--- a/src/util/crc32c.c
+++ b/src/util/crc32c.c
--- a/src/util/crc32c.h
+++ b/src/util/crc32c.h
@ -8,10 +8,4 @@
 // unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)
 // unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)

-#ifdef __cplusplus
-extern "C" {
-#endif
 uint32_t crc32c(uint32_t crc, const void *buf, size_t len);
-#ifdef __cplusplus
-};
-#endif
--- a/csi/.dockerignore
+++ b/csi/.dockerignore
@ -1,2 +0,0 @@
-vitastor-csi
-Dockerfile
--- a/csi/Dockerfile
+++ b/csi/Dockerfile
@ -1,48 +0,0 @@
-# Compile stage
-FROM golang:bookworm AS build
-
-ADD go.sum go.mod /app/
-RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
-ADD . /app
-RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'` && \
-    cd /app && \
-    CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
-
-# Final stage
-FROM debian:bookworm
-
-LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
-LABEL description="Vitastor CSI Driver"
-
-ENV NODE_ID=""
-ENV CSI_ENDPOINT=""
-
-RUN apt-get update && \
-    apt-get install -y wget && \
-    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
-    apt-get update && \
-    apt-get install -y e2fsprogs xfsprogs kmod iproute2 \
-        # dependencies of qemu-storage-daemon
-        libnuma1 liburing2 libglib2.0-0 libfuse3-3 libaio1 libzstd1 libnettle8 \
-        libgmp10 libhogweed6 libp11-kit0 libidn2-0 libunistring2 libtasn1-6 libpcre2-8-0 libffi8 && \
-    apt-get clean && \
-    (echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
-
-COPY --from=build /app/vitastor-csi /bin/
-
-RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/vitastor.list) && \
-    ((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
-    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
-    apt-get update && \
-    apt-get install -y vitastor-client && \
-    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-utils_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
-    wget https://vitastor.io/archive/qemu/qemu-bookworm-8.1.2%2Bds-1%2Bvitastor1/qemu-block-extra_8.1.2%2Bds-1%2Bvitastor1_amd64.deb && \
-    dpkg -x qemu-utils*.deb tmp1 && \
-    dpkg -x qemu-block-extra*.deb tmp1 && \
-    cp -a tmp1/usr/bin/qemu-storage-daemon /usr/bin/ && \
-    mkdir -p /usr/lib/x86_64-linux-gnu/qemu && \
-    cp -a tmp1/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/ && \
-    rm -rf tmp1 *.deb && \
-    apt-get clean
-
-ENTRYPOINT ["/bin/vitastor-csi"]
--- a/csi/Makefile
+++ b/csi/Makefile
@ -1,9 +0,0 @@
-VITASTOR_VERSION ?= v1.9.3
-
-all: build push
-
-build:
-	@docker build --rm -t vitalif/vitastor-csi:$(VITASTOR_VERSION) .
-
-push:
-	@docker push vitalif/vitastor-csi:$(VITASTOR_VERSION)
--- a/csi/deploy/000-csi-namespace.yaml
+++ b/csi/deploy/000-csi-namespace.yaml
@ -1,5 +0,0 @@
---
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: vitastor-system
--- a/csi/deploy/001-csi-config-map.yaml
+++ b/csi/deploy/001-csi-config-map.yaml
@ -1,10 +0,0 @@
---
-apiVersion: v1
-kind: ConfigMap
-data:
-  # You can add multiple configuration files here to use a multi-cluster setup
-  vitastor.conf: |-
-    {"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
-metadata:
-  namespace: vitastor-system
-  name: vitastor-config
--- a/csi/deploy/002-csi-nodeplugin-rbac.yaml
+++ b/csi/deploy/002-csi-nodeplugin-rbac.yaml
@ -1,37 +0,0 @@
---
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-nodeplugin
---
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-nodeplugin
-rules:
-  - apiGroups: [""]
-    resources: ["nodes"]
-    verbs: ["get"]
-  # allow to read Vault Token and connection options from the Tenants namespace
-  - apiGroups: [""]
-    resources: ["secrets"]
-    verbs: ["get"]
-  - apiGroups: [""]
-    resources: ["configmaps"]
-    verbs: ["get"]
---
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-nodeplugin
-subjects:
-  - kind: ServiceAccount
-    name: vitastor-csi-nodeplugin
-    namespace: vitastor-system
-roleRef:
-  kind: ClusterRole
-  name: vitastor-csi-nodeplugin
-  apiGroup: rbac.authorization.k8s.io
--- a/csi/deploy/003-csi-nodeplugin-psp.yaml
+++ b/csi/deploy/003-csi-nodeplugin-psp.yaml
@ -1,72 +0,0 @@
---
-apiVersion: policy/v1beta1
-kind: PodSecurityPolicy
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-nodeplugin-psp
-spec:
-  allowPrivilegeEscalation: true
-  allowedCapabilities:
-    - 'SYS_ADMIN'
-  fsGroup:
-    rule: RunAsAny
-  privileged: true
-  hostNetwork: true
-  hostPID: true
-  runAsUser:
-    rule: RunAsAny
-  seLinux:
-    rule: RunAsAny
-  supplementalGroups:
-    rule: RunAsAny
-  volumes:
-    - 'configMap'
-    - 'emptyDir'
-    - 'projected'
-    - 'secret'
-    - 'downwardAPI'
-    - 'hostPath'
-  allowedHostPaths:
-    - pathPrefix: '/dev'
-      readOnly: false
-    - pathPrefix: '/run/mount'
-      readOnly: false
-    - pathPrefix: '/sys'
-      readOnly: false
-    - pathPrefix: '/lib/modules'
-      readOnly: true
-    - pathPrefix: '/var/lib/kubelet/pods'
-      readOnly: false
-    - pathPrefix: '/var/lib/kubelet/plugins/csi.vitastor.io'
-      readOnly: false
-    - pathPrefix: '/var/lib/kubelet/plugins_registry'
-      readOnly: false
-    - pathPrefix: '/var/lib/kubelet/plugins'
-      readOnly: false
-
---
-kind: Role
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-nodeplugin-psp
-rules:
-  - apiGroups: ['policy']
-    resources: ['podsecuritypolicies']
-    verbs: ['use']
-    resourceNames: ['vitastor-csi-nodeplugin-psp']
-
---
-kind: RoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-nodeplugin-psp
-subjects:
-  - kind: ServiceAccount
-    name: vitastor-csi-nodeplugin
-    namespace: vitastor-system
-roleRef:
-  kind: Role
-  name: vitastor-csi-nodeplugin-psp
-  apiGroup: rbac.authorization.k8s.io
--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@ -1,145 +0,0 @@
---
-kind: DaemonSet
-apiVersion: apps/v1
-metadata:
-  namespace: vitastor-system
-  name: csi-vitastor
-spec:
-  selector:
-    matchLabels:
-      app: csi-vitastor
-  template:
-    metadata:
-      namespace: vitastor-system
-      labels:
-        app: csi-vitastor
-    spec:
-      serviceAccountName: vitastor-csi-nodeplugin
-      hostNetwork: true
-      hostPID: true
-      priorityClassName: system-node-critical
-      # to use e.g. Rook orchestrated cluster, and mons' FQDN is
-      # resolved through k8s service, set dns policy to cluster first
-      dnsPolicy: ClusterFirstWithHostNet
-      containers:
-        - name: driver-registrar
-          # This is necessary only for systems with SELinux, where
-          # non-privileged sidecar containers cannot access unix domain socket
-          # created by privileged CSI driver container.
-          securityContext:
-            privileged: true
-          image: k8s.gcr.io/sig-storage/csi-node-driver-registrar:v2.2.0
-          args:
-            - "--v=5"
-            - "--csi-address=/csi/csi.sock"
-            - "--kubelet-registration-path=/var/lib/kubelet/plugins/csi.vitastor.io/csi.sock"
-          env:
-            - name: KUBE_NODE_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-          volumeMounts:
-            - name: socket-dir
-              mountPath: /csi
-            - name: registration-dir
-              mountPath: /registration
-        - name: csi-vitastor
-          securityContext:
-            privileged: true
-            capabilities:
-              add: ["SYS_ADMIN"]
-            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.9.3
-          args:
-            - "--node=$(NODE_ID)"
-            - "--endpoint=$(CSI_ENDPOINT)"
-          env:
-            - name: NODE_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: CSI_ENDPOINT
-              value: unix:///csi/csi.sock
-          imagePullPolicy: "IfNotPresent"
-          ports:
-          - containerPort: 9898
-            name: healthz
-            protocol: TCP
-          livenessProbe:
-            failureThreshold: 5
-            httpGet:
-              path: /healthz
-              port: healthz
-            initialDelaySeconds: 10
-            timeoutSeconds: 3
-            periodSeconds: 2
-          volumeMounts:
-            - name: socket-dir
-              mountPath: /csi
-            - mountPath: /dev
-              name: host-dev
-            - mountPath: /sys
-              name: host-sys
-            - mountPath: /run/mount
-              name: host-mount
-            - mountPath: /run/vitastor-csi
-              name: run-vitastor-csi
-            - mountPath: /lib/modules
-              name: lib-modules
-              readOnly: true
-            - name: vitastor-config
-              mountPath: /etc/vitastor
-            - name: plugin-dir
-              mountPath: /var/lib/kubelet/plugins
-              mountPropagation: "Bidirectional"
-            - name: mountpoint-dir
-              mountPath: /var/lib/kubelet/pods
-              mountPropagation: "Bidirectional"
-        - name: liveness-probe
-          securityContext:
-            privileged: true
-          image: quay.io/k8scsi/livenessprobe:v1.1.0
-          args:
-            - "--csi-address=$(CSI_ENDPOINT)"
-            - "--health-port=9898"
-          env:
-            - name: CSI_ENDPOINT
-              value: unix:///csi/csi.sock
-          volumeMounts:
-          - mountPath: /csi
-            name: socket-dir
-      volumes:
-        - name: socket-dir
-          hostPath:
-            path: /var/lib/kubelet/plugins/csi.vitastor.io
-            type: DirectoryOrCreate
-        - name: plugin-dir
-          hostPath:
-            path: /var/lib/kubelet/plugins
-            type: Directory
-        - name: mountpoint-dir
-          hostPath:
-            path: /var/lib/kubelet/pods
-            type: DirectoryOrCreate
-        - name: registration-dir
-          hostPath:
-            path: /var/lib/kubelet/plugins_registry/
-            type: Directory
-        - name: host-dev
-          hostPath:
-            path: /dev
-        - name: host-sys
-          hostPath:
-            path: /sys
-        - name: host-mount
-          hostPath:
-            path: /run/mount
-        - name: run-vitastor-csi
-          hostPath:
-            path: /run/vitastor-csi
-        - name: lib-modules
-          hostPath:
-            path: /lib/modules
-        - name: vitastor-config
-          configMap:
-            name: vitastor-config
--- a/csi/deploy/005-csi-provisioner-rbac.yaml
+++ b/csi/deploy/005-csi-provisioner-rbac.yaml
@ -1,105 +0,0 @@
---
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-provisioner
-
---
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-external-provisioner-runner
-rules:
-  - apiGroups: [""]
-    resources: ["nodes"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: [""]
-    resources: ["secrets"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: [""]
-    resources: ["events"]
-    verbs: ["list", "watch", "create", "update", "patch"]
-  - apiGroups: [""]
-    resources: ["persistentvolumes"]
-    verbs: ["get", "list", "watch", "create", "update", "delete", "patch"]
-  - apiGroups: [""]
-    resources: ["persistentvolumeclaims"]
-    verbs: ["get", "list", "watch", "update"]
-  - apiGroups: [""]
-    resources: ["persistentvolumeclaims/status"]
-    verbs: ["update", "patch"]
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["storageclasses"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: ["snapshot.storage.k8s.io"]
-    resources: ["volumesnapshots"]
-    verbs: ["get", "list", "patch"]
-  - apiGroups: ["snapshot.storage.k8s.io"]
-    resources: ["volumesnapshots/status"]
-    verbs: ["get", "list", "patch"]
-  - apiGroups: ["snapshot.storage.k8s.io"]
-    resources: ["volumesnapshotcontents"]
-    verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
-  - apiGroups: ["snapshot.storage.k8s.io"]
-    resources: ["volumesnapshotclasses"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["volumeattachments"]
-    verbs: ["get", "list", "watch", "update", "patch"]
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["volumeattachments/status"]
-    verbs: ["patch"]
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["csinodes"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: ["snapshot.storage.k8s.io"]
-    resources: ["volumesnapshotcontents/status"]
-    verbs: ["update", "patch"]
-  - apiGroups: [""]
-    resources: ["configmaps"]
-    verbs: ["get"]
---
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-provisioner-role
-subjects:
-  - kind: ServiceAccount
-    name: vitastor-csi-provisioner
-    namespace: vitastor-system
-roleRef:
-  kind: ClusterRole
-  name: vitastor-external-provisioner-runner
-  apiGroup: rbac.authorization.k8s.io
-
---
-kind: Role
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-external-provisioner-cfg
-rules:
-  - apiGroups: [""]
-    resources: ["configmaps"]
-    verbs: ["get", "list", "watch", "create", "update", "delete"]
-  - apiGroups: ["coordination.k8s.io"]
-    resources: ["leases"]
-    verbs: ["get", "watch", "list", "delete", "update", "create"]
-
---
-kind: RoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: vitastor-csi-provisioner-role-cfg
-  namespace: vitastor-system
-subjects:
-  - kind: ServiceAccount
-    name: vitastor-csi-provisioner
-    namespace: vitastor-system
-roleRef:
-  kind: Role
-  name: vitastor-external-provisioner-cfg
-  apiGroup: rbac.authorization.k8s.io
--- a/csi/deploy/006-csi-provisioner-psp.yaml
+++ b/csi/deploy/006-csi-provisioner-psp.yaml
@ -1,60 +0,0 @@
---
-apiVersion: policy/v1beta1
-kind: PodSecurityPolicy
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-provisioner-psp
-spec:
-  allowPrivilegeEscalation: true
-  allowedCapabilities:
-    - 'SYS_ADMIN'
-  fsGroup:
-    rule: RunAsAny
-  privileged: true
-  runAsUser:
-    rule: RunAsAny
-  seLinux:
-    rule: RunAsAny
-  supplementalGroups:
-    rule: RunAsAny
-  volumes:
-    - 'configMap'
-    - 'emptyDir'
-    - 'projected'
-    - 'secret'
-    - 'downwardAPI'
-    - 'hostPath'
-  allowedHostPaths:
-    - pathPrefix: '/dev'
-      readOnly: false
-    - pathPrefix: '/sys'
-      readOnly: false
-    - pathPrefix: '/lib/modules'
-      readOnly: true
-
---
-kind: Role
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  namespace: vitastor-system
-  name: vitastor-csi-provisioner-psp
-rules:
-  - apiGroups: ['policy']
-    resources: ['podsecuritypolicies']
-    verbs: ['use']
-    resourceNames: ['vitastor-csi-provisioner-psp']
-
---
-kind: RoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: vitastor-csi-provisioner-psp
-  namespace: vitastor-system
-subjects:
-  - kind: ServiceAccount
-    name: vitastor-csi-provisioner
-    namespace: vitastor-system
-roleRef:
-  kind: Role
-  name: vitastor-csi-provisioner-psp
-  apiGroup: rbac.authorization.k8s.io
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@ -1,164 +0,0 @@
---
-kind: Service
-apiVersion: v1
-metadata:
-  namespace: vitastor-system
-  name: csi-vitastor-provisioner
-  labels:
-    app: csi-metrics
-spec:
-  selector:
-    app: csi-vitastor-provisioner
-  ports:
-    - name: http-metrics
-      port: 8080
-      protocol: TCP
-      targetPort: 8680
-
---
-kind: Deployment
-apiVersion: apps/v1
-metadata:
-  namespace: vitastor-system
-  name: csi-vitastor-provisioner
-spec:
-  replicas: 3
-  strategy:
-    type: RollingUpdate
-    rollingUpdate:
-      maxUnavailable: 1
-      maxSurge: 0
-  selector:
-    matchLabels:
-      app: csi-vitastor-provisioner
-  template:
-    metadata:
-      namespace: vitastor-system
-      labels:
-        app: csi-vitastor-provisioner
-    spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            - labelSelector:
-                matchExpressions:
-                  - key: app
-                    operator: In
-                    values:
-                      - csi-vitastor-provisioner
-              topologyKey: "kubernetes.io/hostname"
-      serviceAccountName: vitastor-csi-provisioner
-      priorityClassName: system-cluster-critical
-      containers:
-        - name: csi-provisioner
-          image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
-          args:
-            - "--csi-address=$(ADDRESS)"
-            - "--v=5"
-            - "--timeout=150s"
-            - "--retry-interval-start=500ms"
-            - "--leader-election=true"
-            #  set it to true to use topology based provisioning
-            - "--feature-gates=Topology=false"
-            # if fstype is not specified in storageclass, ext4 is default
-            - "--default-fstype=ext4"
-            - "--extra-create-metadata=true"
-          env:
-            - name: ADDRESS
-              value: unix:///csi/csi-provisioner.sock
-          imagePullPolicy: "IfNotPresent"
-          volumeMounts:
-            - name: socket-dir
-              mountPath: /csi
-        - name: csi-snapshotter
-          image: k8s.gcr.io/sig-storage/csi-snapshotter:v4.0.0
-          args:
-            - "--csi-address=$(ADDRESS)"
-            - "--v=5"
-            - "--timeout=150s"
-            - "--leader-election=true"
-          env:
-            - name: ADDRESS
-              value: unix:///csi/csi-provisioner.sock
-          imagePullPolicy: "IfNotPresent"
-          securityContext:
-            privileged: true
-          volumeMounts:
-            - name: socket-dir
-              mountPath: /csi
-        - name: csi-attacher
-          image: k8s.gcr.io/sig-storage/csi-attacher:v3.1.0
-          args:
-            - "--v=5"
-            - "--csi-address=$(ADDRESS)"
-            - "--leader-election=true"
-            - "--retry-interval-start=500ms"
-          env:
-            - name: ADDRESS
-              value: /csi/csi-provisioner.sock
-          imagePullPolicy: "IfNotPresent"
-          volumeMounts:
-            - name: socket-dir
-              mountPath: /csi
-        - name: csi-resizer
-          image: k8s.gcr.io/sig-storage/csi-resizer:v1.1.0
-          args:
-            - "--csi-address=$(ADDRESS)"
-            - "--v=5"
-            - "--timeout=150s"
-            - "--leader-election"
-            - "--retry-interval-start=500ms"
-            - "--handle-volume-inuse-error=false"
-          env:
-            - name: ADDRESS
-              value: unix:///csi/csi-provisioner.sock
-          imagePullPolicy: "IfNotPresent"
-          volumeMounts:
-            - name: socket-dir
-              mountPath: /csi
-        - name: csi-vitastor
-          securityContext:
-            privileged: true
-            capabilities:
-              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.9.3
-          args:
-            - "--node=$(NODE_ID)"
-            - "--endpoint=$(CSI_ENDPOINT)"
-          env:
-            - name: NODE_ID
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: CSI_ENDPOINT
-              value: unix:///csi/csi-provisioner.sock
-          imagePullPolicy: "IfNotPresent"
-          volumeMounts:
-            - name: socket-dir
-              mountPath: /csi
-            - mountPath: /dev
-              name: host-dev
-            - mountPath: /sys
-              name: host-sys
-            - mountPath: /lib/modules
-              name: lib-modules
-              readOnly: true
-            - name: vitastor-config
-              mountPath: /etc/vitastor
-      volumes:
-        - name: host-dev
-          hostPath:
-            path: /dev
-        - name: host-sys
-          hostPath:
-            path: /sys
-        - name: lib-modules
-          hostPath:
-            path: /lib/modules
-        - name: socket-dir
-          emptyDir: {
-            medium: "Memory"
-          }
-        - name: vitastor-config
-          configMap:
-            name: vitastor-config
--- a/csi/deploy/008-csi-driver.yaml
+++ b/csi/deploy/008-csi-driver.yaml
@ -1,11 +0,0 @@
---
-# if Kubernetes version is less than 1.18 change
-# apiVersion to storage.k8s.io/v1betav1
-apiVersion: storage.k8s.io/v1
-kind: CSIDriver
-metadata:
-  namespace: vitastor-system
-  name: csi.vitastor.io
-spec:
-  attachRequired: true
-  podInfoOnMount: false
--- a/csi/deploy/009-storage-class.yaml
+++ b/csi/deploy/009-storage-class.yaml
@ -1,17 +0,0 @@
---
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  namespace: vitastor-system
-  name: vitastor
-  annotations:
-    storageclass.kubernetes.io/is-default-class: "true"
-provisioner: csi.vitastor.io
-volumeBindingMode: Immediate
-parameters:
-  etcdVolumePrefix: ""
-  poolId: "1"
-  # you can choose other configuration file if you have it in the config map
-  # different etcd URLs and prefixes should also be put in the config
-  #configPath: "/etc/vitastor/vitastor.conf"
-allowVolumeExpansion: true
--- a/csi/deploy/example-pvc-block.yaml
+++ b/csi/deploy/example-pvc-block.yaml
@ -1,13 +0,0 @@
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: test-vitastor-pvc-block
-spec:
-  storageClassName: vitastor
-  volumeMode: Block
-  accessModes:
-    - ReadWriteMany
-  resources:
-    requests:
-      storage: 10Gi
--- a/csi/deploy/example-pvc.yaml
+++ b/csi/deploy/example-pvc.yaml
@ -1,12 +0,0 @@
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: test-vitastor-pvc
-spec:
-  storageClassName: vitastor
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 10Gi
--- a/csi/deploy/example-snapshot-class.yaml
+++ b/csi/deploy/example-snapshot-class.yaml
@ -1,7 +0,0 @@
-apiVersion: snapshot.storage.k8s.io/v1
-kind: VolumeSnapshotClass
-metadata:
-  name: vitastor-snapclass
-driver: csi.vitastor.io
-deletionPolicy: Delete
-parameters:
--- a/csi/deploy/example-snapshot-clone.yaml
+++ b/csi/deploy/example-snapshot-clone.yaml
@ -1,16 +0,0 @@
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: test-vitastor-clone
-spec:
-  storageClassName: vitastor
-  dataSource:
-    name: snap1
-    kind: VolumeSnapshot
-    apiGroup: snapshot.storage.k8s.io
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 10Gi
--- a/csi/deploy/example-snapshot.yaml
+++ b/csi/deploy/example-snapshot.yaml
@ -1,8 +0,0 @@
-apiVersion: snapshot.storage.k8s.io/v1
-kind: VolumeSnapshot
-metadata:
-  name: snap1
-spec:
-  volumeSnapshotClassName: vitastor-snapclass
-  source:
-    persistentVolumeClaimName: test-vitastor-pvc
--- a/csi/deploy/example-test-pod-block.yaml
+++ b/csi/deploy/example-test-pod-block.yaml
@ -1,17 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
-  name: vitastor-test-block-pvc
-  namespace: default
-spec:
-  containers:
-  - name: vitastor-test-block-pvc
-    image: nginx
-    volumeDevices:
-      - name: data
-        devicePath: /dev/xvda
-  volumes:
-  - name: data
-    persistentVolumeClaim:
-      claimName: test-vitastor-pvc-block
-      readOnly: false
--- a/csi/deploy/example-test-pod.yaml
+++ b/csi/deploy/example-test-pod.yaml
@ -1,17 +0,0 @@
-apiVersion: v1
-kind: Pod
-metadata:
-  name: vitastor-test-nginx
-  namespace: default
-spec:
-  containers:
-   - name: vitastor-test-nginx
-     image: nginx
-     volumeMounts:
-       - mountPath: /usr/share/nginx/html/s3
-         name: data
-  volumes:
-   - name: data
-     persistentVolumeClaim:
-       claimName: test-vitastor-pvc
-       readOnly: false
--- a/csi/go.mod
+++ b/csi/go.mod
@ -1,21 +0,0 @@
-module vitastor.io/csi
-
-go 1.15
-
-require (
-	github.com/container-storage-interface/spec v1.8.0
-	github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
-	github.com/kubernetes-csi/csi-lib-utils v0.9.1
-	golang.org/x/net v0.7.0
-	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
-	google.golang.org/grpc v1.33.1
-	google.golang.org/protobuf v1.24.0
-	k8s.io/klog v1.0.0
-	k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
-)
-
-replace github.com/coreos/bbolt => go.etcd.io/bbolt v1.3.5
-
-replace go.etcd.io/bbolt => github.com/coreos/bbolt v1.3.5
-
-replace google.golang.org/grpc => google.golang.org/grpc v1.25.1
--- a/csi/go.sum
+++ b/csi/go.sum
@ -1,383 +0,0 @@
-cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
-cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
-cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU=
-cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY=
-cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc=
-cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0=
-cloud.google.com/go v0.51.0/go.mod h1:hWtGJ6gnXH+KgDv+V0zFGDvpi07n3z8ZNj3T1RW0Gcw=
-cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
-cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
-cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I=
-cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
-dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
-github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8=
-github.com/Azure/go-autorest/autorest v0.9.0/go.mod h1:xyHB1BMZT0cuDHU7I0+g046+BFDTQ8rEZB0s4Yfa6bI=
-github.com/Azure/go-autorest/autorest v0.9.6/go.mod h1:/FALq9T/kS7b5J5qsQ+RSTUdAmGFqi0vUdVNNx8q630=
-github.com/Azure/go-autorest/autorest/adal v0.5.0/go.mod h1:8Z9fGy2MpX0PvDjB1pEgQTmVqjGhiHBW7RJJEciWzS0=
-github.com/Azure/go-autorest/autorest/adal v0.8.2/go.mod h1:ZjhuQClTqx435SRJ2iMlOxPYt3d2C/T/7TiQCVZSn3Q=
-github.com/Azure/go-autorest/autorest/date v0.1.0/go.mod h1:plvfp3oPSKwf2DNjlBjWF/7vwR+cUD/ELuzDCXwHUVA=
-github.com/Azure/go-autorest/autorest/date v0.2.0/go.mod h1:vcORJHLJEh643/Ioh9+vPmf1Ij9AEBM5FuBIXLmIy0g=
-github.com/Azure/go-autorest/autorest/mocks v0.1.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0=
-github.com/Azure/go-autorest/autorest/mocks v0.2.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0=
-github.com/Azure/go-autorest/autorest/mocks v0.3.0/go.mod h1:a8FDP3DYzQ4RYfVAxAN3SVSiiO77gL2j2ronKKP0syM=
-github.com/Azure/go-autorest/logger v0.1.0/go.mod h1:oExouG+K6PryycPJfVSxi/koC6LSNgds39diKLz7Vrc=
-github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbtp2fGCgRFtBroKn4Dk=
-github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
-github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
-github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
-github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
-github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
-github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
-github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
-github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
-github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
-github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
-github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
-github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
-github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
-github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
-github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
-github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
-github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
-github.com/container-storage-interface/spec v1.2.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
-github.com/container-storage-interface/spec v1.8.0 h1:D0vhF3PLIZwlwZEf2eNbpujGCNwspwTYf2idJRJx4xI=
-github.com/container-storage-interface/spec v1.8.0/go.mod h1:ROLik+GhPslwwWRNFF1KasPzroNARibH2rfz1rkg4H0=
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
-github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM=
-github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
-github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc=
-github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs=
-github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
-github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
-github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
-github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
-github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
-github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
-github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
-github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
-github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
-github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
-github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk=
-github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas=
-github.com/go-logr/logr v0.2.0 h1:QvGt2nLcHH0WK9orKa+ppBPAxREcH364nPUedEpK0TY=
-github.com/go-logr/logr v0.2.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU=
-github.com/go-openapi/jsonpointer v0.0.0-20160704185906-46af16f9f7b1/go.mod h1:+35s3my2LFTysnkMfxsJBAMHj/DoqoB9knIWoYG/Vk0=
-github.com/go-openapi/jsonreference v0.0.0-20160704190145-13c6e3589ad9/go.mod h1:W3Z9FmVs9qj+KR4zFKmDPGiLdk1D9Rlm7cyMvf57TTg=
-github.com/go-openapi/spec v0.0.0-20160808142527-6aced65f8501/go.mod h1:J8+jY1nAiCcj+friV/PDoE1/3eeccG9LYBs0tYvLOWc=
-github.com/go-openapi/swag v0.0.0-20160704191624-1d0bd113de87/go.mod h1:DXUve3Dpr1UfpPtxFw+EFuQ41HhCWZfha5jSVRG7C7I=
-github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
-github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
-github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
-github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=
-github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
-github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
-github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
-github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
-github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
-github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
-github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
-github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
-github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
-github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
-github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
-github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
-github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
-github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
-github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
-github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
-github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
-github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
-github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
-github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
-github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
-github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
-github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
-github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
-github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
-github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
-github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
-github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
-github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
-github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
-github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
-github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
-github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
-github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg=
-github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
-github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
-github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
-github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
-github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
-github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
-github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
-github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
-github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
-github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
-github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
-github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
-github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
-github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
-github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
-github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
-github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
-github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=
-github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
-github.com/kubernetes-csi/csi-lib-utils v0.9.1 h1:sGq6ifVujfMSkfTsMZip44Ttv8SDXvsBlFk9GdYl/b8=
-github.com/kubernetes-csi/csi-lib-utils v0.9.1/go.mod h1:8E2jVUX9j3QgspwHXa6LwyN7IHQDjW9jX3kwoWnSC+M=
-github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
-github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
-github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
-github.com/moby/term v0.0.0-20200312100748-672ec06f55cd/go.mod h1:DdlQx2hp0Ss5/fLikoLlEeIYiATotOjgB//nb973jeo=
-github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
-github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
-github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
-github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
-github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
-github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
-github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
-github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
-github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
-github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
-github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
-github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
-github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
-github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
-github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
-github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
-github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
-github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
-github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
-github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
-github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
-github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo=
-github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
-github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
-github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
-github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
-github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
-github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
-github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
-github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk=
-github.com/spf13/pflag v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
-github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
-github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
-github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
-github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
-github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
-github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
-go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
-go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
-go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
-go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
-go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
-go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
-golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
-golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
-golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
-golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
-golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
-golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
-golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
-golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
-golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek=
-golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4=
-golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
-golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
-golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
-golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
-golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
-golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
-golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
-golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs=
-golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE=
-golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o=
-golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc=
-golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
-golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
-golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
-golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
-golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
-golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
-golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
-golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
-golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
-golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
-golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
-golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
-golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
-golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
-golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
-golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
-golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU=
-golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
-golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
-golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo=
-golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
-golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-golang.org/x/tools v0.0.0-20181011042414-1f849cf54d09/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
-golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
-golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
-golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
-golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
-golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
-golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
-golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
-golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
-golang.org/x/tools v0.0.0-20190624222133-a101b041ded4/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
-golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
-golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
-golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
-golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
-golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
-google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
-google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
-google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
-google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI=
-google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
-google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
-google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0=
-google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
-google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
-google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
-google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
-google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
-google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
-google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
-google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8=
-google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc=
-google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013 h1:+kGHl1aib/qcwaRi1CbqBZ1rk19r85MNUf8HaBghugY=
-google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
-google.golang.org/grpc v1.25.1 h1:wdKvqQk7IttEw92GoRyKG2IDrUIpgpj6H6m81yfeMW0=
-google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
-google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
-google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
-google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
-google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
-google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
-google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
-google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
-google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
-google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
-google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
-gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
-gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
-gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
-gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
-gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
-gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
-gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
-gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
-gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
-gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
-gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
-gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw=
-gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk=
-honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
-honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
-honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
-honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
-honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
-k8s.io/api v0.19.0/go.mod h1:I1K45XlvTrDjmj5LoM5LuP/KYrhWbjUKT/SoPG0qTjw=
-k8s.io/apimachinery v0.19.0/go.mod h1:DnPGDnARWFvYa3pMHgSxtbZb7gpzzAZ1pTfaUNDVlmA=
-k8s.io/client-go v0.19.0/go.mod h1:H9E/VT95blcFQnlyShFgnFT9ZnJOAceiUHM3MlRC+mU=
-k8s.io/component-base v0.19.0/go.mod h1:dKsY8BxkA+9dZIAh2aWJLL/UdASFDNtGYTCItL4LM7Y=
-k8s.io/gengo v0.0.0-20200413195148-3a45101e95ac/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0=
-k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
-k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
-k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE=
-k8s.io/klog/v2 v2.2.0 h1:XRvcwJozkgZ1UQJmfMGpvRthQHOvihEhYtDfAaxMz/A=
-k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
-k8s.io/kube-openapi v0.0.0-20200805222855-6aeccd4b50c6/go.mod h1:UuqjUnNftUyPE5H64/qeyjQoUZhGpeFDVdxjTeEVN2o=
-k8s.io/utils v0.0.0-20200729134348-d5654de09c73/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA=
-k8s.io/utils v0.0.0-20210305010621-2afb4311ab10 h1:u5rPykqiCpL+LBfjRkXvnK71gOgIdmq3eHUEkPrbeTI=
-k8s.io/utils v0.0.0-20210305010621-2afb4311ab10/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA=
-rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
-sigs.k8s.io/structured-merge-diff/v4 v4.0.1/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw=
-sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o=
-sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc=
--- a/csi/src/config.go
+++ b/csi/src/config.go
@ -1,22 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package vitastor
-
-const (
-    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.9.3"
-)
-
-// Config struct fills the parameters of request or user input
-type Config struct
-{
-    Endpoint string
-    NodeID   string
-}
-
-// NewConfig returns config struct to initialize new driver
-func NewConfig() *Config
-{
-    return &Config{}
-}
--- a/csi/src/controllerserver.go
+++ b/csi/src/controllerserver.go
@ -1,620 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package vitastor
-
-import (
-    "context"
-    "encoding/json"
-    "fmt"
-    "strings"
-    "strconv"
-    "time"
-    "os"
-    "io/ioutil"
-
-    "github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
-    "k8s.io/klog"
-
-    "google.golang.org/grpc/codes"
-    "google.golang.org/grpc/status"
-    "google.golang.org/protobuf/types/known/timestamppb"
-
-    "github.com/container-storage-interface/spec/lib/go/csi"
-)
-
-const (
-    KB int64 = 1024
-    MB int64 = 1024 * KB
-    GB int64 = 1024 * MB
-    TB int64 = 1024 * GB
-    ETCD_TIMEOUT time.Duration = 15*time.Second
-)
-
-type InodeIndex struct
-{
-    Id uint64 `json:"id"`
-    PoolId uint64 `json:"pool_id"`
-}
-
-type InodeConfig struct
-{
-    Name string `json:"name"`
-    Size uint64 `json:"size,omitempty"`
-    ParentPool uint64 `json:"parent_pool,omitempty"`
-    ParentId uint64 `json:"parent_id,omitempty"`
-    Readonly bool `json:"readonly,omitempty"`
-    CreateTs uint64 `json:"create_ts,omitempty"`
-}
-
-type ControllerServer struct
-{
-    *Driver
-}
-
-// NewControllerServer create new instance controller
-func NewControllerServer(driver *Driver) *ControllerServer
-{
-    return &ControllerServer{
-        Driver: driver,
-    }
-}
-
-func GetConnectionParams(params map[string]string) (map[string]string, error)
-{
-    ctxVars := make(map[string]string)
-    configPath := params["configPath"]
-    if (configPath == "")
-    {
-        configPath = "/etc/vitastor/vitastor.conf"
-    }
-    else
-    {
-        ctxVars["configPath"] = configPath
-    }
-    config := make(map[string]interface{})
-    configFD, err := os.Open(configPath)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    defer configFD.Close()
-    data, _ := ioutil.ReadAll(configFD)
-    json.Unmarshal(data, &config)
-    // Check etcd URL in the config, but do not use the explicit etcdUrl
-    // parameter for CLI calls, otherwise users won't be able to later
-    // change them - storage class parameters are saved in volume IDs
-    var etcdUrl []string
-    switch config["etcd_address"].(type)
-    {
-    case string:
-        url := strings.TrimSpace(config["etcd_address"].(string))
-        if (url != "")
-        {
-            etcdUrl = strings.Split(url, ",")
-        }
-    case []string:
-        etcdUrl = config["etcd_address"].([]string)
-    case []interface{}:
-        for _, url := range config["etcd_address"].([]interface{})
-        {
-            s, ok := url.(string)
-            if (ok)
-            {
-                etcdUrl = append(etcdUrl, s)
-            }
-        }
-    }
-    if (len(etcdUrl) == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "etcd_address is missing in "+configPath)
-    }
-    return ctxVars, nil
-}
-
-func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
-{
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    stdout, _, err := system("/usr/bin/vitastor-cli", args...)
-    return stdout, err
-}
-
-// Create the volume
-func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
-{
-    klog.Infof("received controller create volume request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.GetName() == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "name is a required field")
-    }
-    volumeCapabilities := req.GetVolumeCapabilities()
-    if (volumeCapabilities == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "volume capabilities is a required field")
-    }
-
-    err := cs.checkCaps(volumeCapabilities)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    etcdVolumePrefix := req.Parameters["etcdVolumePrefix"]
-    poolId, _ := strconv.ParseUint(req.Parameters["poolId"], 10, 64)
-    if (poolId == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "poolId is missing in storage class configuration")
-    }
-
-    volName := etcdVolumePrefix + req.GetName()
-    volSize := 1 * GB
-    if capRange := req.GetCapacityRange(); capRange != nil
-    {
-        volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
-    }
-
-    ctxVars, err := GetConnectionParams(req.Parameters)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
-
-    // Support creation from snapshot
-    var src *csi.VolumeContentSource
-    if (req.VolumeContentSource.GetSnapshot() != nil)
-    {
-        snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
-        if (snapId != "")
-        {
-            snapVars := make(map[string]string)
-            err := json.Unmarshal([]byte(snapId), &snapVars)
-            if (err != nil)
-            {
-                return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-            }
-            args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
-            src = &csi.VolumeContentSource{
-                Type: &csi.VolumeContentSource_Snapshot{
-                    Snapshot: &csi.VolumeContentSource_SnapshotSource{
-                        SnapshotId: snapId,
-                    },
-                },
-            }
-        }
-    }
-
-    // Create image using vitastor-cli
-    _, err = invokeCLI(ctxVars, args)
-    if (err != nil)
-    {
-        if (strings.Index(err.Error(), "already exists") > 0)
-        {
-            inodeCfg, err := invokeList(ctxVars, volName, true)
-            if (err != nil)
-            {
-                return nil, err
-            }
-            if (inodeCfg[0].Size < uint64(volSize))
-            {
-                return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
-            }
-        }
-        else
-        {
-            return nil, err
-        }
-    }
-
-    ctxVars["name"] = volName
-    volumeIdJson, _ := json.Marshal(ctxVars)
-    return &csi.CreateVolumeResponse{
-        Volume: &csi.Volume{
-            // Ugly, but VolumeContext isn't passed to DeleteVolume :-(
-            VolumeId: string(volumeIdJson),
-            CapacityBytes: volSize,
-            ContentSource: src,
-        },
-    }, nil
-}
-
-// DeleteVolume deletes the given volume
-func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (*csi.DeleteVolumeResponse, error)
-{
-    klog.Infof("received controller delete volume request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := volVars["name"]
-
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    _, err = invokeCLI(ctxVars, []string{ "rm", volName })
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    return &csi.DeleteVolumeResponse{}, nil
-}
-
-// ControllerPublishVolume return Unimplemented error
-func (cs *ControllerServer) ControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (*csi.ControllerPublishVolumeResponse, error)
-{
-    return nil, status.Error(codes.Unimplemented, "")
-}
-
-// ControllerUnpublishVolume return Unimplemented error
-func (cs *ControllerServer) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (*csi.ControllerUnpublishVolumeResponse, error)
-{
-    return nil, status.Error(codes.Unimplemented, "")
-}
-
-// ValidateVolumeCapabilities checks whether the volume capabilities requested are supported.
-func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req *csi.ValidateVolumeCapabilitiesRequest) (*csi.ValidateVolumeCapabilitiesResponse, error)
-{
-    klog.Infof("received controller validate volume capability request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request is nil")
-    }
-    volumeID := req.GetVolumeId()
-    if (volumeID == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "volumeId is nil")
-    }
-    volumeCapabilities := req.GetVolumeCapabilities()
-    if (volumeCapabilities == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "volumeCapabilities is nil")
-    }
-
-    err := cs.checkCaps(volumeCapabilities)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    return &csi.ValidateVolumeCapabilitiesResponse{
-        Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
-            VolumeCapabilities: req.VolumeCapabilities,
-        },
-    }, nil
-}
-
-func (cs *ControllerServer) checkCaps(volumeCapabilities []*csi.VolumeCapability) error
-{
-    var volumeCapabilityAccessModes []*csi.VolumeCapability_AccessMode
-    for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
-        csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
-        csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY,
-        csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY,
-        csi.VolumeCapability_AccessMode_SINGLE_NODE_SINGLE_WRITER,
-        csi.VolumeCapability_AccessMode_SINGLE_NODE_MULTI_WRITER,
-    } {
-        volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
-    }
-    for _, capability := range volumeCapabilities
-    {
-        if (capability.GetBlock() != nil)
-        {
-            for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
-                csi.VolumeCapability_AccessMode_MULTI_NODE_SINGLE_WRITER,
-                csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
-            } {
-                volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
-            }
-            break
-        }
-    }
-
-    capabilitySupport := false
-    for _, capability := range volumeCapabilities
-    {
-        for _, volumeCapabilityAccessMode := range volumeCapabilityAccessModes
-        {
-            if (volumeCapabilityAccessMode.Mode == capability.AccessMode.Mode)
-            {
-                capabilitySupport = true
-            }
-        }
-    }
-
-    if (!capabilitySupport)
-    {
-        return status.Errorf(codes.NotFound, "%v not supported", volumeCapabilities)
-    }
-
-    return nil
-}
-
-// ListVolumes returns a list of volumes
-func (cs *ControllerServer) ListVolumes(ctx context.Context, req *csi.ListVolumesRequest) (*csi.ListVolumesResponse, error)
-{
-    return nil, status.Error(codes.Unimplemented, "")
-}
-
-// GetCapacity returns the capacity of the storage pool
-func (cs *ControllerServer) GetCapacity(ctx context.Context, req *csi.GetCapacityRequest) (*csi.GetCapacityResponse, error)
-{
-    return nil, status.Error(codes.Unimplemented, "")
-}
-
-// ControllerGetCapabilities returns the capabilities of the controller service.
-func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *csi.ControllerGetCapabilitiesRequest) (*csi.ControllerGetCapabilitiesResponse, error)
-{
-    functionControllerServerCapabilities := func(cap csi.ControllerServiceCapability_RPC_Type) *csi.ControllerServiceCapability
-    {
-        return &csi.ControllerServiceCapability{
-            Type: &csi.ControllerServiceCapability_Rpc{
-                Rpc: &csi.ControllerServiceCapability_RPC{
-                    Type: cap,
-                },
-            },
-        }
-    }
-
-    var controllerServerCapabilities []*csi.ControllerServiceCapability
-    for _, capability := range []csi.ControllerServiceCapability_RPC_Type{
-        csi.ControllerServiceCapability_RPC_CREATE_DELETE_VOLUME,
-        csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
-        csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
-        csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
-        csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
-        // TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
-    } {
-        controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
-    }
-
-    return &csi.ControllerGetCapabilitiesResponse{
-        Capabilities: controllerServerCapabilities,
-    }, nil
-}
-
-func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
-{
-    stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
-    if (err != nil)
-    {
-        return nil, err
-    }
-    var inodeCfg []InodeConfig
-    err = json.Unmarshal(stat, &inodeCfg)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
-    }
-    if (expectExist && len(inodeCfg) == 0)
-    {
-        return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
-    }
-    return inodeCfg, nil
-}
-
-// CreateSnapshot create snapshot of an existing PV
-func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
-{
-    klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.SourceVolumeId == "" || req.Name == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
-    }
-
-    // snapshot name
-    snapName := req.Name
-
-    // req.VolumeId is an ugly json string in our case :)
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := ctxVars["name"]
-
-    // Create image using vitastor-cli
-    _, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
-    if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
-    {
-        return nil, err
-    }
-
-    // Check created snapshot
-    inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    // Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
-    ctxVars["snapshot"] = snapName
-    snapIdJson, _ := json.Marshal(ctxVars)
-    return &csi.CreateSnapshotResponse{
-        Snapshot: &csi.Snapshot{
-            SizeBytes: int64(inodeCfg[0].Size),
-            SnapshotId: string(snapIdJson),
-            SourceVolumeId: req.SourceVolumeId,
-            CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
-            ReadyToUse: true,
-        },
-    }, nil
-}
-
-// DeleteSnapshot delete provided snapshot of a PV
-func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
-{
-    klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.SnapshotId == "")
-    {
-        return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
-    }
-    volName := volVars["name"]
-    snapName := volVars["snapshot"]
-
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    _, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    return &csi.DeleteSnapshotResponse{}, nil
-}
-
-// ListSnapshots list the snapshots of a PV
-func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
-{
-    klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := volVars["name"]
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    resp := &csi.ListSnapshotsResponse{}
-    for _, ino := range inodeCfg
-    {
-        snapName := ino.Name[len(volName)+1:]
-        if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
-        {
-        }
-        else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
-        {
-            volVars["snapshot"] = snapName
-            snapIdJson, _ := json.Marshal(volVars)
-            resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
-                Snapshot: &csi.Snapshot{
-                    SizeBytes: int64(ino.Size),
-                    SnapshotId: string(snapIdJson),
-                    SourceVolumeId: req.SourceVolumeId,
-                    CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
-                    ReadyToUse: true,
-                },
-            })
-        }
-        else
-        {
-            resp.NextToken = snapName
-            break
-        }
-    }
-
-    return resp, nil
-}
-
-// ControllerExpandVolume increases the size of a volume
-func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
-{
-    klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
-    if (req == nil)
-    {
-        return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
-    }
-    if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
-    {
-        return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
-    }
-
-    volVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &volVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := volVars["name"]
-    ctxVars, err := GetConnectionParams(volVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    inodeCfg, err := invokeList(ctxVars, volName, true)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
-    {
-        sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
-        _, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
-        if (err != nil)
-        {
-            return nil, err
-        }
-        inodeCfg, err = invokeList(ctxVars, volName, true)
-        if (err != nil)
-        {
-            return nil, err
-        }
-    }
-
-    return &csi.ControllerExpandVolumeResponse{
-        CapacityBytes: int64(inodeCfg[0].Size),
-        NodeExpansionRequired: false,
-    }, nil
-}
-
-// ControllerGetVolume get volume info
-func (cs *ControllerServer) ControllerGetVolume(ctx context.Context, req *csi.ControllerGetVolumeRequest) (*csi.ControllerGetVolumeResponse, error)
-{
-    return nil, status.Error(codes.Unimplemented, "")
-}
--- a/csi/src/grpc.go
+++ b/csi/src/grpc.go
@ -1,137 +0,0 @@
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package vitastor
-
-import (
-    "fmt"
-    "net"
-    "os"
-    "strings"
-    "sync"
-
-    "github.com/golang/glog"
-    "golang.org/x/net/context"
-    "google.golang.org/grpc"
-
-    "github.com/container-storage-interface/spec/lib/go/csi"
-    "github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
-)
-
-// Defines Non blocking GRPC server interfaces
-type NonBlockingGRPCServer interface {
-    // Start services at the endpoint
-    Start(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer)
-    // Waits for the service to stop
-    Wait()
-    // Stops the service gracefully
-    Stop()
-    // Stops the service forcefully
-    ForceStop()
-}
-
-func NewNonBlockingGRPCServer() NonBlockingGRPCServer {
-    return &nonBlockingGRPCServer{}
-}
-
-// NonBlocking server
-type nonBlockingGRPCServer struct {
-    wg     sync.WaitGroup
-    server *grpc.Server
-}
-
-func (s *nonBlockingGRPCServer) Start(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer) {
-
-    s.wg.Add(1)
-
-    go s.serve(endpoint, ids, cs, ns)
-
-    return
-}
-
-func (s *nonBlockingGRPCServer) Wait() {
-    s.wg.Wait()
-}
-
-func (s *nonBlockingGRPCServer) Stop() {
-    s.server.GracefulStop()
-}
-
-func (s *nonBlockingGRPCServer) ForceStop() {
-    s.server.Stop()
-}
-
-func (s *nonBlockingGRPCServer) serve(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer) {
-
-    proto, addr, err := ParseEndpoint(endpoint)
-    if err != nil {
-        glog.Fatal(err.Error())
-    }
-
-    if proto == "unix" {
-        addr = "/" + addr
-        if err := os.Remove(addr); err != nil && !os.IsNotExist(err) {
-            glog.Fatalf("Failed to remove %s, error: %s", addr, err.Error())
-        }
-    }
-
-    listener, err := net.Listen(proto, addr)
-    if err != nil {
-        glog.Fatalf("Failed to listen: %v", err)
-    }
-
-    opts := []grpc.ServerOption{
-        grpc.UnaryInterceptor(logGRPC),
-    }
-    server := grpc.NewServer(opts...)
-    s.server = server
-
-    if ids != nil {
-        csi.RegisterIdentityServer(server, ids)
-    }
-    if cs != nil {
-        csi.RegisterControllerServer(server, cs)
-    }
-    if ns != nil {
-        csi.RegisterNodeServer(server, ns)
-    }
-
-    glog.Infof("Listening for connections on address: %#v", listener.Addr())
-
-    server.Serve(listener)
-}
-
-func ParseEndpoint(ep string) (string, string, error) {
-    if strings.HasPrefix(strings.ToLower(ep), "unix://") || strings.HasPrefix(strings.ToLower(ep), "tcp://") {
-        s := strings.SplitN(ep, "://", 2)
-        if s[1] != "" {
-            return s[0], s[1], nil
-        }
-    }
-    return "", "", fmt.Errorf("Invalid endpoint: %v", ep)
-}
-
-func logGRPC(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
-    glog.V(3).Infof("GRPC call: %s", info.FullMethod)
-    glog.V(5).Infof("GRPC request: %s", protosanitizer.StripSecrets(req))
-    resp, err := handler(ctx, req)
-    if err != nil {
-        glog.Errorf("GRPC error: %v", err)
-    } else {
-        glog.V(5).Infof("GRPC response: %s", protosanitizer.StripSecrets(resp))
-    }
-    return resp, err
-}
--- a/csi/src/identityserver.go
+++ b/csi/src/identityserver.go
@ -1,67 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package vitastor
-
-import (
-    "context"
-
-    "github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
-    "k8s.io/klog"
-
-    "github.com/container-storage-interface/spec/lib/go/csi"
-)
-
-// IdentityServer struct of Vitastor CSI driver with supported methods of CSI identity server spec.
-type IdentityServer struct
-{
-    *Driver
-}
-
-// NewIdentityServer create new instance identity
-func NewIdentityServer(driver *Driver) *IdentityServer
-{
-    return &IdentityServer{
-        Driver: driver,
-    }
-}
-
-// GetPluginInfo returns metadata of the plugin
-func (is *IdentityServer) GetPluginInfo(ctx context.Context, req *csi.GetPluginInfoRequest) (*csi.GetPluginInfoResponse, error)
-{
-    klog.Infof("received identity plugin info request %+v", protosanitizer.StripSecrets(req))
-    return &csi.GetPluginInfoResponse{
-        Name:          vitastorCSIDriverName,
-        VendorVersion: vitastorCSIDriverVersion,
-    }, nil
-}
-
-// GetPluginCapabilities returns available capabilities of the plugin
-func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.GetPluginCapabilitiesRequest) (*csi.GetPluginCapabilitiesResponse, error)
-{
-    klog.Infof("received identity plugin capabilities request %+v", protosanitizer.StripSecrets(req))
-    return &csi.GetPluginCapabilitiesResponse{
-        Capabilities: []*csi.PluginCapability{
-            {
-                Type: &csi.PluginCapability_Service_{
-                    Service: &csi.PluginCapability_Service{
-                        Type: csi.PluginCapability_Service_CONTROLLER_SERVICE,
-                    },
-                },
-            },
-            {
-                Type: &csi.PluginCapability_VolumeExpansion_{
-                    VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
-                        Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
-                    },
-                },
-            },
-        },
-    }, nil
-}
-
-// Probe returns the health and readiness of the plugin
-func (is *IdentityServer) Probe(ctx context.Context, req *csi.ProbeRequest) (*csi.ProbeResponse, error)
-{
-    return &csi.ProbeResponse{}, nil
-}
--- a/csi/src/nodeserver.go
+++ b/csi/src/nodeserver.go
@ -1,641 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package vitastor
-
-import (
-    "context"
-    "encoding/json"
-    "fmt"
-    "os"
-    "os/exec"
-    "path/filepath"
-    "strings"
-    "sync"
-    "syscall"
-    "time"
-
-    "google.golang.org/grpc/codes"
-    "google.golang.org/grpc/status"
-    "k8s.io/utils/mount"
-    utilexec "k8s.io/utils/exec"
-
-    "github.com/container-storage-interface/spec/lib/go/csi"
-    "github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
-    "k8s.io/klog"
-)
-
-// NodeServer struct of Vitastor CSI driver with supported methods of CSI node server spec.
-type NodeServer struct
-{
-    *Driver
-    useVduse bool
-    stateDir string
-    mounter mount.Interface
-    restartInterval time.Duration
-    mu sync.Mutex
-    cond *sync.Cond
-    volumeLocks map[string]bool
-}
-
-type DeviceState struct
-{
-    ConfigPath string `json:"configPath"`
-    VdpaId     string `json:"vdpaId"`
-    Image      string `json:"image"`
-    Blockdev   string `json:"blockdev"`
-    Readonly   bool   `json:"readonly"`
-    PidFile    string `json:"pidFile"`
-}
-
-// NewNodeServer create new instance node
-func NewNodeServer(driver *Driver) *NodeServer
-{
-    stateDir := os.Getenv("STATE_DIR")
-    if (stateDir == "")
-    {
-        stateDir = "/run/vitastor-csi"
-    }
-    if (stateDir[len(stateDir)-1] != '/')
-    {
-        stateDir += "/"
-    }
-    ns := &NodeServer{
-        Driver: driver,
-        useVduse: checkVduseSupport(),
-        stateDir: stateDir,
-        mounter: mount.New(""),
-        volumeLocks: make(map[string]bool),
-    }
-    ns.cond = sync.NewCond(&ns.mu)
-    if (ns.useVduse)
-    {
-        ns.restoreVduseDaemons()
-        dur, err := time.ParseDuration(os.Getenv("RESTART_INTERVAL"))
-        if (err != nil)
-        {
-            dur = 10 * time.Second
-        }
-        ns.restartInterval = dur
-        if (ns.restartInterval != time.Duration(0))
-        {
-            go ns.restarter()
-        }
-    }
-    return ns
-}
-
-func (ns *NodeServer) lockVolume(lockId string)
-{
-    ns.mu.Lock()
-    defer ns.mu.Unlock()
-    for (ns.volumeLocks[lockId])
-    {
-        ns.cond.Wait()
-    }
-    ns.volumeLocks[lockId] = true
-    ns.cond.Broadcast()
-}
-
-func (ns *NodeServer) unlockVolume(lockId string)
-{
-    ns.mu.Lock()
-    defer ns.mu.Unlock()
-    delete(ns.volumeLocks, lockId)
-    ns.cond.Broadcast()
-}
-
-func (ns *NodeServer) restarter()
-{
-    // Restart dead VDUSE daemons at regular intervals
-    // Otherwise volume I/O may hang in case of a qemu-storage-daemon crash
-    // Moreover, it may lead to a kernel panic of the kernel is configured to
-    // panic on hung tasks
-    ticker := time.NewTicker(ns.restartInterval)
-    defer ticker.Stop()
-    for
-    {
-        <-ticker.C
-        ns.restoreVduseDaemons()
-    }
-}
-
-func (ns *NodeServer) restoreVduseDaemons()
-{
-    pattern := ns.stateDir+"vitastor-vduse-*.json"
-    matches, err := filepath.Glob(pattern)
-    if (err != nil)
-    {
-        klog.Errorf("failed to list %s: %v", pattern, err)
-    }
-    if (len(matches) == 0)
-    {
-        return
-    }
-    devList := make(map[string]interface{})
-    // example output: {"dev":{"test1":{"type":"block","mgmtdev":"vduse","vendor_id":0,"max_vqs":16,"max_vq_size":128}}}
-    devListJSON, _, err := system("/sbin/vdpa", "-j", "dev", "list")
-    if (err != nil)
-    {
-        return
-    }
-    err = json.Unmarshal(devListJSON, &devList)
-    devs, ok := devList["dev"].(map[string]interface{})
-    if (err != nil || !ok)
-    {
-        klog.Errorf("/sbin/vdpa -j dev list returned bad JSON (error %v): %v", err, string(devListJSON))
-        return
-    }
-    for _, stateFile := range matches
-    {
-        vdpaId := filepath.Base(stateFile)
-        vdpaId = vdpaId[0:len(vdpaId)-5]
-        // Check if VDPA device is still added to the bus
-        if (devs[vdpaId] == nil)
-        {
-            // Unused, clean it up
-            unmapVduseById(ns.stateDir, vdpaId)
-            continue
-        }
-
-        stateJSON, err := os.ReadFile(stateFile)
-        if (err != nil)
-        {
-            klog.Warningf("error reading state file %v: %v", stateFile, err)
-            continue
-        }
-        var state DeviceState
-        err = json.Unmarshal(stateJSON, &state)
-        if (err != nil)
-        {
-            klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
-            continue
-        }
-
-        ns.lockVolume(state.ConfigPath+":"+state.Image)
-
-        // Recheck state file after locking
-        _, err = os.ReadFile(stateFile)
-        if (err != nil)
-        {
-            klog.Warningf("state file %v disappeared, skipping volume", stateFile)
-            ns.unlockVolume(state.ConfigPath+":"+state.Image)
-            continue
-        }
-
-        // Check if the storage daemon is still active
-        pidFile := ns.stateDir + vdpaId + ".pid"
-        exists := false
-        proc, err := findByPidFile(pidFile)
-        if (err == nil)
-        {
-            exists = proc.Signal(syscall.Signal(0)) == nil
-        }
-        if (!exists)
-        {
-            // Restart daemon
-            klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
-            _ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
-        }
-
-        ns.unlockVolume(state.ConfigPath+":"+state.Image)
-    }
-}
-
-// NodeStageVolume mounts the volume to a staging path on the node.
-func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error)
-{
-    klog.Infof("received node stage volume request %+v", protosanitizer.StripSecrets(req))
-
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    _, err = GetConnectionParams(ctxVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    volName := ctxVars["name"]
-
-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
-
-    targetPath := req.GetStagingTargetPath()
-    isBlock := req.GetVolumeCapability().GetBlock() != nil
-
-    // Check that it's not already mounted
-    notmnt, err := mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (err == nil)
-    {
-        if (!notmnt)
-        {
-            klog.Errorf("target path %s is already mounted", targetPath)
-            return nil, fmt.Errorf("target path %s is already mounted", targetPath)
-        }
-        var finfo os.FileInfo
-        finfo, err = os.Stat(targetPath)
-        if (err != nil)
-        {
-            klog.Errorf("failed to stat %s: %v", targetPath, err)
-            return nil, err
-        }
-        if (finfo.IsDir() != (!isBlock))
-        {
-            err = os.Remove(targetPath)
-            if (err != nil)
-            {
-                klog.Errorf("failed to remove %s (to recreate it with correct type): %v", targetPath, err)
-                return nil, err
-            }
-            err = os.ErrNotExist
-        }
-    }
-    if (err != nil)
-    {
-        if (os.IsNotExist(err))
-        {
-            if (isBlock)
-            {
-                pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-                err = pathFile.Close()
-                if (err != nil)
-                {
-                    klog.Errorf("failed to close %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-            }
-            else
-            {
-                err := os.MkdirAll(targetPath, 0777)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-            }
-        }
-        else
-        {
-            return nil, err
-        }
-    }
-
-    var devicePath, vdpaId string
-    if (!ns.useVduse)
-    {
-        devicePath, err = mapNbd(volName, ctxVars, false)
-    }
-    else
-    {
-        devicePath, vdpaId, err = mapVduse(ns.stateDir, volName, ctxVars, false)
-    }
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
-    if (isBlock)
-    {
-        klog.Infof("bind-mounting %s to %s", devicePath, targetPath)
-        err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
-    }
-    else
-    {
-        // Check existing format
-        existingFormat, err := diskMounter.GetDiskFormat(devicePath)
-        if (err != nil)
-        {
-            klog.Errorf("failed to get disk format for path %s, error: %v", err)
-            goto unmap
-        }
-
-        // Format the device (ext4 or xfs)
-        fsType := req.GetVolumeCapability().GetMount().GetFsType()
-        opt := req.GetVolumeCapability().GetMount().GetMountFlags()
-        opt = append(opt, "_netdev")
-        if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
-            req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
-            !Contains(opt, "ro"))
-        {
-            opt = append(opt, "ro")
-        }
-        if (fsType == "xfs")
-        {
-            opt = append(opt, "nouuid")
-        }
-        readOnly := Contains(opt, "ro")
-        if (existingFormat == "" && !readOnly)
-        {
-            switch fsType
-            {
-                case "ext4":
-                    args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
-                    _, err = systemCombined("mkfs.ext4", args...)
-                case "xfs":
-                    _, err = systemCombined("mkfs.xfs", "-K", devicePath)
-            }
-            if (err != nil)
-            {
-                goto unmap
-            }
-        }
-
-        klog.Infof("formatting and mounting %s to %s with FS %s, options: %v", devicePath, targetPath, fsType, opt)
-        err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
-        if (err == nil)
-        {
-            klog.Infof("successfully mounted %s to %s", devicePath, targetPath)
-        }
-
-        // Try to run online resize on mount.
-        // FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
-        if (err == nil && existingFormat != "" && !readOnly)
-        {
-            switch (fsType)
-            {
-                case "ext4":
-                    _, err = systemCombined("resize2fs", devicePath)
-                case "xfs":
-                    _, err = systemCombined("xfs_growfs", devicePath)
-            }
-            if (err != nil)
-            {
-                goto unmap
-            }
-        }
-    }
-    if (err != nil)
-    {
-        klog.Errorf(
-            "failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
-            devicePath, targetPath, volName, err,
-        )
-        goto unmap
-    }
-    return &csi.NodeStageVolumeResponse{}, nil
-
-unmap:
-    if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
-    {
-        unmapNbd(devicePath)
-    }
-    else
-    {
-        unmapVduseById(ns.stateDir, vdpaId)
-    }
-    return nil, err
-}
-
-// NodeUnstageVolume unstages the volume from the staging path
-func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error)
-{
-    klog.Infof("received node unstage volume request %+v", protosanitizer.StripSecrets(req))
-
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := ctxVars["name"]
-
-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
-
-    targetPath := req.GetStagingTargetPath()
-    devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
-    if (err != nil)
-    {
-        if (os.IsNotExist(err))
-        {
-            return nil, status.Error(codes.NotFound, "Target path not found")
-        }
-        return nil, err
-    }
-    if (devicePath == "")
-    {
-        // volume not mounted
-        klog.Warningf("%s is not a mountpoint, deleting", targetPath)
-        os.Remove(targetPath)
-        return &csi.NodeUnstageVolumeResponse{}, nil
-    }
-
-    refList, err := ns.mounter.GetMountRefs(targetPath)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    if (len(refList) > 0)
-    {
-        klog.Warningf("%s is still referenced: %v", targetPath, refList)
-    }
-
-    // unmount
-    err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    // unmap device
-    if (len(refList) == 0)
-    {
-        if (!ns.useVduse)
-        {
-            unmapNbd(devicePath)
-        }
-        else
-        {
-            unmapVduse(ns.stateDir, devicePath)
-        }
-    }
-
-    return &csi.NodeUnstageVolumeResponse{}, nil
-}
-
-// NodePublishVolume mounts the volume mounted to the staging path to the target path
-func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
-{
-    klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
-
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    _, err = GetConnectionParams(ctxVars)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    volName := ctxVars["name"]
-
-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
-
-    stagingTargetPath := req.GetStagingTargetPath()
-    targetPath := req.GetTargetPath()
-    isBlock := req.GetVolumeCapability().GetBlock() != nil
-
-    // Check that stagingTargetPath is mounted
-    notmnt, err := mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
-    if (err != nil)
-    {
-        klog.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err)
-        return nil, fmt.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err)
-    }
-    else if (notmnt)
-    {
-        klog.Errorf("staging path %v is not mounted", stagingTargetPath)
-        return nil, fmt.Errorf("staging path %v is not mounted", stagingTargetPath)
-    }
-
-    // Check that targetPath is not already mounted
-    notmnt, err = mount.IsNotMountPoint(ns.mounter, targetPath)
-    if (err != nil)
-    {
-        if (os.IsNotExist(err))
-        {
-            if (isBlock)
-            {
-                pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-                err = pathFile.Close()
-                if (err != nil)
-                {
-                    klog.Errorf("failed to close %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-            }
-            else
-            {
-                err := os.MkdirAll(targetPath, 0777)
-                if (err != nil)
-                {
-                    klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
-                    return nil, err
-                }
-            }
-        }
-        else
-        {
-            return nil, err
-        }
-    }
-    else if (!notmnt)
-    {
-        klog.Errorf("target path %s is already mounted", targetPath)
-        return nil, fmt.Errorf("target path %s is already mounted", targetPath)
-    }
-
-    execArgs := []string{"--bind", stagingTargetPath, targetPath}
-    if (req.GetReadonly())
-    {
-        execArgs = append(execArgs, "-o", "ro")
-    }
-    cmd := exec.Command("mount", execArgs...)
-    cmd.Stderr = os.Stderr
-    klog.Infof("binding volume %v (%v) from %v to %v", volName, ctxVars["configPath"], stagingTargetPath, targetPath)
-    out, err := cmd.Output()
-    if (err != nil)
-    {
-        return nil, fmt.Errorf("Error running mount %v: %s", strings.Join(execArgs, " "), out)
-    }
-
-    return &csi.NodePublishVolumeResponse{}, nil
-}
-
-// NodeUnpublishVolume unmounts the volume from the target path
-func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error)
-{
-    klog.Infof("received node unpublish volume request %+v", protosanitizer.StripSecrets(req))
-
-    ctxVars := make(map[string]string)
-    err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
-    if (err != nil)
-    {
-        return nil, status.Error(codes.Internal, "volume ID not in JSON format")
-    }
-    volName := ctxVars["name"]
-
-    ns.lockVolume(ctxVars["configPath"]+":"+volName)
-    defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
-
-    targetPath := req.GetTargetPath()
-    devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
-    if (err != nil)
-    {
-        if (os.IsNotExist(err))
-        {
-            return nil, status.Error(codes.NotFound, "Target path not found")
-        }
-        return nil, err
-    }
-    if (devicePath == "")
-    {
-        // volume not mounted
-        klog.Warningf("%s is not a mountpoint, deleting", targetPath)
-        os.Remove(targetPath)
-        return &csi.NodeUnpublishVolumeResponse{}, nil
-    }
-
-    // unmount
-    err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
-    if (err != nil)
-    {
-        return nil, err
-    }
-
-    return &csi.NodeUnpublishVolumeResponse{}, nil
-}
-
-// NodeGetVolumeStats returns volume capacity statistics available for the volume
-func (ns *NodeServer) NodeGetVolumeStats(ctx context.Context, req *csi.NodeGetVolumeStatsRequest) (*csi.NodeGetVolumeStatsResponse, error)
-{
-    return nil, status.Error(codes.Unimplemented, "")
-}
-
-// NodeExpandVolume expanding the file system on the node
-func (ns *NodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpandVolumeRequest) (*csi.NodeExpandVolumeResponse, error)
-{
-    return nil, status.Error(codes.Unimplemented, "")
-}
-
-// NodeGetCapabilities returns the supported capabilities of the node server
-func (ns *NodeServer) NodeGetCapabilities(ctx context.Context, req *csi.NodeGetCapabilitiesRequest) (*csi.NodeGetCapabilitiesResponse, error)
-{
-    return &csi.NodeGetCapabilitiesResponse{
-        Capabilities: []*csi.NodeServiceCapability{
-            &csi.NodeServiceCapability{
-                Type: &csi.NodeServiceCapability_Rpc{
-                    Rpc: &csi.NodeServiceCapability_RPC{
-                        Type: csi.NodeServiceCapability_RPC_STAGE_UNSTAGE_VOLUME,
-                    },
-                },
-            },
-        },
-    }, nil
-}
-
-// NodeGetInfo returns NodeGetInfoResponse for CO.
-func (ns *NodeServer) NodeGetInfo(ctx context.Context, req *csi.NodeGetInfoRequest) (*csi.NodeGetInfoResponse, error)
-{
-    klog.Infof("received node get info request %+v", protosanitizer.StripSecrets(req))
-    return &csi.NodeGetInfoResponse{
-        NodeId: ns.NodeID,
-    }, nil
-}
--- a/csi/src/server.go
+++ b/csi/src/server.go
@ -1,36 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package vitastor
-
-import (
-    "k8s.io/klog"
-)
-
-type Driver struct
-{
-    *Config
-}
-
-// NewDriver create new instance driver
-func NewDriver(config *Config) (*Driver, error)
-{
-    if (config == nil)
-    {
-        klog.Errorf("Vitastor CSI driver initialization failed")
-        return nil, nil
-    }
-    driver := &Driver{
-        Config: config,
-    }
-    klog.Infof("Vitastor CSI driver initialized")
-    return driver, nil
-}
-
-// Start server
-func (driver *Driver) Run()
-{
-    server := NewNonBlockingGRPCServer()
-    server.Start(driver.Endpoint, NewIdentityServer(driver), NewControllerServer(driver), NewNodeServer(driver))
-    server.Wait()
-}
--- a/csi/src/utils.go
+++ b/csi/src/utils.go
@ -1,342 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package vitastor
-
-import (
-    "bytes"
-    "errors"
-    "encoding/json"
-    "fmt"
-    "os"
-    "os/exec"
-    "path/filepath"
-    "strconv"
-    "strings"
-    "syscall"
-
-    "k8s.io/klog"
-    "google.golang.org/grpc/codes"
-    "google.golang.org/grpc/status"
-)
-
-func Contains(list []string, s string) bool
-{
-    for i := 0; i < len(list); i++
-    {
-        if (list[i] == s)
-        {
-            return true
-        }
-    }
-    return false
-}
-
-func checkVduseSupport() bool
-{
-    // Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
-    vduse := true
-    for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
-    {
-        _, err := os.Stat("/sys/module/"+mod)
-        if (err != nil)
-        {
-            if (!errors.Is(err, os.ErrNotExist))
-            {
-                klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
-            }
-            c := exec.Command("/sbin/modprobe", mod)
-            c.Stdout = os.Stderr
-            c.Stderr = os.Stderr
-            err := c.Run()
-            if (err != nil)
-            {
-                klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
-                vduse = false
-                break
-            }
-        }
-    }
-    // Check that vdpa tool functions
-    if (vduse)
-    {
-        c := exec.Command("/sbin/vdpa", "-j", "dev")
-        c.Stderr = os.Stderr
-        err := c.Run()
-        if (err != nil)
-        {
-            klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
-            vduse = false
-        }
-    }
-    if (!vduse)
-    {
-        klog.Errorf(
-            "Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
-            " For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
-        )
-    }
-    else
-    {
-        klog.Infof("VDUSE support enabled successfully")
-    }
-    return vduse
-}
-
-func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
-{
-    // Map NBD device
-    // FIXME: Check if already mapped
-    args := []string{
-        "map", "--image", volName,
-    }
-    if (ctxVars["configPath"] != "")
-    {
-        args = append(args, "--config_path", ctxVars["configPath"])
-    }
-    if (readonly)
-    {
-        args = append(args, "--readonly", "1")
-    }
-    stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
-    dev := strings.TrimSpace(string(stdout))
-    if (dev == "")
-    {
-        return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
-    }
-    klog.Infof("Attached volume %s via NBD as %s", volName, dev)
-    return dev, err
-}
-
-func unmapNbd(devicePath string)
-{
-    // unmap NBD device
-    unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
-    if (unmapErr != nil)
-    {
-        klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
-    }
-}
-
-func findByPidFile(pidFile string) (*os.Process, error)
-{
-    pidBuf, err := os.ReadFile(pidFile)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
-    if (err != nil)
-    {
-        return nil, err
-    }
-    proc, err := os.FindProcess(int(pid))
-    if (err != nil)
-    {
-        return nil, err
-    }
-    return proc, nil
-}
-
-func killByPidFile(pidFile string) error
-{
-    klog.Infof("killing process with PID from file %s", pidFile)
-    proc, err := findByPidFile(pidFile)
-    if (err != nil)
-    {
-        return err
-    }
-    return proc.Signal(syscall.SIGTERM)
-}
-
-func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
-{
-    // Start qemu-storage-daemon
-    blockSpec := map[string]interface{}{
-        "node-name": "disk1",
-        "driver": "vitastor",
-        "image": volName,
-        "cache": map[string]bool{
-            "direct": true,
-            "no-flush": false,
-        },
-        "discard": "unmap",
-    }
-    if (configPath != "")
-    {
-        blockSpec["config-path"] = configPath
-    }
-    blockSpecJson, _ := json.Marshal(blockSpec)
-    writable := "true"
-    if (readonly)
-    {
-        writable = "false"
-    }
-    _, _, err := system(
-        "/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
-        "--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
-    )
-    return err
-}
-
-func mapVduse(stateDir string, volName string, ctxVars map[string]string, readonly bool) (string, string, error)
-{
-    // Generate state file
-    stateFd, err := os.CreateTemp(stateDir, "vitastor-vduse-*.json")
-    if (err != nil)
-    {
-        return "", "", err
-    }
-    stateFile := stateFd.Name()
-    stateFd.Close()
-    vdpaId := filepath.Base(stateFile)
-    vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
-    pidFile := stateDir + vdpaId + ".pid"
-    // Map VDUSE device via qemu-storage-daemon
-    err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
-    if (err == nil)
-    {
-        // Add device to VDPA bus
-        _, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
-        if (err == nil)
-        {
-            // Find block device name
-            var matches []string
-            matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
-            if (err == nil && len(matches) == 0)
-            {
-                err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
-            }
-            if (err == nil)
-            {
-                blockdev := "/dev/"+filepath.Base(matches[0])
-                _, err = os.Stat(blockdev)
-                if (err == nil)
-                {
-                    // Generate state file
-                    stateJSON, _ := json.Marshal(&DeviceState{
-                        ConfigPath: ctxVars["configPath"],
-                        VdpaId:     vdpaId,
-                        Image:      volName,
-                        Blockdev:   blockdev,
-                        Readonly:   readonly,
-                        PidFile:    pidFile,
-                    })
-                    err = os.WriteFile(stateFile, stateJSON, 0600)
-                    if (err == nil)
-                    {
-                        klog.Infof("Attached volume %s via VDUSE as %s (VDPA ID %s)", volName, blockdev, vdpaId)
-                        return blockdev, vdpaId, nil
-                    }
-                }
-            }
-        }
-        killErr := killByPidFile(pidFile)
-        if (killErr != nil)
-        {
-            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
-        }
-        os.Remove(stateFile)
-        os.Remove(pidFile)
-    }
-    return "", "", err
-}
-
-func unmapVduse(stateDir, devicePath string)
-{
-    if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
-    {
-        klog.Errorf("%s does not start with /dev/v", devicePath)
-        return
-    }
-    vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
-    if (err != nil)
-    {
-        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
-        return
-    }
-    vdpaId := ""
-    p := strings.Index(vduseDev, "/vduse/")
-    if (p >= 0)
-    {
-        vduseDev = vduseDev[p+7:]
-        p = strings.Index(vduseDev, "/")
-        if (p >= 0)
-        {
-            vdpaId = vduseDev[0:p]
-        }
-    }
-    if (vdpaId == "")
-    {
-        klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
-        return
-    }
-    unmapVduseById(stateDir, vdpaId)
-}
-
-func unmapVduseById(stateDir, vdpaId string)
-{
-    _, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
-    if (err != nil)
-    {
-        klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
-    }
-    else
-    {
-        _, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
-    }
-    stateFile := stateDir + vdpaId + ".json"
-    os.Remove(stateFile)
-    pidFile := stateDir + vdpaId + ".pid"
-    _, err = os.Stat(pidFile)
-    if (os.IsNotExist(err))
-    {
-        // ok, already killed
-    }
-    else if (err != nil)
-    {
-        klog.Errorf("Failed to stat %v: %v", pidFile, err)
-        return
-    }
-    else
-    {
-        err = killByPidFile(pidFile)
-        if (err != nil)
-        {
-            klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
-        }
-        os.Remove(pidFile)
-    }
-}
-
-func system(program string, args ...string) ([]byte, []byte, error)
-{
-    klog.Infof("Running "+program+" "+strings.Join(args, " "))
-    c := exec.Command(program, args...)
-    var stdout, stderr bytes.Buffer
-    c.Stdout, c.Stderr = &stdout, &stderr
-    err := c.Run()
-    if (err != nil)
-    {
-        stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
-        klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s\nOutput:\n%s", err, stdoutStr+stderrStr)
-        return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
-    }
-    return stdout.Bytes(), stderr.Bytes(), nil
-}
-
-func systemCombined(program string, args ...string) ([]byte, error)
-{
-    klog.Infof("Running "+program+" "+strings.Join(args, " "))
-    c := exec.Command(program, args...)
-    var out bytes.Buffer
-    c.Stdout, c.Stderr = &out, &out
-    err := c.Run()
-    if (err != nil)
-    {
-        outStr := string(out.Bytes())
-        klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", outStr, err)
-        return nil, status.Error(codes.Internal, outStr+" (status "+err.Error()+")")
-    }
-    return out.Bytes(), nil
-}
--- a/csi/vitastor-csi.go
+++ b/csi/vitastor-csi.go
@ -1,39 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
-
-package main
-
-import (
-    "flag"
-    "fmt"
-    "os"
-    "k8s.io/klog"
-    "vitastor.io/csi/src"
-)
-
-func main()
-{
-    var config = vitastor.NewConfig()
-    flag.StringVar(&config.Endpoint, "endpoint", "", "CSI endpoint")
-    flag.StringVar(&config.NodeID, "node", "", "Node ID")
-    flag.Parse()
-    if (config.Endpoint == "")
-    {
-        config.Endpoint = os.Getenv("CSI_ENDPOINT")
-    }
-    if (config.NodeID == "")
-    {
-        config.NodeID = os.Getenv("NODE_ID")
-    }
-    if (config.Endpoint == "" && config.NodeID == "")
-    {
-        fmt.Fprintf(os.Stderr, "Please set -endpoint and -node / CSI_ENDPOINT & NODE_ID env vars\n")
-        os.Exit(1)
-    }
-    drv, err := vitastor.NewDriver(config)
-    if (err != nil)
-    {
-        klog.Fatalln(err)
-    }
-    drv.Run()
-}
--- a/debian/build-pve-qemu.sh
+++ b/debian/build-pve-qemu.sh
@ -1,58 +0,0 @@
-exit
-
-git clone https://git.yourcmc.ru/vitalif/pve-qemu .
-
-# bookworm
-
-docker run -it -v `pwd`/pve-qemu:/root/pve-qemu --name pve-qemu-bullseye debian:bullseye bash
-
-perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources
-echo 'deb [arch=amd64] http://download.proxmox.com/debian/pve bookworm pve-no-subscription' >> /etc/apt/sources.list
-echo 'deb https://vitastor.io/debian bookworm main' >> /etc/apt/sources.list
-echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf
-echo 'ru_RU UTF-8' >> /etc/locale.gen
-echo 'en_US UTF-8' >> /etc/locale.gen
-apt-get update
-apt-get install wget ca-certificates
-wget https://enterprise.proxmox.com/debian/proxmox-release-bookworm.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bookworm.gpg
-wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg
-apt-get update
-apt-get install git devscripts equivs wget mc libjemalloc-dev vitastor-client-dev lintian locales
-mk-build-deps --install ./control
-
-# bullseye
-
-docker run -it -v `pwd`/pve-qemu:/root/pve-qemu --name pve-qemu-bullseye debian:bullseye bash
-
-grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb /deb-src /' >> /etc/apt/sources.list
-echo 'deb [arch=amd64] http://download.proxmox.com/debian/pve bullseye pve-no-subscription' >> /etc/apt/sources.list
-echo 'deb https://vitastor.io/debian bullseye main' >> /etc/apt/sources.list
-echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf
-echo 'ru_RU UTF-8' >> /etc/locale.gen
-echo 'en_US UTF-8' >> /etc/locale.gen
-apt-get update
-apt-get install wget
-wget https://enterprise.proxmox.com/debian/proxmox-release-bullseye.gpg -O /etc/apt/trusted.gpg.d/proxmox-release-bullseye.gpg
-wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg
-apt-get update
-apt-get install git devscripts equivs wget mc libjemalloc-dev vitastor-client-dev lintian locales
-mk-build-deps --install ./control
-
-# buster
-
-docker run -it -v `pwd`/pve-qemu:/root/pve-qemu --name pve-qemu-buster debian:buster bash
-
-grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb /deb-src /' >> /etc/apt/sources.list
-echo 'deb [arch=amd64] http://download.proxmox.com/debian/pve buster pve-no-subscription' >> /etc/apt/sources.list
-echo 'deb https://vitastor.io/debian buster main' >> /etc/apt/sources.list
-echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list
-echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf
-echo 'ru_RU UTF-8' >> /etc/locale.gen
-echo 'en_US UTF-8' >> /etc/locale.gen
-apt-get update
-apt-get install wget ca-certificates
-wget http://download.proxmox.com/debian/proxmox-ve-release-6.x.gpg -O /etc/apt/trusted.gpg.d/proxmox-ve-release-6.x.gpg
-wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg
-apt-get update
-apt-get install git devscripts equivs wget mc libjemalloc-dev vitastor-client-dev lintian locales
-mk-build-deps --install ./control
--- a/debian/build-vitastor-bookworm.sh
+++ b/debian/build-vitastor-bookworm.sh
@ -1,7 +0,0 @@
-#!/bin/bash
-
-cat < vitastor.Dockerfile > ../Dockerfile
-cd ..
-mkdir -p packages
-sudo podman build --build-arg DISTRO=debian --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
-rm Dockerfile
--- a/debian/build-vitastor-bullseye.sh
+++ b/debian/build-vitastor-bullseye.sh
@ -1,7 +0,0 @@
-#!/bin/bash
-
-cat < vitastor.Dockerfile > ../Dockerfile
-cd ..
-mkdir -p packages
-sudo podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
-rm Dockerfile
--- a/debian/build-vitastor-buster.sh
+++ b/debian/build-vitastor-buster.sh
@ -1,7 +0,0 @@
-#!/bin/bash
-
-cat < vitastor.Dockerfile > ../Dockerfile
-cd ..
-mkdir -p packages
-sudo podman build --build-arg DISTRO=debian --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
-rm Dockerfile
--- a/debian/build-vitastor-ubuntu-jammy.sh
+++ b/debian/build-vitastor-ubuntu-jammy.sh
@ -1,7 +0,0 @@
-#!/bin/bash
-
-cat < vitastor.Dockerfile > ../Dockerfile
-cd ..
-mkdir -p packages
-sudo podman build --build-arg DISTRO=ubuntu --build-arg REL=jammy -v `pwd`/packages:/root/packages -f Dockerfile .
-rm Dockerfile
--- a/debian/changelog
+++ b/debian/changelog
@ -1,41 +0,0 @@
-vitastor (1.9.3-1) unstable; urgency=medium
-
-  * Bugfixes
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Fri, 03 Jun 2022 02:09:44 +0300
-
-vitastor (0.7.0-1) unstable; urgency=medium
-
-  * Implement NFS proxy
-  * Add documentation
-  * Bugfixes
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sun, 29 May 2022 23:39:13 +0300
-
-vitastor (0.6.3-1) unstable; urgency=medium
-
-  * RDMA support
-  * Bugfixes
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sat, 01 May 2021 18:46:10 +0300
-
-vitastor (0.6.0-1) unstable; urgency=medium
-
-  * Snapshots and Copy-on-Write clones
-  * Image metadata in etcd (name, size)
-  * Image I/O and space statistics in etcd
-  * Write throttling for smoothing random write workloads in SSD+HDD configurations
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sun, 11 Apr 2021 00:49:18 +0300
-
-vitastor (0.5.1-1) unstable; urgency=medium
-
-  * Add jerasure support
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Sat, 05 Dec 2020 17:02:26 +0300
-
-vitastor (0.5-1) unstable; urgency=medium
-
-  * First packaging for Debian
-
- -- Vitaliy Filippov <vitalif@yourcmc.ru>  Thu, 05 Nov 2020 02:20:59 +0300
--- a/debian/compat
+++ b/debian/compat
@ -1 +0,0 @@
-13
--- a/debian/control
+++ b/debian/control
@ -1,61 +0,0 @@
-Source: vitastor
-Section: admin
-Priority: optional
-Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
-Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev
-Standards-Version: 4.5.0
-Homepage: https://vitastor.io/
-Rules-Requires-Root: no
-
-Package: vitastor
-Architecture: amd64
-Depends: vitastor-osd, vitastor-mon, vitastor-client, vitastor-client-dev, vitastor-fio
-Description: Vitastor, a fast software-defined clustered block storage
- Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
- architecturally similar to Ceph which means strong consistency, primary-replication,
- symmetric clustering and automatic data distribution over any number of drives of any
- size with configurable redundancy (replication or erasure codes/XOR).
-
-Package: vitastor-osd
-Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client (= ${binary:Version}), fdisk, util-linux, parted
-Description: Vitastor, a fast software-defined clustered block storage - object storage daemon
- Vitastor object storage daemon, i.e. server program that stores data.
-
-Package: vitastor-mon
-Architecture: amd64
-Depends: ${misc:Depends}, nodejs (>= 10), node-sprintf-js, node-ws (>= 7), lp-solve
-Description: Vitastor, a fast software-defined clustered block storage - monitor
- Vitastor monitor, i.e. server program responsible for watching cluster state and
- scheduling cluster-level operations.
-
-Package: vitastor-client
-Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: Vitastor, a fast software-defined clustered block storage - client
- Vitastor client library and command-line interface.
-
-Package: vitastor-client-dev
-Section: devel
-Architecture: amd64
-Depends: ${misc:Depends}, vitastor-client (= ${binary:Version})
-Description: Vitastor, a fast software-defined clustered block storage - development files
- Vitastor library headers for development.
-
-Package: vitastor-fio
-Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client (= ${binary:Version}), fio (= ${dep:fio})
-Description: Vitastor, a fast software-defined clustered block storage - fio drivers
- Vitastor fio drivers for benchmarking.
-
-Package: pve-storage-vitastor
-Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client (= ${binary:Version})
-Description: Vitastor Proxmox Virtual Environment storage plugin
- Vitastor storage plugin for Proxmox Virtual Environment.
-
-Package: vitastor-opennebula
-Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client, patch, python3, jq
-Description: Vitastor OpenNebula storage plugin
- Vitastor storage plugin for OpenNebula.
--- a/debian/copyright
+++ b/debian/copyright
@ -1,21 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: vitastor
-Upstream-Contact: Vitaliy Filippov <vitalif@yourcmc.ru>
-Source: https://vitastor.io
-
-Files: *
-Copyright: 2019+ Vitaliy Filippov <vitalif@yourcmc.ru>
-License: Multiple licenses VNPL-1.1 and/or GPL-2.0+
- All server-side code (OSD, Monitor and so on) is licensed under the terms of
- Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
- GNU GPLv3.0 with the additional "Network Interaction" clause which requires
- opensourcing all programs directly or indirectly interacting with Vitastor
- through a computer network and expressly designed to be used in conjunction
- with it ("Proxy Programs"). Proxy Programs may be made public not only under
- the terms of the same license, but also under the terms of any GPL-Compatible
- Free Software License, as listed by the Free Software Foundation.
- This is a stricter copyleft license than the Affero GPL.
- .
- Client libraries (cluster_client and so on) are dual-licensed under the same
- VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
- software like QEMU and fio.
--- a/debian/fio_version
+++ b/debian/fio_version
@ -1 +0,0 @@
-dep:fio=3.16-1
--- a/debian/install
+++ b/debian/install
@ -1,4 +0,0 @@
-VNPL-1.1.txt usr/share/doc/vitastor
-GPL-2.0.txt usr/share/doc/vitastor
-README.md usr/share/doc/vitastor
-README-ru.md usr/share/doc/vitastor
--- a/debian/libisal.pc
+++ b/debian/libisal.pc
@ -1,11 +0,0 @@
-prefix=/usr
-exec_prefix=${prefix}
-libdir=${prefix}/lib/x86_64-linux-gnu
-includedir=${prefix}/include
-
-Name: libisal
-Description: Library for storage systems
-Version: 2.30.0
-Libs: -L${libdir} -lisal
-Libs.private:
-Cflags: -I${includedir}
--- a/debian/libvirt.Dockerfile
+++ b/debian/libvirt.Dockerfile
@ -1,41 +0,0 @@
-# Build patched libvirt for Debian Buster or Bullseye/Sid inside a container
-# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .
-
-ARG DISTRO=
-ARG REL=
-FROM $DISTRO:$REL
-ARG REL=
-
-WORKDIR /root
-
-RUN if ([ "${DISTRO}" = "debian" ]) && ( [ "${REL}" = "buster" -o "${REL}" = "bullseye" ] ); then \
-        echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
-        echo >> /etc/apt/preferences; \
-        echo 'Package: *' >> /etc/apt/preferences; \
-        echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
-        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
-    fi; \
-    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
-    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
-    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
-
-RUN apt-get update; apt-get -y install devscripts
-RUN apt-get -y build-dep libvirt0
-RUN apt-get -y install libglusterfs-dev
-RUN apt-get --download-only source libvirt
-
-ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff patches/libvirt-8.0-vitastor.diff /root
-RUN set -e; \
-    mkdir -p /root/packages/libvirt-$REL; \
-    rm -rf /root/packages/libvirt-$REL/*; \
-    cd /root/packages/libvirt-$REL; \
-    dpkg-source -x /root/libvirt*.dsc; \
-    D=$(ls -d libvirt-*/); \
-    V=$(ls -d libvirt-*/ | perl -pe 's/libvirt-(\d+\.\d+).*/$1/'); \
-    cp /root/libvirt-$V-vitastor.diff $D/debian/patches; \
-    echo libvirt-$V-vitastor.diff >> $D/debian/patches/series; \
-    cd $D; \
-    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?(\+deb[u\d]+)?\).*$/$1/')+vitastor2; \
-    DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Add Vitastor support'; \
-    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
-    rm -rf /root/packages/libvirt-$REL/$D
--- a/debian/patched-qemu.Dockerfile
+++ b/debian/patched-qemu.Dockerfile
@ -1,61 +0,0 @@
-# Build patched QEMU for Debian inside a container
-# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/patched-qemu.Dockerfile .
-
-ARG REL=
-FROM debian:$REL
-ARG REL=
-
-WORKDIR /root
-
-RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then \
-        echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
-        echo >> /etc/apt/preferences; \
-        echo 'Package: *' >> /etc/apt/preferences; \
-        echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
-        echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
-    fi; \
-    grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
-    perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
-    echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
-    echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
-
-RUN apt-get update
-RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
-RUN apt-get -y build-dep qemu
-# To build a custom version
-#RUN cp /root/packages/qemu-orig/* /root
-RUN apt-get --download-only source qemu
-
-ADD patches /root/vitastor/patches
-ADD src/client/qemu_driver.c /root/qemu_driver.c
-
-#RUN set -e; \
-#    apt-get install -y wget; \
-#    wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
-#    (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
-#    (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
-#    apt-get update; \
-#    apt-get install -y vitastor-client vitastor-client-dev quilt
-
-RUN set -e; \
-    dpkg -i /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
-    apt-get update; \
-    apt-get install -y quilt; \
-    mkdir -p /root/packages/qemu-$REL; \
-    rm -rf /root/packages/qemu-$REL/*; \
-    cd /root/packages/qemu-$REL; \
-    dpkg-source -x /root/qemu*.dsc; \
-    QEMU_VER=$(ls -d qemu*/ | perl -pe 's!^.*?(\d+\.\d+).*!$1!'); \
-    D=$(ls -d qemu*/); \
-    cp /root/vitastor/patches/qemu-$QEMU_VER-vitastor.patch ./qemu-*/debian/patches; \
-    echo qemu-$QEMU_VER-vitastor.patch >> $D/debian/patches/series; \
-    cd /root/packages/qemu-$REL/qemu-*/; \
-    quilt push -a; \
-    quilt add block/vitastor.c; \
-    cp /root/qemu_driver.c block/vitastor.c; \
-    quilt refresh; \
-    V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
-    if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
-    DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
-    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
-    rm -rf /root/packages/qemu-$REL/qemu-*/
--- a/debian/pve-storage-vitastor.install
+++ b/debian/pve-storage-vitastor.install
@ -1 +0,0 @@
-patches/VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/
--- a/debian/raw.h
+++ b/debian/raw.h
@ -1,19 +0,0 @@
-/* Removed in Linux 5.14 */
-
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_RAW_H
-#define __LINUX_RAW_H
-
-#include <linux/types.h>
-
-#define RAW_SETBIND	_IO( 0xac, 0 )
-#define RAW_GETBIND	_IO( 0xac, 1 )
-
-struct raw_config_request 
-{
-	int	raw_minor;
-	__u64	block_major;
-	__u64	block_minor;
-};
-
-#endif /* __LINUX_RAW_H */
--- a/debian/rules
+++ b/debian/rules
@ -1,10 +0,0 @@
-#!/usr/bin/make -f
-export DH_VERBOSE = 1
-
-%:
-	dh $@
-
-override_dh_installdeb:
-	cat debian/fio_version >> debian/vitastor-fio.substvars
-	[ -f debian/qemu_version ] && (cat debian/qemu_version >> debian/vitastor-qemu.substvars) || true
-	dh_installdeb
--- a/debian/source/format
+++ b/debian/source/format
@ -1 +0,0 @@
-3.0 (quilt)
--- a/debian/vitastor-client-dev.install
+++ b/debian/vitastor-client-dev.install
@ -1,2 +0,0 @@
-usr/include
-usr/lib/*/pkgconfig
--- a/debian/vitastor-client.install
+++ b/debian/vitastor-client.install
@ -1,8 +0,0 @@
-usr/bin/vita
-usr/bin/vitastor-cli
-usr/bin/vitastor-rm
-usr/bin/vitastor-nbd
-usr/bin/vitastor-nfs
-usr/bin/vitastor-kv
-usr/bin/vitastor-kv-stress
-usr/lib/*/libvitastor*.so*
--- a/debian/vitastor-fio.install
+++ b/debian/vitastor-fio.install
@ -1 +0,0 @@
-usr/lib/*/libfio*.so*
--- a/debian/vitastor-mon.install
+++ b/debian/vitastor-mon.install
@ -1,3 +0,0 @@
-mon usr/lib/vitastor/
-mon/scripts/make-etcd usr/lib/vitastor/mon
-mon/scripts/vitastor-mon.service /lib/systemd/system
--- a/debian/vitastor-mon.postinst
+++ b/debian/vitastor-mon.postinst
@ -1,11 +0,0 @@
-#!/bin/sh
-
-set -e
-
-if [ "$1" = "configure" ]; then
-	addgroup --system --quiet vitastor
-	adduser --system --quiet --ingroup vitastor --no-create-home --home /nonexistent vitastor
-	mkdir -p /etc/vitastor
-	mkdir -p /var/lib/vitastor
-	chown vitastor:vitastor /var/lib/vitastor
-fi
--- a/debian/vitastor-opennebula.install
+++ b/debian/vitastor-opennebula.install
@ -1,3 +0,0 @@
-opennebula/remotes var/lib/one/
-opennebula/sudoers.d etc/
-opennebula/install.sh var/lib/one/remotes/datastore/vitastor/
--- a/debian/vitastor-opennebula.postinst
+++ b/debian/vitastor-opennebula.postinst
@ -1,7 +0,0 @@
-#!/bin/sh
-
-set -e
-
-if [ "$1" = "configure" ]; then
-	/var/lib/one/remotes/datastore/vitastor/install.sh
-fi
--- a/debian/vitastor-opennebula.triggers
+++ b/debian/vitastor-opennebula.triggers
@ -1,4 +0,0 @@
-interest /var/lib/one/remotes/datastore/downloader.sh
-interest /etc/one/oned.conf
-interest /etc/one/vmm_exec/vmm_execrc
-interest /etc/apparmor.d/local/abstractions/libvirt-qemu
--- a/debian/vitastor-osd.install
+++ b/debian/vitastor-osd.install
@ -1,6 +0,0 @@
-usr/bin/vitastor-osd
-usr/bin/vitastor-disk
-usr/bin/vitastor-dump-journal
-mon/scripts/vitastor-osd@.service /lib/systemd/system
-mon/scripts/vitastor.target /lib/systemd/system
-mon/scripts/90-vitastor.rules /lib/udev/rules.d
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`patches/VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/`