forked from vitalif/vitastor
Compare commits
1 Commits
v0.9.6
...
qemu-send-
Author | SHA1 | Date | |
---|---|---|---|
c3ec6dd3b9 |
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.9.6")
|
||||
set(VERSION "0.9.2")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -15,7 +15,7 @@ Vitastor архитектурно похож на Ceph, что означает
|
||||
и автоматическое распределение данных по любому числу дисков любого размера с настраиваемыми схемами
|
||||
избыточности - репликацией или с произвольными кодами коррекции ошибок.
|
||||
|
||||
Vitastor нацелен в первую очередь на SSD и SSD+HDD кластеры с как минимум 10 Гбит/с сетью, поддерживает
|
||||
Vitastor нацелен на SSD и SSD+HDD кластеры с как минимум 10 Гбит/с сетью, поддерживает
|
||||
TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
|
||||
что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.
|
||||
|
||||
|
@@ -14,8 +14,8 @@ Vitastor is architecturally similar to Ceph which means strong consistency,
|
||||
primary-replication, symmetric clustering and automatic data distribution over any
|
||||
number of drives of any size with configurable redundancy (replication or erasure codes/XOR).
|
||||
|
||||
Vitastor targets primarily SSD and SSD+HDD clusters with at least 10 Gbit/s network,
|
||||
supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1 ms
|
||||
Vitastor targets SSD and SSD+HDD clusters with at least 10 Gbit/s network, supports
|
||||
TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1 ms
|
||||
with proper hardware which is ~10 times faster than other popular SDS's like Ceph
|
||||
or internal systems of public clouds.
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.9.6
|
||||
VERSION ?= v0.9.2
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.9.6
|
||||
image: vitalif/vitastor-csi:v0.9.2
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.9.6
|
||||
image: vitalif/vitastor-csi:v0.9.2
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.9.6"
|
||||
vitastorCSIDriverVersion = "0.9.2"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
||||
vitastor (0.9.6-1) unstable; urgency=medium
|
||||
vitastor (0.9.2-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||
|
||||
vitastor (0.9.6-1) unstable; urgency=medium
|
||||
vitastor (0.9.2-1) unstable; urgency=medium
|
||||
|
||||
* Implement NFS proxy
|
||||
* Add documentation
|
||||
|
18
debian/patched-qemu.Dockerfile
vendored
18
debian/patched-qemu.Dockerfile
vendored
@@ -28,19 +28,13 @@ RUN apt-get --download-only source qemu
|
||||
|
||||
ADD patches /root/vitastor/patches
|
||||
ADD src/qemu_driver.c /root/vitastor/src/qemu_driver.c
|
||||
|
||||
#RUN set -e; \
|
||||
# apt-get install -y wget; \
|
||||
# wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
|
||||
# (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
|
||||
# (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
|
||||
# apt-get update; \
|
||||
# apt-get install -y vitastor-client vitastor-client-dev quilt
|
||||
|
||||
RUN set -e; \
|
||||
dpkg -i /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
|
||||
apt-get install -y wget; \
|
||||
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
|
||||
(echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
|
||||
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
|
||||
apt-get update; \
|
||||
apt-get install -y quilt; \
|
||||
apt-get install -y vitastor-client vitastor-client-dev quilt; \
|
||||
mkdir -p /root/packages/qemu-$REL; \
|
||||
rm -rf /root/packages/qemu-$REL/*; \
|
||||
cd /root/packages/qemu-$REL; \
|
||||
@@ -54,7 +48,7 @@ RUN set -e; \
|
||||
quilt add block/vitastor.c; \
|
||||
cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
|
||||
quilt refresh; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
|
||||
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
rm -rf /root/packages/qemu-$REL/qemu-*/
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -35,8 +35,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.9.6; \
|
||||
cd vitastor-0.9.6; \
|
||||
cp -r /root/vitastor vitastor-0.9.2; \
|
||||
cd vitastor-0.9.2; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -49,8 +49,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.6.orig.tar.xz vitastor-0.9.6; \
|
||||
cd vitastor-0.9.6; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.2.orig.tar.xz vitastor-0.9.2; \
|
||||
cd vitastor-0.9.2; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -33,13 +33,12 @@ Size of objects (data blocks) into which all physical and virtual drives
|
||||
in Vitastor, affects memory usage, write amplification and I/O load
|
||||
distribution effectiveness.
|
||||
|
||||
Recommended default block size is 128 KB for SSD and 1 MB for HDD. In fact,
|
||||
it's possible to use 1 MB for SSD too - it will lower memory usage, but
|
||||
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
|
||||
it's possible to use 4 MB for SSD too - it will lower memory usage, but
|
||||
may increase average WA and reduce linear performance.
|
||||
|
||||
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
|
||||
544 MB per 1 TB of used disk space with the default 128 KB block size.
|
||||
With 1 MB it's 8 times lower.
|
||||
|
||||
## bitmap_granularity
|
||||
|
||||
|
@@ -33,14 +33,14 @@ OSD) могут сосуществовать в одном кластере Vita
|
||||
настроек, влияет на потребление памяти, объём избыточной записи (write
|
||||
amplification) и эффективность распределения нагрузки по OSD.
|
||||
|
||||
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 1 мегабайт
|
||||
для HDD. В принципе, для SSD можно тоже использовать блок размером 1 мегабайт,
|
||||
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
|
||||
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
|
||||
это понизит использование памяти, но ухудшит распределение нагрузки и в
|
||||
среднем увеличит WA.
|
||||
|
||||
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
|
||||
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
|
||||
стандартном 128 КБ блоке. При 1 МБ блоке памяти нужно в 8 раз меньше.
|
||||
стандартном 128 КБ блоке.
|
||||
|
||||
## bitmap_granularity
|
||||
|
||||
|
@@ -7,27 +7,26 @@
|
||||
in Vitastor, affects memory usage, write amplification and I/O load
|
||||
distribution effectiveness.
|
||||
|
||||
Recommended default block size is 128 KB for SSD and 1 MB for HDD. In fact,
|
||||
it's possible to use 1 MB for SSD too - it will lower memory usage, but
|
||||
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
|
||||
it's possible to use 4 MB for SSD too - it will lower memory usage, but
|
||||
may increase average WA and reduce linear performance.
|
||||
|
||||
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
|
||||
544 MB per 1 TB of used disk space with the default 128 KB block size.
|
||||
With 1 MB it's 8 times lower.
|
||||
info_ru: |
|
||||
Размер объектов (блоков данных), на которые делятся физические и виртуальные
|
||||
диски в Vitastor (в рамках каждого пула). Одна из ключевых на данный момент
|
||||
настроек, влияет на потребление памяти, объём избыточной записи (write
|
||||
amplification) и эффективность распределения нагрузки по OSD.
|
||||
|
||||
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 1 мегабайт
|
||||
для HDD. В принципе, для SSD можно тоже использовать блок размером 1 мегабайт,
|
||||
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
|
||||
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
|
||||
это понизит использование памяти, но ухудшит распределение нагрузки и в
|
||||
среднем увеличит WA.
|
||||
|
||||
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
|
||||
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
|
||||
стандартном 128 КБ блоке. При 1 МБ блоке памяти нужно в 8 раз меньше.
|
||||
стандартном 128 КБ блоке.
|
||||
- name: bitmap_granularity
|
||||
type: int
|
||||
default: 4096
|
||||
|
@@ -1,4 +1,4 @@
|
||||
[Документация](../../README-ru.md#документация) → Установка → Proxmox VE
|
||||
[Документация](../../README-ru.md#документация) → Установка → Proxmox
|
||||
|
||||
-----
|
||||
|
||||
|
@@ -21,7 +21,7 @@
|
||||
|
||||
## Basic instructions
|
||||
|
||||
Download source, for example using git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
|
||||
Download source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
|
||||
|
||||
Get `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build fio engine,
|
||||
you can disable it by passing `-DWITH_FIO=no` to cmake.
|
||||
@@ -41,7 +41,7 @@ It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
|
||||
QEMU build process. To do that:
|
||||
- Install vitastor client library headers (from source or from vitastor-client-dev package)
|
||||
- Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source
|
||||
- Copy `src/qemu_driver.c` to QEMU source directory as `block/vitastor.c`
|
||||
- Copy `src/qemu_driver.c` to QEMU source directory as `block/block-vitastor.c`
|
||||
- Build QEMU as usual
|
||||
|
||||
But it is also possible to build it out-of-tree. To do that:
|
||||
|
@@ -21,7 +21,7 @@
|
||||
|
||||
## Базовая инструкция
|
||||
|
||||
Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
|
||||
Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
|
||||
|
||||
Скачайте исходные коды пакета `fio`, распакуйте их и создайте символическую ссылку на них
|
||||
в директории исходников Vitastor: `<vitastor>/fio`. Либо, если вы не хотите собирать плагин fio,
|
||||
@@ -41,7 +41,7 @@ cmake .. && make -j8 install
|
||||
Драйвер QEMU (qemu_driver.c) рекомендуется собирать вместе с самим QEMU. Для этого:
|
||||
- Установите заголовки клиентской библиотеки Vitastor (из исходников или из пакета vitastor-client-dev)
|
||||
- Возьмите соответствующий патч из `patches/qemu-*-vitastor.patch` и примените его к исходникам QEMU
|
||||
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/vitastor.c`
|
||||
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/block-vitastor.c`
|
||||
- Соберите QEMU как обычно
|
||||
|
||||
Однако в целях отладки драйвер также можно собирать отдельно от QEMU. Для этого:
|
||||
@@ -60,7 +60,7 @@ cmake .. && make -j8 install
|
||||
* Для QEMU 2.0+: `<qemu>/qapi-types.h` → `<vitastor>/qemu/b/qemu/qapi-types.h`
|
||||
- `config-host.h` и `qapi` нужны, т.к. в них содержатся автогенерируемые заголовки
|
||||
- Сконфигурируйте cmake Vitastor с `WITH_QEMU=yes` (`cmake .. -DWITH_QEMU=yes`) и, если вы
|
||||
используете RHEL-подобный дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
|
||||
используете RHEL-подобый дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
|
||||
- После этого в процессе сборки Vitastor также будет собираться подходящий для вашей
|
||||
версии QEMU `block-vitastor.so`.
|
||||
- Таким образом можно использовать драйвер даже с немодифицированным QEMU, но в этом случае
|
||||
|
@@ -7,7 +7,6 @@
|
||||
# Quick Start
|
||||
|
||||
- [Preparation](#preparation)
|
||||
- [Recommended drives](#recommended-drives)
|
||||
- [Configure monitors](#configure-monitors)
|
||||
- [Configure OSDs](#configure-osds)
|
||||
- [Create a pool](#create-a-pool)
|
||||
@@ -20,20 +19,10 @@
|
||||
- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs
|
||||
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
|
||||
[here](../config/layout-cluster.en.md#immediate_commit).
|
||||
- If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
|
||||
Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
|
||||
you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
|
||||
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
|
||||
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
|
||||
- [Install Vitastor packages](../installation/packages.en.md).
|
||||
|
||||
## Recommended drives
|
||||
|
||||
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
|
||||
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
|
||||
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
|
||||
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
|
||||
|
||||
## Configure monitors
|
||||
|
||||
On the monitor hosts:
|
||||
@@ -56,10 +45,9 @@ On the monitor hosts:
|
||||
}
|
||||
```
|
||||
- Initialize OSDs:
|
||||
- SSD-only or HDD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`.
|
||||
Add `--disable_data_fsync off` to leave disk write cache enabled if you use
|
||||
desktop SSDs without capacitors. Do NOT add `--disable_data_fsync off` if you
|
||||
use HDDs or SSD+HDD.
|
||||
- SSD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. You can add
|
||||
`--disable_data_fsync off` to leave disk cache enabled if you use desktop
|
||||
SSDs without capacitors.
|
||||
- Hybrid, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
|
||||
Pass all your devices (HDD and SSD) to this script — it will partition disks and initialize journals on its own.
|
||||
This script skips HDDs which are already partitioned so if you want to use non-empty disks for
|
||||
|
@@ -7,7 +7,6 @@
|
||||
# Быстрый старт
|
||||
|
||||
- [Подготовка](#подготовка)
|
||||
- [Рекомендуемые диски](#рекомендуемые-диски)
|
||||
- [Настройте мониторы](#настройте-мониторы)
|
||||
- [Настройте OSD](#настройте-osd)
|
||||
- [Создайте пул](#создайте-пул)
|
||||
@@ -20,20 +19,10 @@
|
||||
- Возьмите серверы с SSD (SATA или NVMe), желательно с конденсаторами (серверные SSD). Можно
|
||||
использовать и десктопные SSD, включив режим отложенного fsync, но производительность будет хуже.
|
||||
О конденсаторах читайте [здесь](../config/layout-cluster.ru.md#immediate_commit).
|
||||
- Если хотите использовать HDD, берите современные модели с Media или SSD кэшем - HGST Ultrastar,
|
||||
Toshiba MG08, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
|
||||
обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места).
|
||||
- Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2.
|
||||
- Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
|
||||
- [Установите пакеты Vitastor](../installation/packages.ru.md).
|
||||
|
||||
## Рекомендуемые диски
|
||||
|
||||
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
|
||||
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
|
||||
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
|
||||
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
|
||||
|
||||
## Настройте мониторы
|
||||
|
||||
На хостах, выделенных под мониторы:
|
||||
@@ -56,10 +45,9 @@
|
||||
}
|
||||
```
|
||||
- Инициализуйте OSD:
|
||||
- Только SSD или только HDD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`.
|
||||
Если вы используете десктопные SSD без конденсаторов, добавьте опцию `--disable_data_fsync off`,
|
||||
чтобы оставить кэш записи диска включённым. НЕ добавляйте эту опцию, если используете
|
||||
жёсткие диски (HDD).
|
||||
- SSD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. Если вы используете
|
||||
десктопные SSD без конденсаторов, можете оставить кэш включённым, добавив
|
||||
опцию `--disable_data_fsync off`.
|
||||
- Гибридные, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
|
||||
Передайте все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит
|
||||
разделы под журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы
|
||||
|
@@ -13,8 +13,6 @@ remains decent (see an example [here](../performance/comparison1.en.md#vitastor-
|
||||
|
||||
Vitastor Kubernetes CSI driver is based on NBD.
|
||||
|
||||
See also [VDUSE](qemu.en.md#vduse).
|
||||
|
||||
## Map image
|
||||
|
||||
To create a local block device for a Vitastor image run:
|
||||
|
@@ -16,8 +16,6 @@ NBD немного снижает производительность из-за
|
||||
|
||||
CSI-драйвер Kubernetes Vitastor основан на NBD.
|
||||
|
||||
Смотрите также [VDUSE](qemu.ru.md#vduse).
|
||||
|
||||
## Подключить устройство
|
||||
|
||||
Чтобы создать локальное блочное устройство для образа, выполните команду:
|
||||
|
@@ -83,44 +83,3 @@ qemu-img rebase -u -b '' testimg.qcow2
|
||||
This can be used for backups. Just note that exporting an image that is currently being written to
|
||||
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
|
||||
on a live VM.
|
||||
|
||||
## VDUSE
|
||||
|
||||
Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
|
||||
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
|
||||
exporting QEMU block devices over this protocol using qemu-storage-daemon.
|
||||
|
||||
VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
|
||||
for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
|
||||
hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
|
||||
In this case reboot will be the only way to remove VDUSE devices from system.
|
||||
|
||||
On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
|
||||
performance is important for you. Approximate performance numbers:
|
||||
direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
|
||||
|
||||
To try VDUSE you need at least Linux 5.15, built with VDUSE support
|
||||
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
|
||||
disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
|
||||
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
|
||||
|
||||
Commands to attach Vitastor image as a VDUSE device:
|
||||
|
||||
```
|
||||
modprobe vduse
|
||||
modprobe virtio-vdpa
|
||||
qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
|
||||
"etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
|
||||
--export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
|
||||
vdpa dev add name test1 mgmtdev vduse
|
||||
```
|
||||
|
||||
After running these commands /dev/vda device will appear in the system and you'll be able to
|
||||
use it as a normal disk.
|
||||
|
||||
To remove the device:
|
||||
|
||||
```
|
||||
vdpa dev del test1
|
||||
kill <qemu-storage-daemon_process_PID>
|
||||
```
|
||||
|
@@ -87,44 +87,3 @@ qemu-img rebase -u -b '' testimg.qcow2
|
||||
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
|
||||
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
|
||||
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
|
||||
|
||||
## VDUSE
|
||||
|
||||
В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков
|
||||
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
|
||||
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
|
||||
|
||||
VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
|
||||
подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
|
||||
процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
|
||||
через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
|
||||
|
||||
С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
|
||||
быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
|
||||
прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
|
||||
|
||||
Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
|
||||
VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
|
||||
отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
|
||||
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
|
||||
|
||||
Команды для подключения виртуального диска через VDUSE:
|
||||
|
||||
```
|
||||
modprobe vduse
|
||||
modprobe virtio-vdpa
|
||||
qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
|
||||
"etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
|
||||
--export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
|
||||
vdpa dev add name test1 mgmtdev vduse
|
||||
```
|
||||
|
||||
После этого в системе появится устройство /dev/vda, которое можно будет использовать как
|
||||
обычный диск.
|
||||
|
||||
Для удаления устройства из системы:
|
||||
|
||||
```
|
||||
vdpa dev del test1
|
||||
kill <PID_процесса_qemu-storage-daemon>
|
||||
```
|
||||
|
@@ -63,9 +63,8 @@ Wants=network-online.target local-fs.target time-sync.target
|
||||
|
||||
[Service]
|
||||
Restart=always
|
||||
Environment=GOGC=50
|
||||
ExecStart=etcd -name etcd${num} --data-dir /var/lib/etcd${num}.etcd \\
|
||||
--snapshot-count 10000 --advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
|
||||
--advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
|
||||
--initial-advertise-peer-urls http://${etcds[num]}:2380 --listen-peer-urls http://${etcds[num]}:2380 \\
|
||||
--initial-cluster-token vitastor-etcd-1 --initial-cluster ${etcd_cluster} \\
|
||||
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\
|
||||
|
34
mon/mon.js
34
mon/mon.js
@@ -1497,14 +1497,10 @@ class Mon
|
||||
break;
|
||||
}
|
||||
}
|
||||
const pool_cfg = (this.state.config.pools[pool_id]||{});
|
||||
if (!object_size)
|
||||
{
|
||||
object_size = pool_cfg.block_size || this.config.block_size || 131072;
|
||||
}
|
||||
if (pool_cfg.scheme !== 'replicated')
|
||||
{
|
||||
object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
|
||||
object_size = (this.state.config.pools[pool_id]||{}).block_size ||
|
||||
this.config.block_size || 131072;
|
||||
}
|
||||
object_size = BigInt(object_size);
|
||||
for (const pg_num in this.state.pg.stats[pool_id])
|
||||
@@ -1612,7 +1608,7 @@ class Mon
|
||||
}
|
||||
}
|
||||
}
|
||||
return { inode_stats, seen_pools };
|
||||
return inode_stats;
|
||||
}
|
||||
|
||||
serialize_bigints(obj)
|
||||
@@ -1638,7 +1634,7 @@ class Mon
|
||||
const timestamp = Date.now();
|
||||
const { object_counts, object_bytes } = this.sum_object_counts();
|
||||
let stats = this.sum_op_stats(timestamp, this.prev_stats);
|
||||
let { inode_stats, seen_pools } = this.sum_inode_stats(
|
||||
let inode_stats = this.sum_inode_stats(
|
||||
this.prev_stats ? this.prev_stats.inode_stats : null,
|
||||
timestamp, this.prev_stats ? this.prev_stats.timestamp : null
|
||||
);
|
||||
@@ -1673,22 +1669,12 @@ class Mon
|
||||
}
|
||||
for (const pool_id in this.state.pool.stats)
|
||||
{
|
||||
if (!seen_pools[pool_id])
|
||||
{
|
||||
txn.push({ requestDeleteRange: {
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
} });
|
||||
delete this.state.pool.stats[pool_id];
|
||||
}
|
||||
else
|
||||
{
|
||||
const pool_stats = { ...this.state.pool.stats[pool_id] };
|
||||
this.serialize_bigints(pool_stats);
|
||||
txn.push({ requestPut: {
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
value: b64(JSON.stringify(pool_stats)),
|
||||
} });
|
||||
}
|
||||
const pool_stats = { ...this.state.pool.stats[pool_id] };
|
||||
this.serialize_bigints(pool_stats);
|
||||
txn.push({ requestPut: {
|
||||
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
|
||||
value: b64(JSON.stringify(pool_stats)),
|
||||
} });
|
||||
}
|
||||
if (txn.length)
|
||||
{
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.9.6'
|
||||
VERSION = '0.9.2'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -1,176 +0,0 @@
|
||||
diff --git a/block/Makefile.objs b/block/Makefile.objs
|
||||
index d644bac60a..e404236291 100644
|
||||
--- a/block/Makefile.objs
|
||||
+++ b/block/Makefile.objs
|
||||
@@ -19,6 +19,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
|
||||
block-obj-$(CONFIG_LIBNFS) += nfs.o
|
||||
block-obj-$(CONFIG_CURL) += curl.o
|
||||
block-obj-$(CONFIG_RBD) += rbd.o
|
||||
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
|
||||
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
|
||||
block-obj-$(CONFIG_VXHS) += vxhs.o
|
||||
block-obj-$(CONFIG_LIBSSH2) += ssh.o
|
||||
@@ -39,6 +40,8 @@ curl.o-cflags := $(CURL_CFLAGS)
|
||||
curl.o-libs := $(CURL_LIBS)
|
||||
rbd.o-cflags := $(RBD_CFLAGS)
|
||||
rbd.o-libs := $(RBD_LIBS)
|
||||
+vitastor.o-cflags := $(VITASTOR_CFLAGS)
|
||||
+vitastor.o-libs := $(VITASTOR_LIBS)
|
||||
gluster.o-cflags := $(GLUSTERFS_CFLAGS)
|
||||
gluster.o-libs := $(GLUSTERFS_LIBS)
|
||||
vxhs.o-libs := $(VXHS_LIBS)
|
||||
diff --git a/configure b/configure
|
||||
index 0a19b033bc..58b7fbf24c 100755
|
||||
--- a/configure
|
||||
+++ b/configure
|
||||
@@ -398,6 +398,7 @@ trace_backends="log"
|
||||
trace_file="trace"
|
||||
spice=""
|
||||
rbd=""
|
||||
+vitastor=""
|
||||
smartcard=""
|
||||
libusb=""
|
||||
usb_redir=""
|
||||
@@ -1213,6 +1214,10 @@ for opt do
|
||||
;;
|
||||
--enable-rbd) rbd="yes"
|
||||
;;
|
||||
+ --disable-vitastor) vitastor="no"
|
||||
+ ;;
|
||||
+ --enable-vitastor) vitastor="yes"
|
||||
+ ;;
|
||||
--disable-xfsctl) xfs="no"
|
||||
;;
|
||||
--enable-xfsctl) xfs="yes"
|
||||
@@ -1601,6 +1606,7 @@ disabled with --disable-FEATURE, default is enabled if available:
|
||||
vhost-crypto vhost-crypto acceleration support
|
||||
spice spice
|
||||
rbd rados block device (rbd)
|
||||
+ vitastor vitastor block device
|
||||
libiscsi iscsi support
|
||||
libnfs nfs support
|
||||
smartcard smartcard support (libcacard)
|
||||
@@ -3594,6 +3600,27 @@ EOF
|
||||
fi
|
||||
fi
|
||||
|
||||
+##########################################
|
||||
+# vitastor probe
|
||||
+if test "$vitastor" != "no" ; then
|
||||
+ cat > $TMPC <<EOF
|
||||
+#include <vitastor_c.h>
|
||||
+int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+}
|
||||
+EOF
|
||||
+ vitastor_libs="-lvitastor_client"
|
||||
+ if compile_prog "" "$vitastor_libs" ; then
|
||||
+ vitastor=yes
|
||||
+ else
|
||||
+ if test "$vitastor" = "yes" ; then
|
||||
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
|
||||
+ fi
|
||||
+ vitastor=no
|
||||
+ fi
|
||||
+fi
|
||||
+
|
||||
##########################################
|
||||
# libssh2 probe
|
||||
min_libssh2_version=1.2.8
|
||||
@@ -5837,6 +5864,7 @@ echo "Trace output file $trace_file-<pid>"
|
||||
fi
|
||||
echo "spice support $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
|
||||
echo "rbd support $rbd"
|
||||
+echo "vitastor support $vitastor"
|
||||
echo "xfsctl support $xfs"
|
||||
echo "smartcard support $smartcard"
|
||||
echo "libusb $libusb"
|
||||
@@ -6416,6 +6444,11 @@ if test "$rbd" = "yes" ; then
|
||||
echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
|
||||
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
|
||||
fi
|
||||
+if test "$vitastor" = "yes" ; then
|
||||
+ echo "CONFIG_VITASTOR=m" >> $config_host_mak
|
||||
+ echo "VITASTOR_CFLAGS=$vitastor_cflags" >> $config_host_mak
|
||||
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
|
||||
+fi
|
||||
|
||||
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
|
||||
if test "$coroutine_pool" = "yes" ; then
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index c50517bff3..c780bb2c1c 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -2514,7 +2514,7 @@
|
||||
'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
|
||||
'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
|
||||
'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
|
||||
- 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
|
||||
+ 'quorum', 'raw', 'rbd', 'vitastor', 'replication', 'sheepdog', 'ssh',
|
||||
'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
|
||||
|
||||
##
|
||||
@@ -3217,6 +3217,28 @@
|
||||
'*snap-id': 'uint32',
|
||||
'*tag': 'str' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -3547,6 +3569,7 @@
|
||||
'rbd': 'BlockdevOptionsRbd',
|
||||
'replication':'BlockdevOptionsReplication',
|
||||
'sheepdog': 'BlockdevOptionsSheepdog',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'ssh': 'BlockdevOptionsSsh',
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
@@ -3991,6 +4014,17 @@
|
||||
'*subformat': 'BlockdevVhdxSubformat',
|
||||
'*block-state-zero': 'bool' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVpcSubformat:
|
||||
#
|
||||
@@ -4074,6 +4108,7 @@
|
||||
'rbd': 'BlockdevCreateOptionsRbd',
|
||||
'replication': 'BlockdevCreateNotSupported',
|
||||
'sheepdog': 'BlockdevCreateOptionsSheepdog',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'throttle': 'BlockdevCreateNotSupported',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
@@ -1,181 +0,0 @@
|
||||
Index: qemu-5.2+dfsg/qapi/block-core.json
|
||||
===================================================================
|
||||
--- qemu-5.2+dfsg.orig/qapi/block-core.json
|
||||
+++ qemu-5.2+dfsg/qapi/block-core.json
|
||||
@@ -2831,7 +2831,7 @@
|
||||
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
|
||||
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
|
||||
- 'sheepdog',
|
||||
+ 'sheepdog', 'vitastor',
|
||||
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
|
||||
##
|
||||
@@ -3668,6 +3668,28 @@
|
||||
'*tag': 'str' } }
|
||||
|
||||
##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
+##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
# An enumeration of replication modes.
|
||||
@@ -4015,6 +4037,7 @@
|
||||
'replication': { 'type': 'BlockdevOptionsReplication',
|
||||
'if': 'defined(CONFIG_REPLICATION)' },
|
||||
'sheepdog': 'BlockdevOptionsSheepdog',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'ssh': 'BlockdevOptionsSsh',
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
@@ -4404,6 +4427,17 @@
|
||||
'*cluster-size' : 'size' } }
|
||||
|
||||
##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
+##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
# Subformat options for VMDK images
|
||||
@@ -4665,6 +4699,7 @@
|
||||
'qed': 'BlockdevCreateOptionsQed',
|
||||
'rbd': 'BlockdevCreateOptionsRbd',
|
||||
'sheepdog': 'BlockdevCreateOptionsSheepdog',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
Index: qemu-5.2+dfsg/block/meson.build
|
||||
===================================================================
|
||||
--- qemu-5.2+dfsg.orig/block/meson.build
|
||||
+++ qemu-5.2+dfsg/block/meson.build
|
||||
@@ -76,6 +76,7 @@ foreach m : [
|
||||
['CONFIG_LIBNFS', 'nfs', libnfs, 'nfs.c'],
|
||||
['CONFIG_LIBSSH', 'ssh', libssh, 'ssh.c'],
|
||||
['CONFIG_RBD', 'rbd', rbd, 'rbd.c'],
|
||||
+ ['CONFIG_VITASTOR', 'vitastor', vitastor, 'vitastor.c'],
|
||||
]
|
||||
if config_host.has_key(m[0])
|
||||
if enable_modules
|
||||
Index: qemu-5.2+dfsg/configure
|
||||
===================================================================
|
||||
--- qemu-5.2+dfsg.orig/configure
|
||||
+++ qemu-5.2+dfsg/configure
|
||||
@@ -372,6 +372,7 @@ trace_backends="log"
|
||||
trace_file="trace"
|
||||
spice=""
|
||||
rbd=""
|
||||
+vitastor=""
|
||||
smartcard=""
|
||||
u2f="auto"
|
||||
libusb=""
|
||||
@@ -1263,6 +1264,10 @@ for opt do
|
||||
;;
|
||||
--enable-rbd) rbd="yes"
|
||||
;;
|
||||
+ --disable-vitastor) vitastor="no"
|
||||
+ ;;
|
||||
+ --enable-vitastor) vitastor="yes"
|
||||
+ ;;
|
||||
--disable-xfsctl) xfs="no"
|
||||
;;
|
||||
--enable-xfsctl) xfs="yes"
|
||||
@@ -1827,6 +1832,7 @@ disabled with --disable-FEATURE, default
|
||||
vhost-vdpa vhost-vdpa kernel backend support
|
||||
spice spice
|
||||
rbd rados block device (rbd)
|
||||
+ vitastor vitastor block device
|
||||
libiscsi iscsi support
|
||||
libnfs nfs support
|
||||
smartcard smartcard support (libcacard)
|
||||
@@ -3719,6 +3725,27 @@ EOF
|
||||
fi
|
||||
|
||||
##########################################
|
||||
+# vitastor probe
|
||||
+if test "$vitastor" != "no" ; then
|
||||
+ cat > $TMPC <<EOF
|
||||
+#include <vitastor_c.h>
|
||||
+int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+}
|
||||
+EOF
|
||||
+ vitastor_libs="-lvitastor_client"
|
||||
+ if compile_prog "" "$vitastor_libs" ; then
|
||||
+ vitastor=yes
|
||||
+ else
|
||||
+ if test "$vitastor" = "yes" ; then
|
||||
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
|
||||
+ fi
|
||||
+ vitastor=no
|
||||
+ fi
|
||||
+fi
|
||||
+
|
||||
+##########################################
|
||||
# libssh probe
|
||||
if test "$libssh" != "no" ; then
|
||||
if $pkg_config --exists libssh; then
|
||||
@@ -6456,6 +6483,10 @@ if test "$rbd" = "yes" ; then
|
||||
echo "CONFIG_RBD=y" >> $config_host_mak
|
||||
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
|
||||
fi
|
||||
+if test "$vitastor" = "yes" ; then
|
||||
+ echo "CONFIG_VITASTOR=y" >> $config_host_mak
|
||||
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
|
||||
+fi
|
||||
|
||||
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
|
||||
if test "$coroutine_pool" = "yes" ; then
|
||||
Index: qemu-5.2+dfsg/meson.build
|
||||
===================================================================
|
||||
--- qemu-5.2+dfsg.orig/meson.build
|
||||
+++ qemu-5.2+dfsg/meson.build
|
||||
@@ -596,6 +596,10 @@ rbd = not_found
|
||||
if 'CONFIG_RBD' in config_host
|
||||
rbd = declare_dependency(link_args: config_host['RBD_LIBS'].split())
|
||||
endif
|
||||
+vitastor = not_found
|
||||
+if 'CONFIG_VITASTOR' in config_host
|
||||
+ vitastor = declare_dependency(link_args: config_host['VITASTOR_LIBS'].split())
|
||||
+endif
|
||||
glusterfs = not_found
|
||||
if 'CONFIG_GLUSTERFS' in config_host
|
||||
glusterfs = declare_dependency(compile_args: config_host['GLUSTERFS_CFLAGS'].split(),
|
||||
@@ -2145,6 +2149,7 @@ endif
|
||||
# TODO: add back protocol and server version
|
||||
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
|
||||
summary_info += {'rbd support': config_host.has_key('CONFIG_RBD')}
|
||||
+summary_info += {'vitastor support': config_host.has_key('CONFIG_VITASTOR')}
|
||||
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
|
||||
summary_info += {'smartcard support': config_host.has_key('CONFIG_SMARTCARD')}
|
||||
summary_info += {'U2F support': u2f.found()}
|
@@ -24,4 +24,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.9.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.6$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.9.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.2$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -22,7 +22,7 @@
|
||||
Name: qemu-kvm
|
||||
Version: 4.2.0
|
||||
-Release: 29.vitastor%{?dist}.6
|
||||
+Release: 34.vitastor%{?dist}.6
|
||||
+Release: 32.vitastor%{?dist}.6
|
||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
||||
Epoch: 15
|
||||
License: GPLv2 and GPLv2+ and CC-BY
|
||||
|
@@ -13,7 +13,7 @@
|
||||
Name: qemu-kvm
|
||||
Version: 4.2.0
|
||||
-Release: 29%{?dist}.6
|
||||
+Release: 33.vitastor%{?dist}.6
|
||||
+Release: 32.vitastor%{?dist}.6
|
||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
||||
Epoch: 15
|
||||
License: GPLv2 and GPLv2+ and CC-BY
|
||||
|
@@ -1,103 +0,0 @@
|
||||
--- qemu-kvm-6.2.spec.orig 2023-07-18 13:52:57.636625440 +0000
|
||||
+++ qemu-kvm-6.2.spec 2023-07-18 13:52:19.011683886 +0000
|
||||
@@ -73,6 +73,7 @@ Requires: %{name}-hw-usbredir = %{epoch}
|
||||
%endif \
|
||||
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
|
||||
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
|
||||
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
|
||||
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
|
||||
|
||||
# Macro to properly setup RHEL/RHEV conflict handling
|
||||
@@ -83,7 +84,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version
|
||||
Summary: QEMU is a machine emulator and virtualizer
|
||||
Name: qemu-kvm
|
||||
Version: 6.2.0
|
||||
-Release: 32%{?rcrel}%{?dist}
|
||||
+Release: 32.vitastor%{?rcrel}%{?dist}
|
||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
||||
Epoch: 15
|
||||
License: GPLv2 and GPLv2+ and CC-BY
|
||||
@@ -122,6 +123,7 @@ Source37: tests_data_acpi_pc_SSDT.dimmpx
|
||||
Source38: tests_data_acpi_q35_FACP.slic
|
||||
Source39: tests_data_acpi_q35_SSDT.dimmpxm
|
||||
Source40: tests_data_acpi_virt_SSDT.memhp
|
||||
+Source41: qemu-vitastor.c
|
||||
|
||||
Patch0001: 0001-redhat-Adding-slirp-to-the-exploded-tree.patch
|
||||
Patch0005: 0005-Initial-redhat-build.patch
|
||||
@@ -652,6 +654,7 @@ Patch255: kvm-scsi-protect-req-aiocb-wit
|
||||
Patch256: kvm-dma-helpers-prevent-dma_blk_cb-vs-dma_aio_cancel-rac.patch
|
||||
# For bz#2090990 - qemu crash with error scsi_req_unref(SCSIRequest *): Assertion `req->refcount > 0' failed or scsi_dma_complete(void *, int): Assertion `r->req.aiocb != NULL' failed [8.7.0]
|
||||
Patch257: kvm-virtio-scsi-reset-SCSI-devices-from-main-loop-thread.patch
|
||||
+Patch258: qemu-6.2-vitastor.patch
|
||||
|
||||
BuildRequires: wget
|
||||
BuildRequires: rpm-build
|
||||
@@ -689,6 +692,7 @@ BuildRequires: libcurl-devel
|
||||
BuildRequires: libssh-devel
|
||||
BuildRequires: librados-devel
|
||||
BuildRequires: librbd-devel
|
||||
+BuildRequires: vitastor-client-devel
|
||||
%if %{have_gluster}
|
||||
# For gluster block driver
|
||||
BuildRequires: glusterfs-api-devel
|
||||
@@ -926,6 +930,14 @@ Install this package if you want to acce
|
||||
using the rbd protocol.
|
||||
|
||||
|
||||
+%package block-vitastor
|
||||
+Summary: QEMU Vitastor block driver
|
||||
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
+
|
||||
+%description block-vitastor
|
||||
+This package provides the additional Vitastor block driver for QEMU.
|
||||
+
|
||||
+
|
||||
%package block-ssh
|
||||
Summary: QEMU SSH block driver
|
||||
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
@@ -979,6 +991,7 @@ This package provides usbredir support.
|
||||
rm -fr slirp
|
||||
mkdir slirp
|
||||
%autopatch -p1
|
||||
+cp %{SOURCE41} ./block/vitastor.c
|
||||
|
||||
%global qemu_kvm_build qemu_kvm_build
|
||||
mkdir -p %{qemu_kvm_build}
|
||||
@@ -994,7 +1007,7 @@ cp -f %{SOURCE40} tests/data/acpi/virt/S
|
||||
# --build-id option is used for giving info to the debug packages.
|
||||
buildldflags="VL_LDFLAGS=-Wl,--build-id"
|
||||
|
||||
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
|
||||
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle
|
||||
|
||||
%if 0%{have_gluster}
|
||||
%global block_drivers_list %{block_drivers_list},gluster
|
||||
@@ -1149,9 +1162,7 @@ pushd %{qemu_kvm_build}
|
||||
--firmwarepath=%{_prefix}/share/qemu-firmware \
|
||||
--meson="git" \
|
||||
--target-list="%{buildarch}" \
|
||||
- --block-drv-rw-whitelist=%{block_drivers_list} \
|
||||
--audio-drv-list= \
|
||||
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
|
||||
--with-coroutine=ucontext \
|
||||
--with-git=git \
|
||||
--tls-priority=@QEMU,SYSTEM \
|
||||
@@ -1197,6 +1208,7 @@ pushd %{qemu_kvm_build}
|
||||
%endif
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+ --enable-vitastor \
|
||||
%if 0%{have_librdma}
|
||||
--enable-rdma \
|
||||
%endif
|
||||
@@ -1794,6 +1806,9 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.
|
||||
%files block-rbd
|
||||
%{_libdir}/qemu-kvm/block-rbd.so
|
||||
|
||||
+%files block-vitastor
|
||||
+%{_libdir}/qemu-kvm/block-vitastor.so
|
||||
+
|
||||
%files block-ssh
|
||||
%{_libdir}/qemu-kvm/block-ssh.so
|
||||
|
@@ -1,93 +0,0 @@
|
||||
--- qemu-kvm-7.2.spec.orig 2023-06-22 13:56:19.000000000 +0000
|
||||
+++ qemu-kvm-7.2.spec 2023-07-18 07:55:22.347090196 +0000
|
||||
@@ -100,8 +100,6 @@
|
||||
%endif
|
||||
|
||||
%global target_list %{kvm_target}-softmmu
|
||||
-%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress
|
||||
-%global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https
|
||||
%define qemudocdir %{_docdir}/%{name}
|
||||
%global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios"
|
||||
|
||||
@@ -126,6 +124,7 @@ Requires: %{name}-device-usb-host = %{ep
|
||||
Requires: %{name}-device-usb-redirect = %{epoch}:%{version}-%{release} \
|
||||
%endif \
|
||||
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
|
||||
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
|
||||
Requires: %{name}-audio-pa = %{epoch}:%{version}-%{release}
|
||||
|
||||
# Since SPICE is removed from RHEL-9, the following Obsoletes:
|
||||
@@ -148,7 +147,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}
|
||||
Summary: QEMU is a machine emulator and virtualizer
|
||||
Name: qemu-kvm
|
||||
Version: 7.2.0
|
||||
-Release: 14%{?rcrel}%{?dist}%{?cc_suffix}.1
|
||||
+Release: 14.vitastor%{?rcrel}%{?dist}%{?cc_suffix}.1
|
||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
||||
# Epoch 15 used for RHEL 8
|
||||
# Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
|
||||
@@ -171,6 +170,7 @@ Source28: 95-kvm-memlock.conf
|
||||
Source30: kvm-s390x.conf
|
||||
Source31: kvm-x86.conf
|
||||
Source36: README.tests
|
||||
+Source37: qemu-vitastor.c
|
||||
|
||||
|
||||
Patch0004: 0004-Initial-redhat-build.patch
|
||||
@@ -418,6 +418,7 @@ Patch134: kvm-target-i386-Fix-BZHI-instr
|
||||
Patch135: kvm-intel-iommu-fail-DEVIOTLB_UNMAP-without-dt-mode.patch
|
||||
# For bz#2203745 - Disk detach is unsuccessful while the guest is still booting [rhel-9.2.0.z]
|
||||
Patch136: kvm-acpi-pcihp-allow-repeating-hot-unplug-requests.patch
|
||||
+Patch137: qemu-7.2-vitastor.patch
|
||||
|
||||
%if %{have_clang}
|
||||
BuildRequires: clang
|
||||
@@ -449,6 +450,7 @@ BuildRequires: libcurl-devel
|
||||
%if %{have_block_rbd}
|
||||
BuildRequires: librbd-devel
|
||||
%endif
|
||||
+BuildRequires: vitastor-client-devel
|
||||
# We need both because the 'stap' binary is probed for by configure
|
||||
BuildRequires: systemtap
|
||||
BuildRequires: systemtap-sdt-devel
|
||||
@@ -642,6 +644,14 @@ using the rbd protocol.
|
||||
%endif
|
||||
|
||||
|
||||
+%package block-vitastor
|
||||
+Summary: QEMU Vitastor block driver
|
||||
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
+
|
||||
+%description block-vitastor
|
||||
+This package provides the additional Vitastor block driver for QEMU.
|
||||
+
|
||||
+
|
||||
%package audio-pa
|
||||
Summary: QEMU PulseAudio audio driver
|
||||
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
@@ -719,6 +729,7 @@ This package provides usbredir support.
|
||||
%prep
|
||||
%setup -q -n qemu-%{version}%{?rcstr}
|
||||
%autopatch -p1
|
||||
+cp %{SOURCE37} ./block/vitastor.c
|
||||
|
||||
%global qemu_kvm_build qemu_kvm_build
|
||||
mkdir -p %{qemu_kvm_build}
|
||||
@@ -946,6 +957,7 @@ run_configure \
|
||||
%if %{have_block_rbd}
|
||||
--enable-rbd \
|
||||
%endif
|
||||
+ --enable-vitastor \
|
||||
%if %{have_librdma}
|
||||
--enable-rdma \
|
||||
%endif
|
||||
@@ -1426,6 +1438,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
|
||||
%files block-rbd
|
||||
%{_libdir}/%{name}/block-rbd.so
|
||||
%endif
|
||||
+%files block-vitastor
|
||||
+%{_libdir}/%{name}/block-vitastor.so
|
||||
+
|
||||
%files audio-pa
|
||||
%{_libdir}/%{name}/audio-pa.so
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.9.6.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.9.2.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.9.6
|
||||
Version: 0.9.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.9.6.el7.tar.gz
|
||||
Source0: vitastor-0.9.2.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.9.6.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.9.2.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.9.6
|
||||
Version: 0.9.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.9.6.el8.tar.gz
|
||||
Source0: vitastor-0.9.2.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -18,7 +18,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.9.6.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.9.2.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.9.6
|
||||
Version: 0.9.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.9.6.el9.tar.gz
|
||||
Source0: vitastor-0.9.2.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.9.6")
|
||||
add_definitions(-DVERSION="0.9.2")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
|
@@ -714,15 +714,9 @@ resume_1:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (new_trim_pos < bs->journal.used_start
|
||||
? (bs->journal.dirty_start >= bs->journal.used_start || bs->journal.dirty_start < new_trim_pos)
|
||||
: (bs->journal.dirty_start >= bs->journal.used_start && bs->journal.dirty_start < new_trim_pos))
|
||||
{
|
||||
bs->journal.dirty_start = new_trim_pos;
|
||||
}
|
||||
bs->journal.used_start = new_trim_pos;
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf("Journal trimmed to %08lx (next_free=%08lx dirty_start=%08lx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
|
||||
printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
|
||||
#endif
|
||||
if (bs->journal.flush_journal && !flusher->flush_queue.size())
|
||||
{
|
||||
|
@@ -103,7 +103,6 @@ public:
|
||||
journal_flusher_t(blockstore_impl_t *bs);
|
||||
~journal_flusher_t();
|
||||
void loop();
|
||||
bool is_trim_wanted() { return trim_wanted; }
|
||||
bool is_active();
|
||||
void mark_trim_possible();
|
||||
void request_trim();
|
||||
|
@@ -85,13 +85,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
immediate_commit = IMMEDIATE_SMALL;
|
||||
}
|
||||
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
|
||||
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
|
||||
config["inmemory_metadata"] != "no";
|
||||
inmemory_meta = config["inmemory_metadata"] != "false";
|
||||
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
|
||||
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
||||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
||||
config["inmemory_journal"] != "no";
|
||||
journal.inmemory = config["inmemory_journal"] != "false";
|
||||
// Validate
|
||||
if (journal.sector_count < 2)
|
||||
{
|
||||
|
@@ -218,7 +218,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
|
||||
auto used = --journal.used_sectors[dirty_it->second.journal_sector];
|
||||
#ifdef BLOCKSTORE_DEBUG
|
||||
printf(
|
||||
"remove usage of journal offset %08lx by %lx:%lx v%lu (%lu refs)\n", dirty_it->second.journal_sector,
|
||||
"remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", dirty_it->second.journal_sector,
|
||||
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
|
||||
);
|
||||
#endif
|
||||
|
@@ -661,13 +661,8 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
|
||||
uint64_t s = PRIV(op)->min_flushed_journal_sector;
|
||||
while (1)
|
||||
{
|
||||
if (!journal.sector_info[s-1].dirty && journal.sector_info[s-1].flush_count == 0)
|
||||
if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0)
|
||||
{
|
||||
if (s == (1+journal.cur_sector))
|
||||
{
|
||||
// Forcibly move to the next sector and move dirty position
|
||||
journal.in_sector_pos = journal.block_size;
|
||||
}
|
||||
// We know for sure that we won't write into this sector anymore
|
||||
uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
|
||||
if (new_ds >= journal.len)
|
||||
|
@@ -56,15 +56,14 @@ struct image_lister_t
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto pool_it = parent->cli->st_cli.pool_config.find(INODE_POOL(ic.second.num));
|
||||
bool good_pool = pool_it != parent->cli->st_cli.pool_config.end();
|
||||
auto & pool_cfg = parent->cli->st_cli.pool_config.at(INODE_POOL(ic.second.num));
|
||||
auto item = json11::Json::object {
|
||||
{ "name", ic.second.name },
|
||||
{ "size", ic.second.size },
|
||||
{ "used_size", 0 },
|
||||
{ "readonly", ic.second.readonly },
|
||||
{ "pool_id", (uint64_t)INODE_POOL(ic.second.num) },
|
||||
{ "pool_name", good_pool ? pool_it->second.name : "? (ID:"+std::to_string(INODE_POOL(ic.second.num))+")" },
|
||||
{ "pool_name", pool_cfg.name },
|
||||
{ "inode_num", INODE_NO_POOL(ic.second.num) },
|
||||
{ "inode_id", ic.second.num },
|
||||
};
|
||||
@@ -248,8 +247,6 @@ resume_1:
|
||||
if (state == 1)
|
||||
goto resume_1;
|
||||
get_list();
|
||||
if (state == 100)
|
||||
return;
|
||||
if (show_stats)
|
||||
{
|
||||
resume_1:
|
||||
@@ -272,7 +269,7 @@ resume_1:
|
||||
{ "key", "name" },
|
||||
{ "title", "NAME" },
|
||||
});
|
||||
if (list_pool_name == "")
|
||||
if (!list_pool_id)
|
||||
{
|
||||
cols.push_back(json11::Json::object{
|
||||
{ "key", "pool_name" },
|
||||
@@ -379,18 +376,16 @@ resume_1:
|
||||
|
||||
std::string print_table(json11::Json items, json11::Json header, bool use_esc)
|
||||
{
|
||||
int header_sizes[header.array_items().size()];
|
||||
std::vector<int> sizes;
|
||||
for (int i = 0; i < header.array_items().size(); i++)
|
||||
{
|
||||
header_sizes[i] = utf8_length(header[i]["title"].string_value());
|
||||
sizes.push_back(header_sizes[i]);
|
||||
sizes.push_back(header[i]["title"].string_value().length());
|
||||
}
|
||||
for (auto & item: items.array_items())
|
||||
{
|
||||
for (int i = 0; i < header.array_items().size(); i++)
|
||||
{
|
||||
int l = utf8_length(item[header[i]["key"].string_value()].as_string());
|
||||
int l = item[header[i]["key"].string_value()].as_string().length();
|
||||
sizes[i] = sizes[i] < l ? l : sizes[i];
|
||||
}
|
||||
}
|
||||
@@ -402,7 +397,7 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
|
||||
// Separator
|
||||
str += " ";
|
||||
}
|
||||
int pad = sizes[i]-header_sizes[i];
|
||||
int pad = sizes[i]-header[i]["title"].string_value().length();
|
||||
if (header[i]["right"].bool_value())
|
||||
{
|
||||
// Align right
|
||||
@@ -430,7 +425,7 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
|
||||
// Separator
|
||||
str += " ";
|
||||
}
|
||||
int pad = sizes[i] - utf8_length(item[header[i]["key"].string_value()].as_string());
|
||||
int pad = sizes[i] - item[header[i]["key"].string_value()].as_string().length();
|
||||
if (header[i]["right"].bool_value())
|
||||
{
|
||||
// Align right
|
||||
|
@@ -13,7 +13,7 @@ struct image_changer_t
|
||||
std::string image_name;
|
||||
std::string new_name;
|
||||
uint64_t new_size = 0;
|
||||
bool force_size = false, inc_size = false;
|
||||
bool force_size = false;
|
||||
bool set_readonly = false, set_readwrite = false, force = false;
|
||||
// interval between fsyncs
|
||||
int fsync_interval = 128;
|
||||
@@ -81,14 +81,14 @@ struct image_changer_t
|
||||
}
|
||||
if ((!set_readwrite || !cfg.readonly) &&
|
||||
(!set_readonly || cfg.readonly) &&
|
||||
(!new_size && !force_size || cfg.size == new_size || cfg.size >= new_size && inc_size) &&
|
||||
(!new_size && !force_size || cfg.size == new_size) &&
|
||||
(new_name == "" || new_name == image_name))
|
||||
{
|
||||
result = (cli_result_t){ .text = "No change" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
if ((new_size != 0 || force_size) && (cfg.size < new_size || !inc_size))
|
||||
if (new_size != 0 || force_size)
|
||||
{
|
||||
if (cfg.size >= new_size)
|
||||
{
|
||||
@@ -233,7 +233,6 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
|
||||
changer->new_name = cfg["rename"].string_value();
|
||||
changer->new_size = parse_size(cfg["resize"].as_string());
|
||||
changer->force_size = cfg["force_size"].bool_value();
|
||||
changer->inc_size = cfg["inc_size"].bool_value();
|
||||
changer->force = cfg["force"].bool_value();
|
||||
changer->set_readonly = cfg["readonly"].bool_value();
|
||||
changer->set_readwrite = cfg["readwrite"].bool_value();
|
||||
|
@@ -99,16 +99,15 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||
if (options["journal_size"] == "")
|
||||
{
|
||||
if (options["journal_device"] == "")
|
||||
options["journal_size"] = is_hdd ? "128M" : "32M";
|
||||
options["journal_size"] = "32M";
|
||||
else if (is_hdd)
|
||||
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
|
||||
}
|
||||
bool is_hybrid = is_hdd && options["journal_device"] != "" && options["journal_device"] != options["data_device"];
|
||||
if (is_hdd)
|
||||
{
|
||||
if (options["block_size"] == "")
|
||||
options["block_size"] = "1M";
|
||||
if (is_hybrid && options["throttle_small_writes"] == "")
|
||||
if (options["throttle_small_writes"] == "")
|
||||
options["throttle_small_writes"] = "1";
|
||||
}
|
||||
json11::Json::object sb;
|
||||
@@ -135,7 +134,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||
{ "meta_offset", 4096 + (dsk.meta_device == dsk.journal_device ? dsk.journal_len : 0) },
|
||||
{ "data_offset", 4096 + (dsk.data_device == dsk.meta_device ? dsk.meta_len : 0) +
|
||||
(dsk.data_device == dsk.journal_device ? dsk.journal_len : 0) },
|
||||
{ "journal_no_same_sector_overwrites", !is_hdd || is_hybrid },
|
||||
{ "journal_no_same_sector_overwrites", true },
|
||||
{ "journal_sector_buffer_count", 1024 },
|
||||
{ "disable_data_fsync", json_is_true(options["disable_data_fsync"]) },
|
||||
{ "disable_meta_fsync", json_is_true(options["disable_meta_fsync"]) },
|
||||
@@ -147,7 +146,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||
for (int i = 0; i < sizeof(allow_additional_params)/sizeof(allow_additional_params[0]); i++)
|
||||
{
|
||||
auto it = options.find(allow_additional_params[i]);
|
||||
if (it != options.end() && it->second != "")
|
||||
if (it != options.end())
|
||||
{
|
||||
sb[it->first] = it->second;
|
||||
}
|
||||
@@ -621,7 +620,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
|
||||
}
|
||||
}
|
||||
// Treat all disks as SSDs if not in the hybrid mode
|
||||
prepare_one(options, dev.is_hdd ? 1 : 0);
|
||||
prepare_one(options, hybrid && dev.is_hdd ? 1 : 0);
|
||||
if (hybrid)
|
||||
{
|
||||
options.erase("journal_device");
|
||||
|
@@ -264,7 +264,6 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
|
||||
{
|
||||
uint64_t buf_len = 1024*1024;
|
||||
void *zero_buf = memalign_or_die(MEM_ALIGNMENT, buf_len);
|
||||
memset(zero_buf, 0, buf_len);
|
||||
ssize_t r;
|
||||
while (size > 0)
|
||||
{
|
||||
|
@@ -187,30 +187,22 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
|
||||
check_addr = addr;
|
||||
if (pos == std::string::npos)
|
||||
addr += "/v3";
|
||||
bool local = false;
|
||||
int i;
|
||||
for (i = 0; i < local_ips.size(); i++)
|
||||
{
|
||||
if (local_ips[i] == check_addr)
|
||||
{
|
||||
local = true;
|
||||
this->etcd_local.push_back(addr);
|
||||
break;
|
||||
}
|
||||
}
|
||||
auto & to = local ? this->etcd_local : this->etcd_addresses;
|
||||
for (i = 0; i < to.size(); i++)
|
||||
{
|
||||
if (to[i] == addr)
|
||||
break;
|
||||
}
|
||||
if (i >= to.size())
|
||||
to.push_back(addr);
|
||||
if (i >= local_ips.size())
|
||||
this->etcd_addresses.push_back(addr);
|
||||
}
|
||||
}
|
||||
|
||||
void etcd_state_client_t::parse_config(const json11::Json & config)
|
||||
{
|
||||
this->etcd_local.clear();
|
||||
this->etcd_addresses.clear();
|
||||
if (config["etcd_address"].is_string())
|
||||
{
|
||||
@@ -357,7 +349,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
watch_id == ETCD_OSD_STATE_WATCH_ID)
|
||||
etcd_watches_initialised++;
|
||||
if (etcd_watches_initialised == 4 && this->log_level > 0)
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s\n", cur_addr.c_str());
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
|
||||
}
|
||||
if (data["result"]["canceled"].bool_value())
|
||||
{
|
||||
@@ -368,17 +360,15 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
// so we should restart from the beginning if we can
|
||||
if (on_reload_hook != NULL)
|
||||
{
|
||||
// check to not trigger on_reload_hook multiple times
|
||||
if (etcd_watch_ws != NULL)
|
||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
|
||||
data["result"]["compact_revision"].uint64_value());
|
||||
if (etcd_watch_ws)
|
||||
{
|
||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
|
||||
data["result"]["compact_revision"].uint64_value());
|
||||
http_close(etcd_watch_ws);
|
||||
etcd_watch_ws = NULL;
|
||||
etcd_watch_revision = 0;
|
||||
on_reload_hook();
|
||||
}
|
||||
return;
|
||||
etcd_watch_revision = 0;
|
||||
on_reload_hook();
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -425,9 +415,13 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
}
|
||||
if (msg->eof)
|
||||
{
|
||||
fprintf(stderr, "Disconnected from etcd %s\n", cur_addr.c_str());
|
||||
if (cur_addr == selected_etcd_address)
|
||||
{
|
||||
fprintf(stderr, "Disconnected from etcd %s\n", selected_etcd_address.c_str());
|
||||
selected_etcd_address = "";
|
||||
}
|
||||
else
|
||||
fprintf(stderr, "Disconnected from etcd\n");
|
||||
if (etcd_watch_ws)
|
||||
{
|
||||
http_close(etcd_watch_ws);
|
||||
@@ -444,7 +438,6 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
else if (etcd_watches_initialised > 0)
|
||||
{
|
||||
// Connection was live, retry immediately
|
||||
etcd_watches_initialised = 0;
|
||||
start_etcd_watcher();
|
||||
}
|
||||
}
|
||||
|
@@ -132,6 +132,7 @@ protected:
|
||||
uint64_t rdma_max_msg = 0;
|
||||
#endif
|
||||
|
||||
bool has_send_loop = false;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||
@@ -159,6 +160,7 @@ public:
|
||||
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
|
||||
void read_requests();
|
||||
void send_replies();
|
||||
bool can_send();
|
||||
void accept_connections(int listen_fd);
|
||||
~osd_messenger_t();
|
||||
|
||||
|
@@ -111,22 +111,28 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (!ringloop)
|
||||
if (!ringloop && !has_send_loop)
|
||||
{
|
||||
// FIXME: It's worse because it doesn't allow batching
|
||||
// "Send loop" should be used, like in QEMU driver, or performance will suffer
|
||||
// due to a large number of sendmsg() syscalls where they can be avoided.
|
||||
// "Send loop" increases 4k T1Q128 randread from ~55k iops to ~90k on a 1 OSD.
|
||||
// The difference is smaller when there are more OSDs though...
|
||||
while (cl->outbox.size())
|
||||
{
|
||||
try_send(cl);
|
||||
}
|
||||
}
|
||||
else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
|
||||
else if (cl->write_msg.msg_iovlen > 0 || has_send_loop || !try_send(cl))
|
||||
{
|
||||
if (cl->write_state == 0)
|
||||
{
|
||||
cl->write_state = CL_WRITE_READY;
|
||||
write_ready_clients.push_back(cur_op->peer_fd);
|
||||
}
|
||||
ringloop->wakeup();
|
||||
if (ringloop)
|
||||
{
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,6 +224,12 @@ void osd_messenger_t::send_replies()
|
||||
write_ready_clients.clear();
|
||||
}
|
||||
|
||||
bool osd_messenger_t::can_send()
|
||||
{
|
||||
has_send_loop = true;
|
||||
return write_ready_clients.size() > 0;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
{
|
||||
cl->write_msg.msg_iovlen = 0;
|
||||
|
216
src/nfs_conn.cpp
216
src/nfs_conn.cpp
@@ -190,15 +190,7 @@ static int nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
if (handle == "roothandle" || self->parent->dir_by_hash.find(handle) != self->parent->dir_by_hash.end())
|
||||
{
|
||||
if (args->new_attributes.size.set_it)
|
||||
{
|
||||
*reply = (SETATTR3res){ .status = NFS3ERR_ISDIR };
|
||||
}
|
||||
else
|
||||
{
|
||||
// Silently ignore mode, uid, gid, atime, mtime changes
|
||||
*reply = (SETATTR3res){ .status = NFS3_OK };
|
||||
}
|
||||
*reply = (SETATTR3res){ .status = NFS3ERR_ISDIR };
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -366,6 +358,7 @@ static int nfs3_read_proc(void *opaque, rpc_op_t *rop)
|
||||
}
|
||||
|
||||
static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf);
|
||||
static void nfs_do_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf);
|
||||
|
||||
static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
@@ -399,6 +392,7 @@ static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
|
||||
.resok = (WRITE3resok){
|
||||
//.file_wcc = ...,
|
||||
.count = (unsigned)count,
|
||||
.committed = args->stable,
|
||||
},
|
||||
};
|
||||
if ((args->offset % alignment) != 0 || (count % alignment) != 0)
|
||||
@@ -442,101 +436,42 @@ static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void complete_extend_write(nfs_client_t *self, rpc_op_t *rop, inode_t inode, int res)
|
||||
static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf)
|
||||
{
|
||||
WRITE3args *args = (WRITE3args*)rop->request;
|
||||
WRITE3res *reply = (WRITE3res*)rop->reply;
|
||||
if (res < 0)
|
||||
{
|
||||
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(res) };
|
||||
rpc_queue_reply(rop);
|
||||
return;
|
||||
}
|
||||
bool imm = self->parent->cli->get_immediate_commit(inode);
|
||||
reply->resok.committed = args->stable != UNSTABLE || imm ? FILE_SYNC : UNSTABLE;
|
||||
*(uint64_t*)reply->resok.verf = self->parent->server_id;
|
||||
if (args->stable != UNSTABLE && !imm)
|
||||
{
|
||||
// Client requested a stable write. Add an fsync
|
||||
auto op = new cluster_op_t;
|
||||
op->opcode = OSD_OP_SYNC;
|
||||
op->callback = [rop](cluster_op_t *op)
|
||||
{
|
||||
if (op->retval != 0)
|
||||
{
|
||||
WRITE3res *reply = (WRITE3res*)rop->reply;
|
||||
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) };
|
||||
}
|
||||
delete op;
|
||||
rpc_queue_reply(rop);
|
||||
};
|
||||
self->parent->cli->execute(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
rpc_queue_reply(rop);
|
||||
}
|
||||
}
|
||||
|
||||
static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size, int err)
|
||||
{
|
||||
auto ext_it = self->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 });
|
||||
while (ext_it != self->extend_writes.end() &&
|
||||
ext_it->first.inode == inode &&
|
||||
ext_it->first.new_size <= new_size)
|
||||
{
|
||||
ext_it->second.resize_res = err;
|
||||
if (ext_it->second.write_res <= 0)
|
||||
{
|
||||
complete_extend_write(self, ext_it->second.rop, inode, ext_it->second.write_res < 0
|
||||
? ext_it->second.write_res : ext_it->second.resize_res);
|
||||
self->extend_writes.erase(ext_it++);
|
||||
}
|
||||
else
|
||||
ext_it++;
|
||||
}
|
||||
}
|
||||
|
||||
static void extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size)
|
||||
{
|
||||
// Send an extend request
|
||||
auto & ext = self->extends[inode];
|
||||
ext.cur_extend = new_size;
|
||||
// Check if we have to resize the inode before writing
|
||||
auto inode_it = self->parent->cli->st_cli.inode_config.find(inode);
|
||||
if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
|
||||
inode_it->second.size < new_size)
|
||||
{
|
||||
self->parent->cmd->loop_and_wait(self->parent->cmd->start_modify(json11::Json::object {
|
||||
// FIXME: Resizing by ID is probably more correct
|
||||
{ "image", inode_it->second.name },
|
||||
{ "resize", new_size },
|
||||
{ "inc_size", true },
|
||||
{ "force_size", true },
|
||||
}), [=](const cli_result_t & r)
|
||||
{
|
||||
auto & ext = self->extends[inode];
|
||||
if (r.err)
|
||||
{
|
||||
fprintf(stderr, "Error extending inode %lu to %lu bytes: %s\n", inode, new_size, r.text.c_str());
|
||||
}
|
||||
if (r.err == EAGAIN || ext.next_extend > ext.cur_extend)
|
||||
{
|
||||
// Multiple concurrent resize requests received, try to repeat
|
||||
extend_inode(self, inode, ext.next_extend > ext.cur_extend ? ext.next_extend : ext.cur_extend);
|
||||
if (r.err == EAGAIN)
|
||||
{
|
||||
// Multiple concurrent resize requests received, try to repeat
|
||||
nfs_resize_write(self, rop, inode, new_size, offset, count, buf);
|
||||
return;
|
||||
}
|
||||
WRITE3res *reply = (WRITE3res*)rop->reply;
|
||||
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(r.err) };
|
||||
rpc_queue_reply(rop);
|
||||
return;
|
||||
}
|
||||
ext.cur_extend = ext.next_extend = 0;
|
||||
complete_extend_inode(self, inode, new_size, r.err);
|
||||
nfs_do_write(self, rop, inode, offset, count, buf);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
complete_extend_inode(self, inode, new_size, 0);
|
||||
nfs_do_write(self, rop, inode, offset, count, buf);
|
||||
}
|
||||
}
|
||||
|
||||
static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend_write_t>::iterator ewr_it,
|
||||
rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf)
|
||||
static void nfs_do_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf)
|
||||
{
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
op->opcode = OSD_OP_WRITE;
|
||||
@@ -544,61 +479,48 @@ static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend
|
||||
op->offset = offset;
|
||||
op->len = count;
|
||||
op->iov.push_back(buf, count);
|
||||
op->callback = [self, ewr_it, rop](cluster_op_t *op)
|
||||
op->callback = [self, rop](cluster_op_t *op)
|
||||
{
|
||||
auto inode = op->inode;
|
||||
int write_res = op->retval < 0 ? op->retval : (op->retval != op->len ? -ERANGE : 0);
|
||||
if (ewr_it == self->extend_writes.end())
|
||||
uint64_t inode = op->inode;
|
||||
WRITE3args *args = (WRITE3args*)rop->request;
|
||||
WRITE3res *reply = (WRITE3res*)rop->reply;
|
||||
if (op->retval != op->len)
|
||||
{
|
||||
complete_extend_write(self, rop, inode, write_res);
|
||||
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) };
|
||||
delete op;
|
||||
rpc_queue_reply(rop);
|
||||
}
|
||||
else
|
||||
{
|
||||
ewr_it->second.write_res = write_res;
|
||||
if (ewr_it->second.resize_res <= 0)
|
||||
*(uint64_t*)reply->resok.verf = self->parent->server_id;
|
||||
delete op;
|
||||
if (args->stable != UNSTABLE &&
|
||||
!self->parent->cli->get_immediate_commit(inode))
|
||||
{
|
||||
complete_extend_write(self, rop, inode, write_res < 0 ? write_res : ewr_it->second.resize_res);
|
||||
self->extend_writes.erase(ewr_it);
|
||||
// Client requested a stable write. Add an fsync
|
||||
op = new cluster_op_t;
|
||||
op->opcode = OSD_OP_SYNC;
|
||||
op->callback = [rop](cluster_op_t *op)
|
||||
{
|
||||
if (op->retval != 0)
|
||||
{
|
||||
WRITE3res *reply = (WRITE3res*)rop->reply;
|
||||
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) };
|
||||
}
|
||||
delete op;
|
||||
rpc_queue_reply(rop);
|
||||
};
|
||||
self->parent->cli->execute(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
rpc_queue_reply(rop);
|
||||
}
|
||||
}
|
||||
};
|
||||
self->parent->cli->execute(op);
|
||||
}
|
||||
|
||||
static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf)
|
||||
{
|
||||
// Check if we have to resize the inode during write
|
||||
auto inode_it = self->parent->cli->st_cli.inode_config.find(inode);
|
||||
if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
|
||||
inode_it->second.size < new_size)
|
||||
{
|
||||
auto ewr_it = self->extend_writes.emplace((extend_size_t){
|
||||
.inode = inode,
|
||||
.new_size = new_size,
|
||||
}, (extend_write_t){
|
||||
.rop = rop,
|
||||
.resize_res = 1,
|
||||
.write_res = 1,
|
||||
});
|
||||
auto & ext = self->extends[inode];
|
||||
if (ext.cur_extend > 0)
|
||||
{
|
||||
// Already resizing, just wait
|
||||
if (ext.next_extend < new_size)
|
||||
ext.next_extend = new_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
extend_inode(self, inode, new_size);
|
||||
}
|
||||
nfs_do_write(self, ewr_it, rop, inode, offset, count, buf);
|
||||
}
|
||||
else
|
||||
{
|
||||
nfs_do_write(self, self->extend_writes.end(), rop, inode, offset, count, buf);
|
||||
}
|
||||
}
|
||||
|
||||
static int nfs3_create_proc(void *opaque, rpc_op_t *rop)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
@@ -959,27 +881,6 @@ static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void fill_dir_entry(nfs_client_t *self, rpc_op_t *rop,
|
||||
std::map<std::string, nfs_dir_t>::iterator dir_id_it, struct entryplus3 *entry, bool is_plus)
|
||||
{
|
||||
if (dir_id_it == self->parent->dir_info.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
entry->fileid = dir_id_it->second.id;
|
||||
if (is_plus)
|
||||
{
|
||||
entry->name_attributes = (post_op_attr){
|
||||
.attributes_follow = 1,
|
||||
.attributes = get_dir_attributes(self, dir_id_it->first),
|
||||
};
|
||||
entry->name_handle = (post_op_fh3){
|
||||
.handle_follows = 1,
|
||||
.handle = xdr_copy_string(rop->xdrs, "S"+base64_encode(sha256(dir_id_it->first))),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
||||
{
|
||||
nfs_client_t *self = (nfs_client_t*)opaque;
|
||||
@@ -1057,17 +958,17 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
|
||||
continue;
|
||||
std::string subname = dir_id_it->first.substr(prefix.size());
|
||||
// for directories, fileid changes when the user restarts proxy
|
||||
fill_dir_entry(self, rop, dir_id_it, &entries[subname], is_plus);
|
||||
}
|
||||
// Add . and ..
|
||||
{
|
||||
auto dir_id_it = self->parent->dir_info.find(dir);
|
||||
fill_dir_entry(self, rop, dir_id_it, &entries["."], is_plus);
|
||||
auto sl = dir.rfind("/");
|
||||
if (sl != std::string::npos)
|
||||
entries[subname].fileid = dir_id_it->second.id;
|
||||
if (is_plus)
|
||||
{
|
||||
auto dir_id_it = self->parent->dir_info.find(dir.substr(0, sl));
|
||||
fill_dir_entry(self, rop, dir_id_it, &entries[".."], is_plus);
|
||||
entries[subname].name_attributes = (post_op_attr){
|
||||
.attributes_follow = 1,
|
||||
.attributes = get_dir_attributes(self, dir_id_it->first),
|
||||
};
|
||||
entries[subname].name_handle = (post_op_fh3){
|
||||
.handle_follows = 1,
|
||||
.handle = xdr_copy_string(rop->xdrs, "S"+base64_encode(sha256(dir_id_it->first))),
|
||||
};
|
||||
}
|
||||
}
|
||||
// Offset results by the continuation cookie (equal to index in the listing)
|
||||
@@ -1292,11 +1193,10 @@ static int nfs3_commit_proc(void *opaque, rpc_op_t *rop)
|
||||
cluster_op_t *op = new cluster_op_t;
|
||||
// fsync. we don't know how to fsync a single inode, so just fsync everything
|
||||
op->opcode = OSD_OP_SYNC;
|
||||
op->callback = [self, rop](cluster_op_t *op)
|
||||
op->callback = [rop](cluster_op_t *op)
|
||||
{
|
||||
COMMIT3res *reply = (COMMIT3res*)rop->reply;
|
||||
*reply = (COMMIT3res){ .status = vitastor_nfs_map_err(op->retval) };
|
||||
*(uint64_t*)reply->resok.verf = self->parent->server_id;
|
||||
rpc_queue_reply(rop);
|
||||
};
|
||||
self->parent->cli->execute(op);
|
||||
|
@@ -86,28 +86,6 @@ struct rpc_free_buffer_t
|
||||
unsigned size;
|
||||
};
|
||||
|
||||
struct extend_size_t
|
||||
{
|
||||
inode_t inode;
|
||||
uint64_t new_size;
|
||||
};
|
||||
|
||||
inline bool operator < (const extend_size_t &a, const extend_size_t &b)
|
||||
{
|
||||
return a.inode < b.inode || a.inode == b.inode && a.new_size < b.new_size;
|
||||
}
|
||||
|
||||
struct extend_write_t
|
||||
{
|
||||
rpc_op_t *rop;
|
||||
int resize_res, write_res; // 1 = started, 0 = completed OK, -errno = completed with error
|
||||
};
|
||||
|
||||
struct extend_inode_t
|
||||
{
|
||||
uint64_t cur_extend = 0, next_extend = 0;
|
||||
};
|
||||
|
||||
class nfs_client_t
|
||||
{
|
||||
public:
|
||||
@@ -122,8 +100,6 @@ public:
|
||||
rpc_cur_buffer_t cur_buffer = { 0 };
|
||||
std::map<uint8_t*, rpc_used_buffer_t> used_buffers;
|
||||
std::vector<rpc_free_buffer_t> free_buffers;
|
||||
std::map<inode_t, extend_inode_t> extends;
|
||||
std::multimap<extend_size_t, extend_write_t> extend_writes;
|
||||
|
||||
iovec read_iov;
|
||||
msghdr read_msg = { 0 };
|
||||
|
@@ -198,14 +198,13 @@ class osd_t
|
||||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
|
||||
void on_load_config_hook(json11::Json::object & changes);
|
||||
void on_reload_config_hook(json11::Json::object & changes);
|
||||
json11::Json on_load_pgs_checks_hook();
|
||||
void on_load_pgs_hook(bool success);
|
||||
void bind_socket();
|
||||
void acquire_lease();
|
||||
json11::Json get_osd_state();
|
||||
void create_osd_state();
|
||||
void renew_lease(bool reload);
|
||||
void renew_lease();
|
||||
void print_stats();
|
||||
void print_slow();
|
||||
void reset_stats();
|
||||
|
@@ -70,7 +70,6 @@ void osd_t::init_cluster()
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
|
||||
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
|
||||
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
|
||||
peering_state = OSD_LOADING_PGS;
|
||||
st_cli.load_global_config();
|
||||
}
|
||||
@@ -396,14 +395,6 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
||||
parse_config(true);
|
||||
bind_socket();
|
||||
acquire_lease();
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_reload_config_hook(cfg); };
|
||||
}
|
||||
|
||||
void osd_t::on_reload_config_hook(json11::Json::object & global_config)
|
||||
{
|
||||
etcd_global_config = global_config;
|
||||
parse_config(false);
|
||||
renew_lease(true);
|
||||
}
|
||||
|
||||
// Acquire lease
|
||||
@@ -433,7 +424,7 @@ void osd_t::acquire_lease()
|
||||
);
|
||||
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
renew_lease(false);
|
||||
renew_lease();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -508,11 +499,11 @@ void osd_t::create_osd_state()
|
||||
}
|
||||
|
||||
// Renew lease
|
||||
void osd_t::renew_lease(bool reload)
|
||||
void osd_t::renew_lease()
|
||||
{
|
||||
st_cli.etcd_call("/lease/keepalive", json11::Json::object {
|
||||
{ "ID", etcd_lease_id }
|
||||
}, st_cli.etcd_quick_timeout, 0, 0, [this, reload](std::string err, json11::Json data)
|
||||
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
|
||||
{
|
||||
if (err == "" && data["result"]["TTL"].string_value() == "")
|
||||
{
|
||||
@@ -531,20 +522,15 @@ void osd_t::renew_lease(bool reload)
|
||||
force_stop(1);
|
||||
}
|
||||
// Retry
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this, reload](int timer_id)
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
|
||||
{
|
||||
renew_lease(reload);
|
||||
renew_lease();
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
etcd_failed_attempts = 0;
|
||||
report_statistics();
|
||||
// Reload PGs
|
||||
if (reload && run_primary)
|
||||
{
|
||||
st_cli.load_pgs();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -574,6 +560,7 @@ void osd_t::force_stop(int exitcode)
|
||||
|
||||
json11::Json osd_t::on_load_pgs_checks_hook()
|
||||
{
|
||||
assert(this->pgs.size() == 0);
|
||||
json11::Json::array checks = {
|
||||
json11::Json::object {
|
||||
{ "target", "LEASE" },
|
||||
|
@@ -58,8 +58,6 @@ typedef struct VitastorFdData VitastorFdData;
|
||||
typedef struct VitastorClient
|
||||
{
|
||||
void *proxy;
|
||||
int uring_eventfd;
|
||||
|
||||
void *watch;
|
||||
char *config_path;
|
||||
char *etcd_host;
|
||||
@@ -79,7 +77,7 @@ typedef struct VitastorClient
|
||||
AioContext *ctx;
|
||||
VitastorFdData **fds;
|
||||
int fd_count, fd_alloc;
|
||||
int bh_uring_scheduled;
|
||||
int bh_send_all_scheduled;
|
||||
|
||||
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
|
||||
uint32_t last_bitmap_granularity;
|
||||
@@ -235,52 +233,47 @@ out:
|
||||
}
|
||||
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
|
||||
static void vitastor_uring_handler(void *opaque)
|
||||
static void vitastor_bh_send_all(void *opaque)
|
||||
{
|
||||
VitastorClient *client = (VitastorClient*)opaque;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
client->bh_uring_scheduled = 0;
|
||||
vitastor_c_uring_handle_events(client->proxy);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
}
|
||||
|
||||
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
|
||||
static void vitastor_bh_uring_handler(void *opaque)
|
||||
{
|
||||
VitastorBH *vbh = opaque;
|
||||
vitastor_bh_handler(vbh->cli);
|
||||
VitastorClient *client = vbh->cli;
|
||||
#else
|
||||
VitastorClient *client = opaque;
|
||||
#endif
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
client->bh_send_all_scheduled = 0;
|
||||
vitastor_c_qemu_send_all(client->proxy);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
|
||||
qemu_bh_delete(vbh->bh);
|
||||
free(vbh);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
static void vitastor_schedule_uring_handler(VitastorClient *client)
|
||||
static void vitastor_schedule_send_all(VitastorClient *client)
|
||||
{
|
||||
void *opaque = client;
|
||||
if (client->uring_eventfd >= 0 && !client->bh_uring_scheduled)
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
|
||||
if (!client->bh_send_all_scheduled)
|
||||
{
|
||||
client->bh_uring_scheduled = 1;
|
||||
client->bh_send_all_scheduled = 1;
|
||||
#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
|
||||
replay_bh_schedule_oneshot_event(client->ctx, vitastor_uring_handler, opaque);
|
||||
replay_bh_schedule_oneshot_event(client->ctx, vitastor_bh_send_all, client);
|
||||
#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
|
||||
aio_bh_schedule_oneshot(client->ctx, vitastor_uring_handler, opaque);
|
||||
#else
|
||||
aio_bh_schedule_oneshot(client->ctx, vitastor_bh_send_all, client);
|
||||
#elif QEMU_VERSION_MAJOR >= 2
|
||||
VitastorBH *vbh = (VitastorBH*)malloc(sizeof(VitastorBH));
|
||||
vbh->cli = client;
|
||||
#if QEMU_VERSION_MAJOR >= 2
|
||||
vbh->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_bh_uring_handler, vbh);
|
||||
#else
|
||||
vbh->bh = qemu_bh_new(vitastor_bh_uring_handler, vbh);
|
||||
#endif
|
||||
vbh->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_bh_send_all, vbh);
|
||||
qemu_bh_schedule(vbh->bh);
|
||||
#else
|
||||
client->bh_send_all_scheduled = 0;
|
||||
vitastor_c_qemu_send_all(client->proxy);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#else
|
||||
static void vitastor_schedule_uring_handler(VitastorClient *client)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
|
||||
{
|
||||
@@ -290,7 +283,7 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
|
||||
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
|
||||
vitastor_schedule_uring_handler(client);
|
||||
vitastor_schedule_send_all(client);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
|
||||
while (!task->complete)
|
||||
@@ -304,7 +297,7 @@ static void vitastor_aio_fd_read(void *fddv)
|
||||
VitastorFdData *fdd = (VitastorFdData*)fddv;
|
||||
qemu_mutex_lock(&fdd->cli->mutex);
|
||||
fdd->fd_read(fdd->opaque);
|
||||
vitastor_schedule_uring_handler(fdd->cli);
|
||||
vitastor_schedule_send_all(fdd->cli);
|
||||
qemu_mutex_unlock(&fdd->cli->mutex);
|
||||
}
|
||||
|
||||
@@ -313,30 +306,10 @@ static void vitastor_aio_fd_write(void *fddv)
|
||||
VitastorFdData *fdd = (VitastorFdData*)fddv;
|
||||
qemu_mutex_lock(&fdd->cli->mutex);
|
||||
fdd->fd_write(fdd->opaque);
|
||||
vitastor_schedule_uring_handler(fdd->cli);
|
||||
vitastor_schedule_send_all(fdd->cli);
|
||||
qemu_mutex_unlock(&fdd->cli->mutex);
|
||||
}
|
||||
|
||||
static void universal_aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque)
|
||||
{
|
||||
aio_set_fd_handler(ctx, fd,
|
||||
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3
|
||||
0 /*is_external*/,
|
||||
#endif
|
||||
fd_read,
|
||||
fd_write,
|
||||
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
|
||||
NULL /*io_flush*/,
|
||||
#endif
|
||||
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
|
||||
NULL /*io_poll*/,
|
||||
#endif
|
||||
#if QEMU_VERSION_MAJOR >= 7
|
||||
NULL /*io_poll_ready*/,
|
||||
#endif
|
||||
opaque);
|
||||
}
|
||||
|
||||
static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
|
||||
{
|
||||
VitastorClient *client = (VitastorClient*)vcli;
|
||||
@@ -379,9 +352,22 @@ static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandl
|
||||
}
|
||||
client->fds[client->fd_count++] = fdd;
|
||||
}
|
||||
universal_aio_set_fd_handler(
|
||||
client->ctx, fd, fd_read ? vitastor_aio_fd_read : NULL, fd_write ? vitastor_aio_fd_write : NULL, fdd
|
||||
);
|
||||
aio_set_fd_handler(client->ctx, fd,
|
||||
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3
|
||||
0 /*is_external*/,
|
||||
#endif
|
||||
fd_read ? vitastor_aio_fd_read : NULL,
|
||||
fd_write ? vitastor_aio_fd_write : NULL,
|
||||
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
|
||||
NULL /*io_flush*/,
|
||||
#endif
|
||||
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
|
||||
NULL /*io_poll*/,
|
||||
#endif
|
||||
#if QEMU_VERSION_MAJOR >= 7
|
||||
NULL /*io_poll_ready*/,
|
||||
#endif
|
||||
fdd);
|
||||
}
|
||||
|
||||
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
|
||||
@@ -402,35 +388,10 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
|
||||
client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
|
||||
client->ctx = bdrv_get_aio_context(bs);
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
|
||||
client->proxy = vitastor_c_create_qemu_uring(
|
||||
client->proxy = vitastor_c_create_qemu(
|
||||
vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
|
||||
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
||||
);
|
||||
if (!client->proxy)
|
||||
{
|
||||
fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
|
||||
client->uring_eventfd = -1;
|
||||
#endif
|
||||
client->proxy = vitastor_c_create_qemu(
|
||||
vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
|
||||
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
|
||||
);
|
||||
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
|
||||
}
|
||||
else
|
||||
{
|
||||
client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy);
|
||||
if (client->uring_eventfd < 0)
|
||||
{
|
||||
fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno));
|
||||
error_setg(errp, "failed to create io_uring eventfd");
|
||||
vitastor_close(bs);
|
||||
return -1;
|
||||
}
|
||||
universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
|
||||
}
|
||||
#endif
|
||||
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
||||
// Get image metadata (size and readonly flag) or just wait until the client is ready
|
||||
@@ -662,8 +623,7 @@ static void vitastor_co_generic_cb(void *opaque, long retval)
|
||||
task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
|
||||
qemu_bh_schedule(task->bh);
|
||||
#else
|
||||
task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
|
||||
qemu_bh_schedule(task->bh);
|
||||
vitastor_co_generic_bh_cb(opaque);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -688,7 +648,7 @@ static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
|
||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
|
||||
vitastor_schedule_uring_handler(client);
|
||||
vitastor_schedule_send_all(client);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
|
||||
while (!task.complete)
|
||||
@@ -722,7 +682,7 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
|
||||
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
|
||||
vitastor_schedule_uring_handler(client);
|
||||
vitastor_schedule_send_all(client);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
|
||||
while (!task.complete)
|
||||
@@ -740,6 +700,7 @@ static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitma
|
||||
VitastorRPC *task = opaque;
|
||||
VitastorClient *client = task->bs->opaque;
|
||||
task->ret = retval;
|
||||
task->complete = 1;
|
||||
if (retval >= 0)
|
||||
{
|
||||
task->bitmap = bitmap;
|
||||
@@ -751,17 +712,15 @@ static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitma
|
||||
client->last_bitmap = bitmap;
|
||||
}
|
||||
}
|
||||
#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
|
||||
replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
|
||||
#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
|
||||
aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
|
||||
#elif QEMU_VERSION_MAJOR >= 2
|
||||
task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
|
||||
qemu_bh_schedule(task->bh);
|
||||
if (qemu_coroutine_self() != task->co)
|
||||
{
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
|
||||
aio_co_wake(task->co);
|
||||
#else
|
||||
task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
|
||||
qemu_bh_schedule(task->bh);
|
||||
qemu_coroutine_enter(task->co, NULL);
|
||||
qemu_aio_release(task);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static int coroutine_fn vitastor_co_block_status(
|
||||
@@ -802,7 +761,7 @@ static int coroutine_fn vitastor_co_block_status(
|
||||
task.bitmap = client->last_bitmap = NULL;
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
|
||||
vitastor_schedule_uring_handler(client);
|
||||
vitastor_schedule_send_all(client);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
while (!task.complete)
|
||||
{
|
||||
@@ -889,7 +848,7 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
|
||||
|
||||
qemu_mutex_lock(&client->mutex);
|
||||
vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
|
||||
vitastor_schedule_uring_handler(client);
|
||||
vitastor_schedule_send_all(client);
|
||||
qemu_mutex_unlock(&client->mutex);
|
||||
|
||||
while (!task.complete)
|
||||
|
@@ -2,12 +2,9 @@
|
||||
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include <sys/eventfd.h>
|
||||
|
||||
#include "ringloop.h"
|
||||
|
||||
ring_loop_t::ring_loop_t(int qd)
|
||||
@@ -35,10 +32,6 @@ ring_loop_t::~ring_loop_t()
|
||||
free(free_ring_data);
|
||||
free(ring_datas);
|
||||
io_uring_queue_exit(&ring);
|
||||
if (ring_eventfd)
|
||||
{
|
||||
close(ring_eventfd);
|
||||
}
|
||||
}
|
||||
|
||||
void ring_loop_t::register_consumer(ring_consumer_t *consumer)
|
||||
@@ -66,16 +59,6 @@ void ring_loop_t::unregister_consumer(ring_consumer_t *consumer)
|
||||
|
||||
void ring_loop_t::loop()
|
||||
{
|
||||
if (ring_eventfd >= 0)
|
||||
{
|
||||
// Reset eventfd counter
|
||||
uint64_t ctr = 0;
|
||||
int r = read(ring_eventfd, &ctr, 8);
|
||||
if (r < 0 && errno != EAGAIN && errno != EINTR)
|
||||
{
|
||||
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
|
||||
}
|
||||
}
|
||||
struct io_uring_cqe *cqe;
|
||||
while (!io_uring_peek_cqe(&ring, &cqe))
|
||||
{
|
||||
@@ -94,7 +77,7 @@ void ring_loop_t::loop()
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Warning: empty callback in SQE\n");
|
||||
printf("Warning: empty callback in SQE\n");
|
||||
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
|
||||
}
|
||||
io_uring_cqe_seen(&ring, cqe);
|
||||
@@ -144,24 +127,3 @@ int ring_loop_t::sqes_left()
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
||||
int ring_loop_t::register_eventfd()
|
||||
{
|
||||
if (ring_eventfd >= 0)
|
||||
{
|
||||
return ring_eventfd;
|
||||
}
|
||||
ring_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
|
||||
if (ring_eventfd < 0)
|
||||
{
|
||||
return -errno;
|
||||
}
|
||||
int r = io_uring_register_eventfd(&ring, ring_eventfd);
|
||||
if (r < 0)
|
||||
{
|
||||
close(ring_eventfd);
|
||||
ring_eventfd = -1;
|
||||
return r;
|
||||
}
|
||||
return ring_eventfd;
|
||||
}
|
||||
|
@@ -126,13 +126,11 @@ class ring_loop_t
|
||||
unsigned free_ring_data_ptr;
|
||||
bool loop_again;
|
||||
struct io_uring ring;
|
||||
int ring_eventfd = -1;
|
||||
public:
|
||||
ring_loop_t(int qd);
|
||||
~ring_loop_t();
|
||||
void register_consumer(ring_consumer_t *consumer);
|
||||
void unregister_consumer(ring_consumer_t *consumer);
|
||||
int register_eventfd();
|
||||
|
||||
inline struct io_uring_sqe* get_sqe()
|
||||
{
|
||||
|
@@ -308,19 +308,3 @@ std::string str_repeat(const std::string & str, int times)
|
||||
r += str;
|
||||
return r;
|
||||
}
|
||||
|
||||
size_t utf8_length(const std::string & s)
|
||||
{
|
||||
size_t len = 0;
|
||||
for (size_t i = 0; i < s.size(); i++)
|
||||
len += (s[i] & 0xC0) != 0x80;
|
||||
return len;
|
||||
}
|
||||
|
||||
size_t utf8_length(const char *s)
|
||||
{
|
||||
size_t len = 0;
|
||||
for (; *s; s++)
|
||||
len += (*s & 0xC0) != 0x80;
|
||||
return len;
|
||||
}
|
||||
|
@@ -18,5 +18,3 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
|
||||
uint64_t parse_time(std::string time_str, bool *ok = NULL);
|
||||
std::string read_all_fd(int fd);
|
||||
std::string str_repeat(const std::string & str, int times);
|
||||
size_t utf8_length(const std::string & s);
|
||||
size_t utf8_length(const char *s);
|
||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 0.9.6
|
||||
Version: 0.9.2
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
@@ -5,7 +5,6 @@
|
||||
// Also acts as a C-C++ proxy for the QEMU driver (QEMU headers don't compile with g++)
|
||||
|
||||
#include <sys/epoll.h>
|
||||
#include <sys/eventfd.h>
|
||||
|
||||
#include "ringloop.h"
|
||||
#include "epoll_manager.h"
|
||||
@@ -26,7 +25,6 @@ struct vitastor_c
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
timerfd_manager_t *tfd = NULL;
|
||||
cluster_client_t *cli = NULL;
|
||||
int uring_eventfd = -1;
|
||||
|
||||
QEMUSetFDHandler *aio_set_fd_handler = NULL;
|
||||
void *aio_ctx = NULL;
|
||||
@@ -72,8 +70,14 @@ static void vitastor_c_write_handler(void *opaque)
|
||||
data->callback(data->fd, EPOLLOUT);
|
||||
}
|
||||
|
||||
static vitastor_c *vitastor_c_create_qemu_common(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context)
|
||||
vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
|
||||
const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
|
||||
{
|
||||
json11::Json cfg_json = vitastor_c_common_config(
|
||||
config_path, etcd_host, etcd_prefix, use_rdma,
|
||||
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
|
||||
);
|
||||
vitastor_c *self = new vitastor_c;
|
||||
self->aio_set_fd_handler = aio_set_fd_handler;
|
||||
self->aio_ctx = aio_context;
|
||||
@@ -91,77 +95,24 @@ static vitastor_c *vitastor_c_create_qemu_common(QEMUSetFDHandler *aio_set_fd_ha
|
||||
self->aio_set_fd_handler(self->aio_ctx, fd, false, NULL, NULL, NULL, NULL);
|
||||
}
|
||||
});
|
||||
return self;
|
||||
}
|
||||
|
||||
vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
|
||||
const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
|
||||
{
|
||||
json11::Json cfg_json = vitastor_c_common_config(
|
||||
config_path, etcd_host, etcd_prefix, use_rdma,
|
||||
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
|
||||
);
|
||||
auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
|
||||
self->cli = new cluster_client_t(NULL, self->tfd, cfg_json);
|
||||
return self;
|
||||
}
|
||||
|
||||
vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
|
||||
const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
|
||||
{
|
||||
ring_loop_t *ringloop = NULL;
|
||||
try
|
||||
{
|
||||
ringloop = new ring_loop_t(512);
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
json11::Json cfg_json = vitastor_c_common_config(
|
||||
config_path, etcd_host, etcd_prefix, use_rdma,
|
||||
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
|
||||
);
|
||||
auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
|
||||
self->ringloop = ringloop;
|
||||
self->cli = new cluster_client_t(self->ringloop, self->tfd, cfg_json);
|
||||
return self;
|
||||
}
|
||||
|
||||
vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
|
||||
{
|
||||
ring_loop_t *ringloop = NULL;
|
||||
try
|
||||
{
|
||||
ringloop = new ring_loop_t(512);
|
||||
}
|
||||
catch (std::exception & e)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
json11::Json cfg_json = vitastor_c_common_config(
|
||||
config_path, etcd_host, etcd_prefix, use_rdma,
|
||||
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
|
||||
);
|
||||
vitastor_c *self = new vitastor_c;
|
||||
self->ringloop = ringloop;
|
||||
self->ringloop = new ring_loop_t(512);
|
||||
self->epmgr = new epoll_manager_t(self->ringloop);
|
||||
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
|
||||
return self;
|
||||
}
|
||||
|
||||
int vitastor_c_uring_register_eventfd(vitastor_c *client)
|
||||
{
|
||||
if (!client->ringloop)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
return client->ringloop->register_eventfd();
|
||||
}
|
||||
|
||||
vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
|
||||
{
|
||||
json11::Json::object cfg;
|
||||
@@ -215,9 +166,12 @@ void vitastor_c_uring_wait_events(vitastor_c *client)
|
||||
client->ringloop->wait();
|
||||
}
|
||||
|
||||
int vitastor_c_uring_has_work(vitastor_c *client)
|
||||
void vitastor_c_qemu_send_all(vitastor_c *client)
|
||||
{
|
||||
return client->ringloop->has_work();
|
||||
while (client->cli->msgr.can_send())
|
||||
{
|
||||
client->cli->msgr.send_replies();
|
||||
}
|
||||
}
|
||||
|
||||
void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||
|
@@ -34,19 +34,15 @@ typedef void QEMUSetFDHandler(void *ctx, int fd, int is_external, IOHandler *fd_
|
||||
vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
|
||||
const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
|
||||
vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
|
||||
const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
|
||||
vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
|
||||
vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len);
|
||||
void vitastor_c_destroy(vitastor_c *client);
|
||||
int vitastor_c_is_ready(vitastor_c *client);
|
||||
int vitastor_c_uring_register_eventfd(vitastor_c *client);
|
||||
void vitastor_c_uring_wait_ready(vitastor_c *client);
|
||||
void vitastor_c_uring_handle_events(vitastor_c *client);
|
||||
void vitastor_c_uring_wait_events(vitastor_c *client);
|
||||
int vitastor_c_uring_has_work(vitastor_c *client);
|
||||
void vitastor_c_qemu_send_all(vitastor_c *client);
|
||||
void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
|
||||
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,
|
||||
|
@@ -82,9 +82,7 @@ wait_up()
|
||||
done
|
||||
}
|
||||
|
||||
if [[ $OSD_COUNT -gt 0 ]]; then
|
||||
wait_up 60
|
||||
fi
|
||||
wait_up 60
|
||||
|
||||
try_reweight()
|
||||
{
|
||||
|
Reference in New Issue
Block a user