Compare commits

..

1 Commits

Author SHA1 Message Date
c3ec6dd3b9 Add postponed send loop to QEMU driver 2023-07-03 02:29:01 +03:00
64 changed files with 258 additions and 1248 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "0.9.6")
set(VERSION "0.9.2")
add_subdirectory(src)

View File

@@ -15,7 +15,7 @@ Vitastor архитектурно похож на Ceph, что означает
и автоматическое распределение данных по любому числу дисков любого размера с настраиваемыми схемами
избыточности - репликацией или с произвольными кодами коррекции ошибок.
Vitastor нацелен в первую очередь на SSD и SSD+HDD кластеры с как минимум 10 Гбит/с сетью, поддерживает
Vitastor нацелен на SSD и SSD+HDD кластеры с как минимум 10 Гбит/с сетью, поддерживает
TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.

View File

@@ -14,8 +14,8 @@ Vitastor is architecturally similar to Ceph which means strong consistency,
primary-replication, symmetric clustering and automatic data distribution over any
number of drives of any size with configurable redundancy (replication or erasure codes/XOR).
Vitastor targets primarily SSD and SSD+HDD clusters with at least 10 Gbit/s network,
supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1 ms
Vitastor targets SSD and SSD+HDD clusters with at least 10 Gbit/s network, supports
TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1 ms
with proper hardware which is ~10 times faster than other popular SDS's like Ceph
or internal systems of public clouds.

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.9.6
VERSION ?= v0.9.2
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.9.6
image: vitalif/vitastor-csi:v0.9.2
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.9.6
image: vitalif/vitastor-csi:v0.9.2
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.9.6"
vitastorCSIDriverVersion = "0.9.2"
)
// Config struct fills the parameters of request or user input

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.9.6-1) unstable; urgency=medium
vitastor (0.9.2-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.9.6-1) unstable; urgency=medium
vitastor (0.9.2-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -28,19 +28,13 @@ RUN apt-get --download-only source qemu
ADD patches /root/vitastor/patches
ADD src/qemu_driver.c /root/vitastor/src/qemu_driver.c
#RUN set -e; \
# apt-get install -y wget; \
# wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
# (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
# (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
# apt-get update; \
# apt-get install -y vitastor-client vitastor-client-dev quilt
RUN set -e; \
dpkg -i /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
apt-get install -y wget; \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
(echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
apt-get update; \
apt-get install -y quilt; \
apt-get install -y vitastor-client vitastor-client-dev quilt; \
mkdir -p /root/packages/qemu-$REL; \
rm -rf /root/packages/qemu-$REL/*; \
cd /root/packages/qemu-$REL; \
@@ -54,7 +48,7 @@ RUN set -e; \
quilt add block/vitastor.c; \
cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
quilt refresh; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/qemu-$REL/qemu-*/

View File

@@ -35,8 +35,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.9.6; \
cd vitastor-0.9.6; \
cp -r /root/vitastor vitastor-0.9.2; \
cd vitastor-0.9.2; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.6.orig.tar.xz vitastor-0.9.6; \
cd vitastor-0.9.6; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.2.orig.tar.xz vitastor-0.9.2; \
cd vitastor-0.9.2; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -33,13 +33,12 @@ Size of objects (data blocks) into which all physical and virtual drives
in Vitastor, affects memory usage, write amplification and I/O load
distribution effectiveness.
Recommended default block size is 128 KB for SSD and 1 MB for HDD. In fact,
it's possible to use 1 MB for SSD too - it will lower memory usage, but
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
it's possible to use 4 MB for SSD too - it will lower memory usage, but
may increase average WA and reduce linear performance.
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
544 MB per 1 TB of used disk space with the default 128 KB block size.
With 1 MB it's 8 times lower.
## bitmap_granularity

View File

@@ -33,14 +33,14 @@ OSD) могут сосуществовать в одном кластере Vita
настроек, влияет на потребление памяти, объём избыточной записи (write
amplification) и эффективность распределения нагрузки по OSD.
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 1 мегабайт
для HDD. В принципе, для SSD можно тоже использовать блок размером 1 мегабайт,
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
это понизит использование памяти, но ухудшит распределение нагрузки и в
среднем увеличит WA.
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
стандартном 128 КБ блоке. При 1 МБ блоке памяти нужно в 8 раз меньше.
стандартном 128 КБ блоке.
## bitmap_granularity

View File

@@ -7,27 +7,26 @@
in Vitastor, affects memory usage, write amplification and I/O load
distribution effectiveness.
Recommended default block size is 128 KB for SSD and 1 MB for HDD. In fact,
it's possible to use 1 MB for SSD too - it will lower memory usage, but
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
it's possible to use 4 MB for SSD too - it will lower memory usage, but
may increase average WA and reduce linear performance.
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
544 MB per 1 TB of used disk space with the default 128 KB block size.
With 1 MB it's 8 times lower.
info_ru: |
Размер объектов (блоков данных), на которые делятся физические и виртуальные
диски в Vitastor (в рамках каждого пула). Одна из ключевых на данный момент
настроек, влияет на потребление памяти, объём избыточной записи (write
amplification) и эффективность распределения нагрузки по OSD.
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 1 мегабайт
для HDD. В принципе, для SSD можно тоже использовать блок размером 1 мегабайт,
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
это понизит использование памяти, но ухудшит распределение нагрузки и в
среднем увеличит WA.
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
стандартном 128 КБ блоке. При 1 МБ блоке памяти нужно в 8 раз меньше.
стандартном 128 КБ блоке.
- name: bitmap_granularity
type: int
default: 4096

View File

@@ -1,4 +1,4 @@
[Документация](../../README-ru.md#документация) → Установка → Proxmox VE
[Документация](../../README-ru.md#документация) → Установка → Proxmox
-----

View File

@@ -21,7 +21,7 @@
## Basic instructions
Download source, for example using git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
Download source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
Get `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build fio engine,
you can disable it by passing `-DWITH_FIO=no` to cmake.
@@ -41,7 +41,7 @@ It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
QEMU build process. To do that:
- Install vitastor client library headers (from source or from vitastor-client-dev package)
- Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source
- Copy `src/qemu_driver.c` to QEMU source directory as `block/vitastor.c`
- Copy `src/qemu_driver.c` to QEMU source directory as `block/block-vitastor.c`
- Build QEMU as usual
But it is also possible to build it out-of-tree. To do that:

View File

@@ -21,7 +21,7 @@
## Базовая инструкция
Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
Скачайте исходные коды пакета `fio`, распакуйте их и создайте символическую ссылку на них
в директории исходников Vitastor: `<vitastor>/fio`. Либо, если вы не хотите собирать плагин fio,
@@ -41,7 +41,7 @@ cmake .. && make -j8 install
Драйвер QEMU (qemu_driver.c) рекомендуется собирать вместе с самим QEMU. Для этого:
- Установите заголовки клиентской библиотеки Vitastor (из исходников или из пакета vitastor-client-dev)
- Возьмите соответствующий патч из `patches/qemu-*-vitastor.patch` и примените его к исходникам QEMU
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/vitastor.c`
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/block-vitastor.c`
- Соберите QEMU как обычно
Однако в целях отладки драйвер также можно собирать отдельно от QEMU. Для этого:
@@ -60,7 +60,7 @@ cmake .. && make -j8 install
* Для QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
- `config-host.h` и `qapi` нужны, т.к. в них содержатся автогенерируемые заголовки
- Сконфигурируйте cmake Vitastor с `WITH_QEMU=yes` (`cmake .. -DWITH_QEMU=yes`) и, если вы
используете RHEL-подобный дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
используете RHEL-подобый дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
- После этого в процессе сборки Vitastor также будет собираться подходящий для вашей
версии QEMU `block-vitastor.so`.
- Таким образом можно использовать драйвер даже с немодифицированным QEMU, но в этом случае

View File

@@ -7,7 +7,6 @@
# Quick Start
- [Preparation](#preparation)
- [Recommended drives](#recommended-drives)
- [Configure monitors](#configure-monitors)
- [Configure OSDs](#configure-osds)
- [Create a pool](#create-a-pool)
@@ -20,20 +19,10 @@
- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
[here](../config/layout-cluster.en.md#immediate_commit).
- If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- [Install Vitastor packages](../installation/packages.en.md).
## Recommended drives
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
## Configure monitors
On the monitor hosts:
@@ -56,10 +45,9 @@ On the monitor hosts:
}
```
- Initialize OSDs:
- SSD-only or HDD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`.
Add `--disable_data_fsync off` to leave disk write cache enabled if you use
desktop SSDs without capacitors. Do NOT add `--disable_data_fsync off` if you
use HDDs or SSD+HDD.
- SSD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. You can add
`--disable_data_fsync off` to leave disk cache enabled if you use desktop
SSDs without capacitors.
- Hybrid, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
Pass all your devices (HDD and SSD) to this script &mdash; it will partition disks and initialize journals on its own.
This script skips HDDs which are already partitioned so if you want to use non-empty disks for

View File

@@ -7,7 +7,6 @@
# Быстрый старт
- [Подготовка](#подготовка)
- [Рекомендуемые диски](#рекомендуемые-диски)
- [Настройте мониторы](#настройте-мониторы)
- [Настройте OSD](#настройте-osd)
- [Создайте пул](#создайте-пул)
@@ -20,20 +19,10 @@
- Возьмите серверы с SSD (SATA или NVMe), желательно с конденсаторами (серверные SSD). Можно
использовать и десктопные SSD, включив режим отложенного fsync, но производительность будет хуже.
О конденсаторах читайте [здесь](../config/layout-cluster.ru.md#immediate_commit).
- Если хотите использовать HDD, берите современные модели с Media или SSD кэшем - HGST Ultrastar,
Toshiba MG08, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места).
- Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2.
- Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- [Установите пакеты Vitastor](../installation/packages.ru.md).
## Рекомендуемые диски
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
## Настройте мониторы
На хостах, выделенных под мониторы:
@@ -56,10 +45,9 @@
}
```
- Инициализуйте OSD:
- Только SSD или только HDD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`.
Если вы используете десктопные SSD без конденсаторов, добавьте опцию `--disable_data_fsync off`,
чтобы оставить кэш записи диска включённым. НЕ добавляйте эту опцию, если используете
жёсткие диски (HDD).
- SSD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. Если вы используете
десктопные SSD без конденсаторов, можете оставить кэш включённым, добавив
опцию `--disable_data_fsync off`.
- Гибридные, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
Передайте все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит
разделы под журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы

View File

@@ -13,8 +13,6 @@ remains decent (see an example [here](../performance/comparison1.en.md#vitastor-
Vitastor Kubernetes CSI driver is based on NBD.
See also [VDUSE](qemu.en.md#vduse).
## Map image
To create a local block device for a Vitastor image run:

View File

@@ -16,8 +16,6 @@ NBD немного снижает производительность из-за
CSI-драйвер Kubernetes Vitastor основан на NBD.
Смотрите также [VDUSE](qemu.ru.md#vduse).
## Подключить устройство
Чтобы создать локальное блочное устройство для образа, выполните команду:

View File

@@ -83,44 +83,3 @@ qemu-img rebase -u -b '' testimg.qcow2
This can be used for backups. Just note that exporting an image that is currently being written to
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
on a live VM.
## VDUSE
Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.
VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
In this case reboot will be the only way to remove VDUSE devices from system.
On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
performance is important for you. Approximate performance numbers:
direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
Commands to attach Vitastor image as a VDUSE device:
```
modprobe vduse
modprobe virtio-vdpa
qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
"etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
--export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
vdpa dev add name test1 mgmtdev vduse
```
After running these commands /dev/vda device will appear in the system and you'll be able to
use it as a normal disk.
To remove the device:
```
vdpa dev del test1
kill <qemu-storage-daemon_process_PID>
```

View File

@@ -87,44 +87,3 @@ qemu-img rebase -u -b '' testimg.qcow2
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
## VDUSE
В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
Команды для подключения виртуального диска через VDUSE:
```
modprobe vduse
modprobe virtio-vdpa
qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
"etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
--export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
vdpa dev add name test1 mgmtdev vduse
```
После этого в системе появится устройство /dev/vda, которое можно будет использовать как
обычный диск.
Для удаления устройства из системы:
```
vdpa dev del test1
kill <PID_процесса_qemu-storage-daemon>
```

View File

@@ -63,9 +63,8 @@ Wants=network-online.target local-fs.target time-sync.target
[Service]
Restart=always
Environment=GOGC=50
ExecStart=etcd -name etcd${num} --data-dir /var/lib/etcd${num}.etcd \\
--snapshot-count 10000 --advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
--advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
--initial-advertise-peer-urls http://${etcds[num]}:2380 --listen-peer-urls http://${etcds[num]}:2380 \\
--initial-cluster-token vitastor-etcd-1 --initial-cluster ${etcd_cluster} \\
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\

View File

@@ -1497,14 +1497,10 @@ class Mon
break;
}
}
const pool_cfg = (this.state.config.pools[pool_id]||{});
if (!object_size)
{
object_size = pool_cfg.block_size || this.config.block_size || 131072;
}
if (pool_cfg.scheme !== 'replicated')
{
object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
object_size = (this.state.config.pools[pool_id]||{}).block_size ||
this.config.block_size || 131072;
}
object_size = BigInt(object_size);
for (const pg_num in this.state.pg.stats[pool_id])
@@ -1612,7 +1608,7 @@ class Mon
}
}
}
return { inode_stats, seen_pools };
return inode_stats;
}
serialize_bigints(obj)
@@ -1638,7 +1634,7 @@ class Mon
const timestamp = Date.now();
const { object_counts, object_bytes } = this.sum_object_counts();
let stats = this.sum_op_stats(timestamp, this.prev_stats);
let { inode_stats, seen_pools } = this.sum_inode_stats(
let inode_stats = this.sum_inode_stats(
this.prev_stats ? this.prev_stats.inode_stats : null,
timestamp, this.prev_stats ? this.prev_stats.timestamp : null
);
@@ -1673,22 +1669,12 @@ class Mon
}
for (const pool_id in this.state.pool.stats)
{
if (!seen_pools[pool_id])
{
txn.push({ requestDeleteRange: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
} });
delete this.state.pool.stats[pool_id];
}
else
{
const pool_stats = { ...this.state.pool.stats[pool_id] };
this.serialize_bigints(pool_stats);
txn.push({ requestPut: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
value: b64(JSON.stringify(pool_stats)),
} });
}
const pool_stats = { ...this.state.pool.stats[pool_id] };
this.serialize_bigints(pool_stats);
txn.push({ requestPut: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
value: b64(JSON.stringify(pool_stats)),
} });
}
if (txn.length)
{

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.9.6'
VERSION = '0.9.2'
LOG = logging.getLogger(__name__)

View File

@@ -1,176 +0,0 @@
diff --git a/block/Makefile.objs b/block/Makefile.objs
index d644bac60a..e404236291 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -19,6 +19,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
block-obj-$(CONFIG_LIBNFS) += nfs.o
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
block-obj-$(CONFIG_VXHS) += vxhs.o
block-obj-$(CONFIG_LIBSSH2) += ssh.o
@@ -39,6 +40,8 @@ curl.o-cflags := $(CURL_CFLAGS)
curl.o-libs := $(CURL_LIBS)
rbd.o-cflags := $(RBD_CFLAGS)
rbd.o-libs := $(RBD_LIBS)
+vitastor.o-cflags := $(VITASTOR_CFLAGS)
+vitastor.o-libs := $(VITASTOR_LIBS)
gluster.o-cflags := $(GLUSTERFS_CFLAGS)
gluster.o-libs := $(GLUSTERFS_LIBS)
vxhs.o-libs := $(VXHS_LIBS)
diff --git a/configure b/configure
index 0a19b033bc..58b7fbf24c 100755
--- a/configure
+++ b/configure
@@ -398,6 +398,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
libusb=""
usb_redir=""
@@ -1213,6 +1214,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1601,6 +1606,7 @@ disabled with --disable-FEATURE, default is enabled if available:
vhost-crypto vhost-crypto acceleration support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3594,6 +3600,27 @@ EOF
fi
fi
+##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
##########################################
# libssh2 probe
min_libssh2_version=1.2.8
@@ -5837,6 +5864,7 @@ echo "Trace output file $trace_file-<pid>"
fi
echo "spice support $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
echo "rbd support $rbd"
+echo "vitastor support $vitastor"
echo "xfsctl support $xfs"
echo "smartcard support $smartcard"
echo "libusb $libusb"
@@ -6416,6 +6444,11 @@ if test "$rbd" = "yes" ; then
echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=m" >> $config_host_mak
+ echo "VITASTOR_CFLAGS=$vitastor_cflags" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
diff --git a/qapi/block-core.json b/qapi/block-core.json
index c50517bff3..c780bb2c1c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2514,7 +2514,7 @@
'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
- 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
+ 'quorum', 'raw', 'rbd', 'vitastor', 'replication', 'sheepdog', 'ssh',
'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
##
@@ -3217,6 +3217,28 @@
'*snap-id': 'uint32',
'*tag': 'str' } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -3547,6 +3569,7 @@
'rbd': 'BlockdevOptionsRbd',
'replication':'BlockdevOptionsReplication',
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -3991,6 +4014,17 @@
'*subformat': 'BlockdevVhdxSubformat',
'*block-state-zero': 'bool' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVpcSubformat:
#
@@ -4074,6 +4108,7 @@
'rbd': 'BlockdevCreateOptionsRbd',
'replication': 'BlockdevCreateNotSupported',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'throttle': 'BlockdevCreateNotSupported',
'vdi': 'BlockdevCreateOptionsVdi',

View File

@@ -1,181 +0,0 @@
Index: qemu-5.2+dfsg/qapi/block-core.json
===================================================================
--- qemu-5.2+dfsg.orig/qapi/block-core.json
+++ qemu-5.2+dfsg/qapi/block-core.json
@@ -2831,7 +2831,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'sheepdog',
+ 'sheepdog', 'vitastor',
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
##
@@ -3668,6 +3668,28 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4015,6 +4037,7 @@
'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4404,6 +4427,17 @@
'*cluster-size' : 'size' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4665,6 +4699,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu-5.2+dfsg/block/meson.build
===================================================================
--- qemu-5.2+dfsg.orig/block/meson.build
+++ qemu-5.2+dfsg/block/meson.build
@@ -76,6 +76,7 @@ foreach m : [
['CONFIG_LIBNFS', 'nfs', libnfs, 'nfs.c'],
['CONFIG_LIBSSH', 'ssh', libssh, 'ssh.c'],
['CONFIG_RBD', 'rbd', rbd, 'rbd.c'],
+ ['CONFIG_VITASTOR', 'vitastor', vitastor, 'vitastor.c'],
]
if config_host.has_key(m[0])
if enable_modules
Index: qemu-5.2+dfsg/configure
===================================================================
--- qemu-5.2+dfsg.orig/configure
+++ qemu-5.2+dfsg/configure
@@ -372,6 +372,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
u2f="auto"
libusb=""
@@ -1263,6 +1264,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1827,6 +1832,7 @@ disabled with --disable-FEATURE, default
vhost-vdpa vhost-vdpa kernel backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3719,6 +3725,27 @@ EOF
fi
##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
+##########################################
# libssh probe
if test "$libssh" != "no" ; then
if $pkg_config --exists libssh; then
@@ -6456,6 +6483,10 @@ if test "$rbd" = "yes" ; then
echo "CONFIG_RBD=y" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=y" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
Index: qemu-5.2+dfsg/meson.build
===================================================================
--- qemu-5.2+dfsg.orig/meson.build
+++ qemu-5.2+dfsg/meson.build
@@ -596,6 +596,10 @@ rbd = not_found
if 'CONFIG_RBD' in config_host
rbd = declare_dependency(link_args: config_host['RBD_LIBS'].split())
endif
+vitastor = not_found
+if 'CONFIG_VITASTOR' in config_host
+ vitastor = declare_dependency(link_args: config_host['VITASTOR_LIBS'].split())
+endif
glusterfs = not_found
if 'CONFIG_GLUSTERFS' in config_host
glusterfs = declare_dependency(compile_args: config_host['GLUSTERFS_CFLAGS'].split(),
@@ -2145,6 +2149,7 @@ endif
# TODO: add back protocol and server version
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
summary_info += {'rbd support': config_host.has_key('CONFIG_RBD')}
+summary_info += {'vitastor support': config_host.has_key('CONFIG_VITASTOR')}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': config_host.has_key('CONFIG_SMARTCARD')}
summary_info += {'U2F support': u2f.found()}

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.9.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.6$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.9.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.2$(rpm --eval '%dist').tar.gz *

View File

@@ -22,7 +22,7 @@
Name: qemu-kvm
Version: 4.2.0
-Release: 29.vitastor%{?dist}.6
+Release: 34.vitastor%{?dist}.6
+Release: 32.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY

View File

@@ -13,7 +13,7 @@
Name: qemu-kvm
Version: 4.2.0
-Release: 29%{?dist}.6
+Release: 33.vitastor%{?dist}.6
+Release: 32.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY

View File

@@ -1,103 +0,0 @@
--- qemu-kvm-6.2.spec.orig 2023-07-18 13:52:57.636625440 +0000
+++ qemu-kvm-6.2.spec 2023-07-18 13:52:19.011683886 +0000
@@ -73,6 +73,7 @@ Requires: %{name}-hw-usbredir = %{epoch}
%endif \
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
# Macro to properly setup RHEL/RHEV conflict handling
@@ -83,7 +84,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 6.2.0
-Release: 32%{?rcrel}%{?dist}
+Release: 32.vitastor%{?rcrel}%{?dist}
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
@@ -122,6 +123,7 @@ Source37: tests_data_acpi_pc_SSDT.dimmpx
Source38: tests_data_acpi_q35_FACP.slic
Source39: tests_data_acpi_q35_SSDT.dimmpxm
Source40: tests_data_acpi_virt_SSDT.memhp
+Source41: qemu-vitastor.c
Patch0001: 0001-redhat-Adding-slirp-to-the-exploded-tree.patch
Patch0005: 0005-Initial-redhat-build.patch
@@ -652,6 +654,7 @@ Patch255: kvm-scsi-protect-req-aiocb-wit
Patch256: kvm-dma-helpers-prevent-dma_blk_cb-vs-dma_aio_cancel-rac.patch
# For bz#2090990 - qemu crash with error scsi_req_unref(SCSIRequest *): Assertion `req->refcount > 0' failed or scsi_dma_complete(void *, int): Assertion `r->req.aiocb != NULL' failed [8.7.0]
Patch257: kvm-virtio-scsi-reset-SCSI-devices-from-main-loop-thread.patch
+Patch258: qemu-6.2-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build
@@ -689,6 +692,7 @@ BuildRequires: libcurl-devel
BuildRequires: libssh-devel
BuildRequires: librados-devel
BuildRequires: librbd-devel
+BuildRequires: vitastor-client-devel
%if %{have_gluster}
# For gluster block driver
BuildRequires: glusterfs-api-devel
@@ -926,6 +930,14 @@ Install this package if you want to acce
using the rbd protocol.
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package block-ssh
Summary: QEMU SSH block driver
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -979,6 +991,7 @@ This package provides usbredir support.
rm -fr slirp
mkdir slirp
%autopatch -p1
+cp %{SOURCE41} ./block/vitastor.c
%global qemu_kvm_build qemu_kvm_build
mkdir -p %{qemu_kvm_build}
@@ -994,7 +1007,7 @@ cp -f %{SOURCE40} tests/data/acpi/virt/S
# --build-id option is used for giving info to the debug packages.
buildldflags="VL_LDFLAGS=-Wl,--build-id"
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle
%if 0%{have_gluster}
%global block_drivers_list %{block_drivers_list},gluster
@@ -1149,9 +1162,7 @@ pushd %{qemu_kvm_build}
--firmwarepath=%{_prefix}/share/qemu-firmware \
--meson="git" \
--target-list="%{buildarch}" \
- --block-drv-rw-whitelist=%{block_drivers_list} \
--audio-drv-list= \
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
--with-coroutine=ucontext \
--with-git=git \
--tls-priority=@QEMU,SYSTEM \
@@ -1197,6 +1208,7 @@ pushd %{qemu_kvm_build}
%endif
--enable-pie \
--enable-rbd \
+ --enable-vitastor \
%if 0%{have_librdma}
--enable-rdma \
%endif
@@ -1794,6 +1806,9 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.
%files block-rbd
%{_libdir}/qemu-kvm/block-rbd.so
+%files block-vitastor
+%{_libdir}/qemu-kvm/block-vitastor.so
+
%files block-ssh
%{_libdir}/qemu-kvm/block-ssh.so

View File

@@ -1,93 +0,0 @@
--- qemu-kvm-7.2.spec.orig 2023-06-22 13:56:19.000000000 +0000
+++ qemu-kvm-7.2.spec 2023-07-18 07:55:22.347090196 +0000
@@ -100,8 +100,6 @@
%endif
%global target_list %{kvm_target}-softmmu
-%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress
-%global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https
%define qemudocdir %{_docdir}/%{name}
%global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios"
@@ -126,6 +124,7 @@ Requires: %{name}-device-usb-host = %{ep
Requires: %{name}-device-usb-redirect = %{epoch}:%{version}-%{release} \
%endif \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
Requires: %{name}-audio-pa = %{epoch}:%{version}-%{release}
# Since SPICE is removed from RHEL-9, the following Obsoletes:
@@ -148,7 +147,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 7.2.0
-Release: 14%{?rcrel}%{?dist}%{?cc_suffix}.1
+Release: 14.vitastor%{?rcrel}%{?dist}%{?cc_suffix}.1
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
# Epoch 15 used for RHEL 8
# Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
@@ -171,6 +170,7 @@ Source28: 95-kvm-memlock.conf
Source30: kvm-s390x.conf
Source31: kvm-x86.conf
Source36: README.tests
+Source37: qemu-vitastor.c
Patch0004: 0004-Initial-redhat-build.patch
@@ -418,6 +418,7 @@ Patch134: kvm-target-i386-Fix-BZHI-instr
Patch135: kvm-intel-iommu-fail-DEVIOTLB_UNMAP-without-dt-mode.patch
# For bz#2203745 - Disk detach is unsuccessful while the guest is still booting [rhel-9.2.0.z]
Patch136: kvm-acpi-pcihp-allow-repeating-hot-unplug-requests.patch
+Patch137: qemu-7.2-vitastor.patch
%if %{have_clang}
BuildRequires: clang
@@ -449,6 +450,7 @@ BuildRequires: libcurl-devel
%if %{have_block_rbd}
BuildRequires: librbd-devel
%endif
+BuildRequires: vitastor-client-devel
# We need both because the 'stap' binary is probed for by configure
BuildRequires: systemtap
BuildRequires: systemtap-sdt-devel
@@ -642,6 +644,14 @@ using the rbd protocol.
%endif
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package audio-pa
Summary: QEMU PulseAudio audio driver
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -719,6 +729,7 @@ This package provides usbredir support.
%prep
%setup -q -n qemu-%{version}%{?rcstr}
%autopatch -p1
+cp %{SOURCE37} ./block/vitastor.c
%global qemu_kvm_build qemu_kvm_build
mkdir -p %{qemu_kvm_build}
@@ -946,6 +957,7 @@ run_configure \
%if %{have_block_rbd}
--enable-rbd \
%endif
+ --enable-vitastor \
%if %{have_librdma}
--enable-rdma \
%endif
@@ -1426,6 +1438,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%files block-rbd
%{_libdir}/%{name}/block-rbd.so
%endif
+%files block-vitastor
+%{_libdir}/%{name}/block-vitastor.so
+
%files audio-pa
%{_libdir}/%{name}/audio-pa.so

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.9.6.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.9.2.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.9.6
Version: 0.9.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.9.6.el7.tar.gz
Source0: vitastor-0.9.2.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.9.6.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.9.2.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.9.6
Version: 0.9.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.9.6.el8.tar.gz
Source0: vitastor-0.9.2.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.9.6.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.9.2.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.9.6
Version: 0.9.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.9.6.el9.tar.gz
Source0: vitastor-0.9.2.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.9.6")
add_definitions(-DVERSION="0.9.2")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)

View File

@@ -714,15 +714,9 @@ resume_1:
return false;
}
}
if (new_trim_pos < bs->journal.used_start
? (bs->journal.dirty_start >= bs->journal.used_start || bs->journal.dirty_start < new_trim_pos)
: (bs->journal.dirty_start >= bs->journal.used_start && bs->journal.dirty_start < new_trim_pos))
{
bs->journal.dirty_start = new_trim_pos;
}
bs->journal.used_start = new_trim_pos;
#ifdef BLOCKSTORE_DEBUG
printf("Journal trimmed to %08lx (next_free=%08lx dirty_start=%08lx)\n", bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start);
printf("Journal trimmed to %08lx (next_free=%08lx)\n", bs->journal.used_start, bs->journal.next_free);
#endif
if (bs->journal.flush_journal && !flusher->flush_queue.size())
{

View File

@@ -103,7 +103,6 @@ public:
journal_flusher_t(blockstore_impl_t *bs);
~journal_flusher_t();
void loop();
bool is_trim_wanted() { return trim_wanted; }
bool is_active();
void mark_trim_possible();
void request_trim();

View File

@@ -85,13 +85,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
immediate_commit = IMMEDIATE_SMALL;
}
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
inmemory_meta = config["inmemory_metadata"] != "false";
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
journal.inmemory = config["inmemory_journal"] != "false";
// Validate
if (journal.sector_count < 2)
{

View File

@@ -218,7 +218,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
auto used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08lx by %lx:%lx v%lu (%lu refs)\n", dirty_it->second.journal_sector,
"remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
#endif

View File

@@ -661,13 +661,8 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
uint64_t s = PRIV(op)->min_flushed_journal_sector;
while (1)
{
if (!journal.sector_info[s-1].dirty && journal.sector_info[s-1].flush_count == 0)
if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0)
{
if (s == (1+journal.cur_sector))
{
// Forcibly move to the next sector and move dirty position
journal.in_sector_pos = journal.block_size;
}
// We know for sure that we won't write into this sector anymore
uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
if (new_ds >= journal.len)

View File

@@ -56,15 +56,14 @@ struct image_lister_t
{
continue;
}
auto pool_it = parent->cli->st_cli.pool_config.find(INODE_POOL(ic.second.num));
bool good_pool = pool_it != parent->cli->st_cli.pool_config.end();
auto & pool_cfg = parent->cli->st_cli.pool_config.at(INODE_POOL(ic.second.num));
auto item = json11::Json::object {
{ "name", ic.second.name },
{ "size", ic.second.size },
{ "used_size", 0 },
{ "readonly", ic.second.readonly },
{ "pool_id", (uint64_t)INODE_POOL(ic.second.num) },
{ "pool_name", good_pool ? pool_it->second.name : "? (ID:"+std::to_string(INODE_POOL(ic.second.num))+")" },
{ "pool_name", pool_cfg.name },
{ "inode_num", INODE_NO_POOL(ic.second.num) },
{ "inode_id", ic.second.num },
};
@@ -248,8 +247,6 @@ resume_1:
if (state == 1)
goto resume_1;
get_list();
if (state == 100)
return;
if (show_stats)
{
resume_1:
@@ -272,7 +269,7 @@ resume_1:
{ "key", "name" },
{ "title", "NAME" },
});
if (list_pool_name == "")
if (!list_pool_id)
{
cols.push_back(json11::Json::object{
{ "key", "pool_name" },
@@ -379,18 +376,16 @@ resume_1:
std::string print_table(json11::Json items, json11::Json header, bool use_esc)
{
int header_sizes[header.array_items().size()];
std::vector<int> sizes;
for (int i = 0; i < header.array_items().size(); i++)
{
header_sizes[i] = utf8_length(header[i]["title"].string_value());
sizes.push_back(header_sizes[i]);
sizes.push_back(header[i]["title"].string_value().length());
}
for (auto & item: items.array_items())
{
for (int i = 0; i < header.array_items().size(); i++)
{
int l = utf8_length(item[header[i]["key"].string_value()].as_string());
int l = item[header[i]["key"].string_value()].as_string().length();
sizes[i] = sizes[i] < l ? l : sizes[i];
}
}
@@ -402,7 +397,7 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
// Separator
str += " ";
}
int pad = sizes[i]-header_sizes[i];
int pad = sizes[i]-header[i]["title"].string_value().length();
if (header[i]["right"].bool_value())
{
// Align right
@@ -430,7 +425,7 @@ std::string print_table(json11::Json items, json11::Json header, bool use_esc)
// Separator
str += " ";
}
int pad = sizes[i] - utf8_length(item[header[i]["key"].string_value()].as_string());
int pad = sizes[i] - item[header[i]["key"].string_value()].as_string().length();
if (header[i]["right"].bool_value())
{
// Align right

View File

@@ -13,7 +13,7 @@ struct image_changer_t
std::string image_name;
std::string new_name;
uint64_t new_size = 0;
bool force_size = false, inc_size = false;
bool force_size = false;
bool set_readonly = false, set_readwrite = false, force = false;
// interval between fsyncs
int fsync_interval = 128;
@@ -81,14 +81,14 @@ struct image_changer_t
}
if ((!set_readwrite || !cfg.readonly) &&
(!set_readonly || cfg.readonly) &&
(!new_size && !force_size || cfg.size == new_size || cfg.size >= new_size && inc_size) &&
(!new_size && !force_size || cfg.size == new_size) &&
(new_name == "" || new_name == image_name))
{
result = (cli_result_t){ .text = "No change" };
state = 100;
return;
}
if ((new_size != 0 || force_size) && (cfg.size < new_size || !inc_size))
if (new_size != 0 || force_size)
{
if (cfg.size >= new_size)
{
@@ -233,7 +233,6 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
changer->new_name = cfg["rename"].string_value();
changer->new_size = parse_size(cfg["resize"].as_string());
changer->force_size = cfg["force_size"].bool_value();
changer->inc_size = cfg["inc_size"].bool_value();
changer->force = cfg["force"].bool_value();
changer->set_readonly = cfg["readonly"].bool_value();
changer->set_readwrite = cfg["readwrite"].bool_value();

View File

@@ -99,16 +99,15 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
if (options["journal_size"] == "")
{
if (options["journal_device"] == "")
options["journal_size"] = is_hdd ? "128M" : "32M";
options["journal_size"] = "32M";
else if (is_hdd)
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
}
bool is_hybrid = is_hdd && options["journal_device"] != "" && options["journal_device"] != options["data_device"];
if (is_hdd)
{
if (options["block_size"] == "")
options["block_size"] = "1M";
if (is_hybrid && options["throttle_small_writes"] == "")
if (options["throttle_small_writes"] == "")
options["throttle_small_writes"] = "1";
}
json11::Json::object sb;
@@ -135,7 +134,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
{ "meta_offset", 4096 + (dsk.meta_device == dsk.journal_device ? dsk.journal_len : 0) },
{ "data_offset", 4096 + (dsk.data_device == dsk.meta_device ? dsk.meta_len : 0) +
(dsk.data_device == dsk.journal_device ? dsk.journal_len : 0) },
{ "journal_no_same_sector_overwrites", !is_hdd || is_hybrid },
{ "journal_no_same_sector_overwrites", true },
{ "journal_sector_buffer_count", 1024 },
{ "disable_data_fsync", json_is_true(options["disable_data_fsync"]) },
{ "disable_meta_fsync", json_is_true(options["disable_meta_fsync"]) },
@@ -147,7 +146,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
for (int i = 0; i < sizeof(allow_additional_params)/sizeof(allow_additional_params[0]); i++)
{
auto it = options.find(allow_additional_params[i]);
if (it != options.end() && it->second != "")
if (it != options.end())
{
sb[it->first] = it->second;
}
@@ -621,7 +620,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
}
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, dev.is_hdd ? 1 : 0);
prepare_one(options, hybrid && dev.is_hdd ? 1 : 0);
if (hybrid)
{
options.erase("journal_device");

View File

@@ -264,7 +264,6 @@ int write_zero(int fd, uint64_t offset, uint64_t size)
{
uint64_t buf_len = 1024*1024;
void *zero_buf = memalign_or_die(MEM_ALIGNMENT, buf_len);
memset(zero_buf, 0, buf_len);
ssize_t r;
while (size > 0)
{

View File

@@ -187,30 +187,22 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
check_addr = addr;
if (pos == std::string::npos)
addr += "/v3";
bool local = false;
int i;
for (i = 0; i < local_ips.size(); i++)
{
if (local_ips[i] == check_addr)
{
local = true;
this->etcd_local.push_back(addr);
break;
}
}
auto & to = local ? this->etcd_local : this->etcd_addresses;
for (i = 0; i < to.size(); i++)
{
if (to[i] == addr)
break;
}
if (i >= to.size())
to.push_back(addr);
if (i >= local_ips.size())
this->etcd_addresses.push_back(addr);
}
}
void etcd_state_client_t::parse_config(const json11::Json & config)
{
this->etcd_local.clear();
this->etcd_addresses.clear();
if (config["etcd_address"].is_string())
{
@@ -357,7 +349,7 @@ void etcd_state_client_t::start_etcd_watcher()
watch_id == ETCD_OSD_STATE_WATCH_ID)
etcd_watches_initialised++;
if (etcd_watches_initialised == 4 && this->log_level > 0)
fprintf(stderr, "Successfully subscribed to etcd at %s\n", cur_addr.c_str());
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
}
if (data["result"]["canceled"].bool_value())
{
@@ -368,17 +360,15 @@ void etcd_state_client_t::start_etcd_watcher()
// so we should restart from the beginning if we can
if (on_reload_hook != NULL)
{
// check to not trigger on_reload_hook multiple times
if (etcd_watch_ws != NULL)
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
data["result"]["compact_revision"].uint64_value());
if (etcd_watch_ws)
{
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
data["result"]["compact_revision"].uint64_value());
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
etcd_watch_revision = 0;
on_reload_hook();
}
return;
etcd_watch_revision = 0;
on_reload_hook();
}
else
{
@@ -425,9 +415,13 @@ void etcd_state_client_t::start_etcd_watcher()
}
if (msg->eof)
{
fprintf(stderr, "Disconnected from etcd %s\n", cur_addr.c_str());
if (cur_addr == selected_etcd_address)
{
fprintf(stderr, "Disconnected from etcd %s\n", selected_etcd_address.c_str());
selected_etcd_address = "";
}
else
fprintf(stderr, "Disconnected from etcd\n");
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
@@ -444,7 +438,6 @@ void etcd_state_client_t::start_etcd_watcher()
else if (etcd_watches_initialised > 0)
{
// Connection was live, retry immediately
etcd_watches_initialised = 0;
start_etcd_watcher();
}
}

View File

@@ -132,6 +132,7 @@ protected:
uint64_t rdma_max_msg = 0;
#endif
bool has_send_loop = false;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
@@ -159,6 +160,7 @@ public:
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
void read_requests();
void send_replies();
bool can_send();
void accept_connections(int listen_fd);
~osd_messenger_t();

View File

@@ -111,22 +111,28 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
return;
}
#endif
if (!ringloop)
if (!ringloop && !has_send_loop)
{
// FIXME: It's worse because it doesn't allow batching
// "Send loop" should be used, like in QEMU driver, or performance will suffer
// due to a large number of sendmsg() syscalls where they can be avoided.
// "Send loop" increases 4k T1Q128 randread from ~55k iops to ~90k on a 1 OSD.
// The difference is smaller when there are more OSDs though...
while (cl->outbox.size())
{
try_send(cl);
}
}
else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
else if (cl->write_msg.msg_iovlen > 0 || has_send_loop || !try_send(cl))
{
if (cl->write_state == 0)
{
cl->write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);
}
ringloop->wakeup();
if (ringloop)
{
ringloop->wakeup();
}
}
}
@@ -218,6 +224,12 @@ void osd_messenger_t::send_replies()
write_ready_clients.clear();
}
bool osd_messenger_t::can_send()
{
has_send_loop = true;
return write_ready_clients.size() > 0;
}
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
{
cl->write_msg.msg_iovlen = 0;

View File

@@ -190,15 +190,7 @@ static int nfs3_setattr_proc(void *opaque, rpc_op_t *rop)
{
if (handle == "roothandle" || self->parent->dir_by_hash.find(handle) != self->parent->dir_by_hash.end())
{
if (args->new_attributes.size.set_it)
{
*reply = (SETATTR3res){ .status = NFS3ERR_ISDIR };
}
else
{
// Silently ignore mode, uid, gid, atime, mtime changes
*reply = (SETATTR3res){ .status = NFS3_OK };
}
*reply = (SETATTR3res){ .status = NFS3ERR_ISDIR };
}
else
{
@@ -366,6 +358,7 @@ static int nfs3_read_proc(void *opaque, rpc_op_t *rop)
}
static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf);
static void nfs_do_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf);
static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
{
@@ -399,6 +392,7 @@ static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
.resok = (WRITE3resok){
//.file_wcc = ...,
.count = (unsigned)count,
.committed = args->stable,
},
};
if ((args->offset % alignment) != 0 || (count % alignment) != 0)
@@ -442,101 +436,42 @@ static int nfs3_write_proc(void *opaque, rpc_op_t *rop)
return 1;
}
static void complete_extend_write(nfs_client_t *self, rpc_op_t *rop, inode_t inode, int res)
static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf)
{
WRITE3args *args = (WRITE3args*)rop->request;
WRITE3res *reply = (WRITE3res*)rop->reply;
if (res < 0)
{
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(res) };
rpc_queue_reply(rop);
return;
}
bool imm = self->parent->cli->get_immediate_commit(inode);
reply->resok.committed = args->stable != UNSTABLE || imm ? FILE_SYNC : UNSTABLE;
*(uint64_t*)reply->resok.verf = self->parent->server_id;
if (args->stable != UNSTABLE && !imm)
{
// Client requested a stable write. Add an fsync
auto op = new cluster_op_t;
op->opcode = OSD_OP_SYNC;
op->callback = [rop](cluster_op_t *op)
{
if (op->retval != 0)
{
WRITE3res *reply = (WRITE3res*)rop->reply;
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) };
}
delete op;
rpc_queue_reply(rop);
};
self->parent->cli->execute(op);
}
else
{
rpc_queue_reply(rop);
}
}
static void complete_extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size, int err)
{
auto ext_it = self->extend_writes.lower_bound((extend_size_t){ .inode = inode, .new_size = 0 });
while (ext_it != self->extend_writes.end() &&
ext_it->first.inode == inode &&
ext_it->first.new_size <= new_size)
{
ext_it->second.resize_res = err;
if (ext_it->second.write_res <= 0)
{
complete_extend_write(self, ext_it->second.rop, inode, ext_it->second.write_res < 0
? ext_it->second.write_res : ext_it->second.resize_res);
self->extend_writes.erase(ext_it++);
}
else
ext_it++;
}
}
static void extend_inode(nfs_client_t *self, uint64_t inode, uint64_t new_size)
{
// Send an extend request
auto & ext = self->extends[inode];
ext.cur_extend = new_size;
// Check if we have to resize the inode before writing
auto inode_it = self->parent->cli->st_cli.inode_config.find(inode);
if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
inode_it->second.size < new_size)
{
self->parent->cmd->loop_and_wait(self->parent->cmd->start_modify(json11::Json::object {
// FIXME: Resizing by ID is probably more correct
{ "image", inode_it->second.name },
{ "resize", new_size },
{ "inc_size", true },
{ "force_size", true },
}), [=](const cli_result_t & r)
{
auto & ext = self->extends[inode];
if (r.err)
{
fprintf(stderr, "Error extending inode %lu to %lu bytes: %s\n", inode, new_size, r.text.c_str());
}
if (r.err == EAGAIN || ext.next_extend > ext.cur_extend)
{
// Multiple concurrent resize requests received, try to repeat
extend_inode(self, inode, ext.next_extend > ext.cur_extend ? ext.next_extend : ext.cur_extend);
if (r.err == EAGAIN)
{
// Multiple concurrent resize requests received, try to repeat
nfs_resize_write(self, rop, inode, new_size, offset, count, buf);
return;
}
WRITE3res *reply = (WRITE3res*)rop->reply;
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(r.err) };
rpc_queue_reply(rop);
return;
}
ext.cur_extend = ext.next_extend = 0;
complete_extend_inode(self, inode, new_size, r.err);
nfs_do_write(self, rop, inode, offset, count, buf);
});
}
else
{
complete_extend_inode(self, inode, new_size, 0);
nfs_do_write(self, rop, inode, offset, count, buf);
}
}
static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend_write_t>::iterator ewr_it,
rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf)
static void nfs_do_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t offset, uint64_t count, void *buf)
{
cluster_op_t *op = new cluster_op_t;
op->opcode = OSD_OP_WRITE;
@@ -544,61 +479,48 @@ static void nfs_do_write(nfs_client_t *self, std::multimap<extend_size_t, extend
op->offset = offset;
op->len = count;
op->iov.push_back(buf, count);
op->callback = [self, ewr_it, rop](cluster_op_t *op)
op->callback = [self, rop](cluster_op_t *op)
{
auto inode = op->inode;
int write_res = op->retval < 0 ? op->retval : (op->retval != op->len ? -ERANGE : 0);
if (ewr_it == self->extend_writes.end())
uint64_t inode = op->inode;
WRITE3args *args = (WRITE3args*)rop->request;
WRITE3res *reply = (WRITE3res*)rop->reply;
if (op->retval != op->len)
{
complete_extend_write(self, rop, inode, write_res);
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) };
delete op;
rpc_queue_reply(rop);
}
else
{
ewr_it->second.write_res = write_res;
if (ewr_it->second.resize_res <= 0)
*(uint64_t*)reply->resok.verf = self->parent->server_id;
delete op;
if (args->stable != UNSTABLE &&
!self->parent->cli->get_immediate_commit(inode))
{
complete_extend_write(self, rop, inode, write_res < 0 ? write_res : ewr_it->second.resize_res);
self->extend_writes.erase(ewr_it);
// Client requested a stable write. Add an fsync
op = new cluster_op_t;
op->opcode = OSD_OP_SYNC;
op->callback = [rop](cluster_op_t *op)
{
if (op->retval != 0)
{
WRITE3res *reply = (WRITE3res*)rop->reply;
*reply = (WRITE3res){ .status = vitastor_nfs_map_err(-op->retval) };
}
delete op;
rpc_queue_reply(rop);
};
self->parent->cli->execute(op);
}
else
{
rpc_queue_reply(rop);
}
}
};
self->parent->cli->execute(op);
}
static void nfs_resize_write(nfs_client_t *self, rpc_op_t *rop, uint64_t inode, uint64_t new_size, uint64_t offset, uint64_t count, void *buf)
{
// Check if we have to resize the inode during write
auto inode_it = self->parent->cli->st_cli.inode_config.find(inode);
if (inode_it != self->parent->cli->st_cli.inode_config.end() &&
inode_it->second.size < new_size)
{
auto ewr_it = self->extend_writes.emplace((extend_size_t){
.inode = inode,
.new_size = new_size,
}, (extend_write_t){
.rop = rop,
.resize_res = 1,
.write_res = 1,
});
auto & ext = self->extends[inode];
if (ext.cur_extend > 0)
{
// Already resizing, just wait
if (ext.next_extend < new_size)
ext.next_extend = new_size;
}
else
{
extend_inode(self, inode, new_size);
}
nfs_do_write(self, ewr_it, rop, inode, offset, count, buf);
}
else
{
nfs_do_write(self, self->extend_writes.end(), rop, inode, offset, count, buf);
}
}
static int nfs3_create_proc(void *opaque, rpc_op_t *rop)
{
nfs_client_t *self = (nfs_client_t*)opaque;
@@ -959,27 +881,6 @@ static int nfs3_link_proc(void *opaque, rpc_op_t *rop)
return 0;
}
static void fill_dir_entry(nfs_client_t *self, rpc_op_t *rop,
std::map<std::string, nfs_dir_t>::iterator dir_id_it, struct entryplus3 *entry, bool is_plus)
{
if (dir_id_it == self->parent->dir_info.end())
{
return;
}
entry->fileid = dir_id_it->second.id;
if (is_plus)
{
entry->name_attributes = (post_op_attr){
.attributes_follow = 1,
.attributes = get_dir_attributes(self, dir_id_it->first),
};
entry->name_handle = (post_op_fh3){
.handle_follows = 1,
.handle = xdr_copy_string(rop->xdrs, "S"+base64_encode(sha256(dir_id_it->first))),
};
}
}
static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
{
nfs_client_t *self = (nfs_client_t*)opaque;
@@ -1057,17 +958,17 @@ static void nfs3_readdir_common(void *opaque, rpc_op_t *rop, bool is_plus)
continue;
std::string subname = dir_id_it->first.substr(prefix.size());
// for directories, fileid changes when the user restarts proxy
fill_dir_entry(self, rop, dir_id_it, &entries[subname], is_plus);
}
// Add . and ..
{
auto dir_id_it = self->parent->dir_info.find(dir);
fill_dir_entry(self, rop, dir_id_it, &entries["."], is_plus);
auto sl = dir.rfind("/");
if (sl != std::string::npos)
entries[subname].fileid = dir_id_it->second.id;
if (is_plus)
{
auto dir_id_it = self->parent->dir_info.find(dir.substr(0, sl));
fill_dir_entry(self, rop, dir_id_it, &entries[".."], is_plus);
entries[subname].name_attributes = (post_op_attr){
.attributes_follow = 1,
.attributes = get_dir_attributes(self, dir_id_it->first),
};
entries[subname].name_handle = (post_op_fh3){
.handle_follows = 1,
.handle = xdr_copy_string(rop->xdrs, "S"+base64_encode(sha256(dir_id_it->first))),
};
}
}
// Offset results by the continuation cookie (equal to index in the listing)
@@ -1292,11 +1193,10 @@ static int nfs3_commit_proc(void *opaque, rpc_op_t *rop)
cluster_op_t *op = new cluster_op_t;
// fsync. we don't know how to fsync a single inode, so just fsync everything
op->opcode = OSD_OP_SYNC;
op->callback = [self, rop](cluster_op_t *op)
op->callback = [rop](cluster_op_t *op)
{
COMMIT3res *reply = (COMMIT3res*)rop->reply;
*reply = (COMMIT3res){ .status = vitastor_nfs_map_err(op->retval) };
*(uint64_t*)reply->resok.verf = self->parent->server_id;
rpc_queue_reply(rop);
};
self->parent->cli->execute(op);

View File

@@ -86,28 +86,6 @@ struct rpc_free_buffer_t
unsigned size;
};
struct extend_size_t
{
inode_t inode;
uint64_t new_size;
};
inline bool operator < (const extend_size_t &a, const extend_size_t &b)
{
return a.inode < b.inode || a.inode == b.inode && a.new_size < b.new_size;
}
struct extend_write_t
{
rpc_op_t *rop;
int resize_res, write_res; // 1 = started, 0 = completed OK, -errno = completed with error
};
struct extend_inode_t
{
uint64_t cur_extend = 0, next_extend = 0;
};
class nfs_client_t
{
public:
@@ -122,8 +100,6 @@ public:
rpc_cur_buffer_t cur_buffer = { 0 };
std::map<uint8_t*, rpc_used_buffer_t> used_buffers;
std::vector<rpc_free_buffer_t> free_buffers;
std::map<inode_t, extend_inode_t> extends;
std::multimap<extend_size_t, extend_write_t> extend_writes;
iovec read_iov;
msghdr read_msg = { 0 };

View File

@@ -198,14 +198,13 @@ class osd_t
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
void on_load_config_hook(json11::Json::object & changes);
void on_reload_config_hook(json11::Json::object & changes);
json11::Json on_load_pgs_checks_hook();
void on_load_pgs_hook(bool success);
void bind_socket();
void acquire_lease();
json11::Json get_osd_state();
void create_osd_state();
void renew_lease(bool reload);
void renew_lease();
void print_stats();
void print_slow();
void reset_stats();

View File

@@ -70,7 +70,6 @@ void osd_t::init_cluster()
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
peering_state = OSD_LOADING_PGS;
st_cli.load_global_config();
}
@@ -396,14 +395,6 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
parse_config(true);
bind_socket();
acquire_lease();
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_reload_config_hook(cfg); };
}
void osd_t::on_reload_config_hook(json11::Json::object & global_config)
{
etcd_global_config = global_config;
parse_config(false);
renew_lease(true);
}
// Acquire lease
@@ -433,7 +424,7 @@ void osd_t::acquire_lease()
);
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
{
renew_lease(false);
renew_lease();
});
}
@@ -508,11 +499,11 @@ void osd_t::create_osd_state()
}
// Renew lease
void osd_t::renew_lease(bool reload)
void osd_t::renew_lease()
{
st_cli.etcd_call("/lease/keepalive", json11::Json::object {
{ "ID", etcd_lease_id }
}, st_cli.etcd_quick_timeout, 0, 0, [this, reload](std::string err, json11::Json data)
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
{
if (err == "" && data["result"]["TTL"].string_value() == "")
{
@@ -531,20 +522,15 @@ void osd_t::renew_lease(bool reload)
force_stop(1);
}
// Retry
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this, reload](int timer_id)
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
{
renew_lease(reload);
renew_lease();
});
}
else
{
etcd_failed_attempts = 0;
report_statistics();
// Reload PGs
if (reload && run_primary)
{
st_cli.load_pgs();
}
}
});
}
@@ -574,6 +560,7 @@ void osd_t::force_stop(int exitcode)
json11::Json osd_t::on_load_pgs_checks_hook()
{
assert(this->pgs.size() == 0);
json11::Json::array checks = {
json11::Json::object {
{ "target", "LEASE" },

View File

@@ -58,8 +58,6 @@ typedef struct VitastorFdData VitastorFdData;
typedef struct VitastorClient
{
void *proxy;
int uring_eventfd;
void *watch;
char *config_path;
char *etcd_host;
@@ -79,7 +77,7 @@ typedef struct VitastorClient
AioContext *ctx;
VitastorFdData **fds;
int fd_count, fd_alloc;
int bh_uring_scheduled;
int bh_send_all_scheduled;
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
uint32_t last_bitmap_granularity;
@@ -235,52 +233,47 @@ out:
}
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
static void vitastor_uring_handler(void *opaque)
static void vitastor_bh_send_all(void *opaque)
{
VitastorClient *client = (VitastorClient*)opaque;
qemu_mutex_lock(&client->mutex);
client->bh_uring_scheduled = 0;
vitastor_c_uring_handle_events(client->proxy);
qemu_mutex_unlock(&client->mutex);
}
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
static void vitastor_bh_uring_handler(void *opaque)
{
VitastorBH *vbh = opaque;
vitastor_bh_handler(vbh->cli);
VitastorClient *client = vbh->cli;
#else
VitastorClient *client = opaque;
#endif
qemu_mutex_lock(&client->mutex);
client->bh_send_all_scheduled = 0;
vitastor_c_qemu_send_all(client->proxy);
qemu_mutex_unlock(&client->mutex);
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
qemu_bh_delete(vbh->bh);
free(vbh);
#endif
}
#endif
static void vitastor_schedule_uring_handler(VitastorClient *client)
static void vitastor_schedule_send_all(VitastorClient *client)
{
void *opaque = client;
if (client->uring_eventfd >= 0 && !client->bh_uring_scheduled)
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
if (!client->bh_send_all_scheduled)
{
client->bh_uring_scheduled = 1;
client->bh_send_all_scheduled = 1;
#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
replay_bh_schedule_oneshot_event(client->ctx, vitastor_uring_handler, opaque);
replay_bh_schedule_oneshot_event(client->ctx, vitastor_bh_send_all, client);
#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
aio_bh_schedule_oneshot(client->ctx, vitastor_uring_handler, opaque);
#else
aio_bh_schedule_oneshot(client->ctx, vitastor_bh_send_all, client);
#elif QEMU_VERSION_MAJOR >= 2
VitastorBH *vbh = (VitastorBH*)malloc(sizeof(VitastorBH));
vbh->cli = client;
#if QEMU_VERSION_MAJOR >= 2
vbh->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_bh_uring_handler, vbh);
#else
vbh->bh = qemu_bh_new(vitastor_bh_uring_handler, vbh);
#endif
vbh->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_bh_send_all, vbh);
qemu_bh_schedule(vbh->bh);
#else
client->bh_send_all_scheduled = 0;
vitastor_c_qemu_send_all(client->proxy);
#endif
}
}
#else
static void vitastor_schedule_uring_handler(VitastorClient *client)
{
}
#endif
}
static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
{
@@ -290,7 +283,7 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
qemu_mutex_lock(&client->mutex);
vitastor_c_watch_inode(client->proxy, client->image, vitastor_co_generic_cb, task);
vitastor_schedule_uring_handler(client);
vitastor_schedule_send_all(client);
qemu_mutex_unlock(&client->mutex);
while (!task->complete)
@@ -304,7 +297,7 @@ static void vitastor_aio_fd_read(void *fddv)
VitastorFdData *fdd = (VitastorFdData*)fddv;
qemu_mutex_lock(&fdd->cli->mutex);
fdd->fd_read(fdd->opaque);
vitastor_schedule_uring_handler(fdd->cli);
vitastor_schedule_send_all(fdd->cli);
qemu_mutex_unlock(&fdd->cli->mutex);
}
@@ -313,30 +306,10 @@ static void vitastor_aio_fd_write(void *fddv)
VitastorFdData *fdd = (VitastorFdData*)fddv;
qemu_mutex_lock(&fdd->cli->mutex);
fdd->fd_write(fdd->opaque);
vitastor_schedule_uring_handler(fdd->cli);
vitastor_schedule_send_all(fdd->cli);
qemu_mutex_unlock(&fdd->cli->mutex);
}
static void universal_aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque)
{
aio_set_fd_handler(ctx, fd,
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3
0 /*is_external*/,
#endif
fd_read,
fd_write,
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
NULL /*io_flush*/,
#endif
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
NULL /*io_poll*/,
#endif
#if QEMU_VERSION_MAJOR >= 7
NULL /*io_poll_ready*/,
#endif
opaque);
}
static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
{
VitastorClient *client = (VitastorClient*)vcli;
@@ -379,9 +352,22 @@ static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandl
}
client->fds[client->fd_count++] = fdd;
}
universal_aio_set_fd_handler(
client->ctx, fd, fd_read ? vitastor_aio_fd_read : NULL, fd_write ? vitastor_aio_fd_write : NULL, fdd
);
aio_set_fd_handler(client->ctx, fd,
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3
0 /*is_external*/,
#endif
fd_read ? vitastor_aio_fd_read : NULL,
fd_write ? vitastor_aio_fd_write : NULL,
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
NULL /*io_flush*/,
#endif
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
NULL /*io_poll*/,
#endif
#if QEMU_VERSION_MAJOR >= 7
NULL /*io_poll_ready*/,
#endif
fdd);
}
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
@@ -402,35 +388,10 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
client->ctx = bdrv_get_aio_context(bs);
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
client->proxy = vitastor_c_create_qemu_uring(
client->proxy = vitastor_c_create_qemu(
vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
);
if (!client->proxy)
{
fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
client->uring_eventfd = -1;
#endif
client->proxy = vitastor_c_create_qemu(
vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
);
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
}
else
{
client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy);
if (client->uring_eventfd < 0)
{
fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno));
error_setg(errp, "failed to create io_uring eventfd");
vitastor_close(bs);
return -1;
}
universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
}
#endif
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
// Get image metadata (size and readonly flag) or just wait until the client is ready
@@ -662,8 +623,7 @@ static void vitastor_co_generic_cb(void *opaque, long retval)
task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
#else
task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
vitastor_co_generic_bh_cb(opaque);
#endif
}
@@ -688,7 +648,7 @@ static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
qemu_mutex_lock(&client->mutex);
vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
vitastor_schedule_uring_handler(client);
vitastor_schedule_send_all(client);
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
@@ -722,7 +682,7 @@ static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
qemu_mutex_lock(&client->mutex);
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
vitastor_schedule_uring_handler(client);
vitastor_schedule_send_all(client);
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
@@ -740,6 +700,7 @@ static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitma
VitastorRPC *task = opaque;
VitastorClient *client = task->bs->opaque;
task->ret = retval;
task->complete = 1;
if (retval >= 0)
{
task->bitmap = bitmap;
@@ -751,17 +712,15 @@ static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitma
client->last_bitmap = bitmap;
}
}
#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
#elif QEMU_VERSION_MAJOR >= 2
task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
if (qemu_coroutine_self() != task->co)
{
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
aio_co_wake(task->co);
#else
task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
qemu_coroutine_enter(task->co, NULL);
qemu_aio_release(task);
#endif
}
}
static int coroutine_fn vitastor_co_block_status(
@@ -802,7 +761,7 @@ static int coroutine_fn vitastor_co_block_status(
task.bitmap = client->last_bitmap = NULL;
qemu_mutex_lock(&client->mutex);
vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
vitastor_schedule_uring_handler(client);
vitastor_schedule_send_all(client);
qemu_mutex_unlock(&client->mutex);
while (!task.complete)
{
@@ -889,7 +848,7 @@ static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
qemu_mutex_lock(&client->mutex);
vitastor_c_sync(client->proxy, vitastor_co_generic_cb, &task);
vitastor_schedule_uring_handler(client);
vitastor_schedule_send_all(client);
qemu_mutex_unlock(&client->mutex);
while (!task.complete)

View File

@@ -2,12 +2,9 @@
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <stdlib.h>
#include <unistd.h>
#include <stdexcept>
#include <sys/eventfd.h>
#include "ringloop.h"
ring_loop_t::ring_loop_t(int qd)
@@ -35,10 +32,6 @@ ring_loop_t::~ring_loop_t()
free(free_ring_data);
free(ring_datas);
io_uring_queue_exit(&ring);
if (ring_eventfd)
{
close(ring_eventfd);
}
}
void ring_loop_t::register_consumer(ring_consumer_t *consumer)
@@ -66,16 +59,6 @@ void ring_loop_t::unregister_consumer(ring_consumer_t *consumer)
void ring_loop_t::loop()
{
if (ring_eventfd >= 0)
{
// Reset eventfd counter
uint64_t ctr = 0;
int r = read(ring_eventfd, &ctr, 8);
if (r < 0 && errno != EAGAIN && errno != EINTR)
{
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
}
}
struct io_uring_cqe *cqe;
while (!io_uring_peek_cqe(&ring, &cqe))
{
@@ -94,7 +77,7 @@ void ring_loop_t::loop()
}
else
{
fprintf(stderr, "Warning: empty callback in SQE\n");
printf("Warning: empty callback in SQE\n");
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
}
io_uring_cqe_seen(&ring, cqe);
@@ -144,24 +127,3 @@ int ring_loop_t::sqes_left()
}
return left;
}
int ring_loop_t::register_eventfd()
{
if (ring_eventfd >= 0)
{
return ring_eventfd;
}
ring_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
if (ring_eventfd < 0)
{
return -errno;
}
int r = io_uring_register_eventfd(&ring, ring_eventfd);
if (r < 0)
{
close(ring_eventfd);
ring_eventfd = -1;
return r;
}
return ring_eventfd;
}

View File

@@ -126,13 +126,11 @@ class ring_loop_t
unsigned free_ring_data_ptr;
bool loop_again;
struct io_uring ring;
int ring_eventfd = -1;
public:
ring_loop_t(int qd);
~ring_loop_t();
void register_consumer(ring_consumer_t *consumer);
void unregister_consumer(ring_consumer_t *consumer);
int register_eventfd();
inline struct io_uring_sqe* get_sqe()
{

View File

@@ -308,19 +308,3 @@ std::string str_repeat(const std::string & str, int times)
r += str;
return r;
}
size_t utf8_length(const std::string & s)
{
size_t len = 0;
for (size_t i = 0; i < s.size(); i++)
len += (s[i] & 0xC0) != 0x80;
return len;
}
size_t utf8_length(const char *s)
{
size_t len = 0;
for (; *s; s++)
len += (*s & 0xC0) != 0x80;
return len;
}

View File

@@ -18,5 +18,3 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
uint64_t parse_time(std::string time_str, bool *ok = NULL);
std::string read_all_fd(int fd);
std::string str_repeat(const std::string & str, int times);
size_t utf8_length(const std::string & s);
size_t utf8_length(const char *s);

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.9.6
Version: 0.9.2
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -5,7 +5,6 @@
// Also acts as a C-C++ proxy for the QEMU driver (QEMU headers don't compile with g++)
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include "ringloop.h"
#include "epoll_manager.h"
@@ -26,7 +25,6 @@ struct vitastor_c
epoll_manager_t *epmgr = NULL;
timerfd_manager_t *tfd = NULL;
cluster_client_t *cli = NULL;
int uring_eventfd = -1;
QEMUSetFDHandler *aio_set_fd_handler = NULL;
void *aio_ctx = NULL;
@@ -72,8 +70,14 @@ static void vitastor_c_write_handler(void *opaque)
data->callback(data->fd, EPOLLOUT);
}
static vitastor_c *vitastor_c_create_qemu_common(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context)
vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
{
json11::Json cfg_json = vitastor_c_common_config(
config_path, etcd_host, etcd_prefix, use_rdma,
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
);
vitastor_c *self = new vitastor_c;
self->aio_set_fd_handler = aio_set_fd_handler;
self->aio_ctx = aio_context;
@@ -91,77 +95,24 @@ static vitastor_c *vitastor_c_create_qemu_common(QEMUSetFDHandler *aio_set_fd_ha
self->aio_set_fd_handler(self->aio_ctx, fd, false, NULL, NULL, NULL, NULL);
}
});
return self;
}
vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
{
json11::Json cfg_json = vitastor_c_common_config(
config_path, etcd_host, etcd_prefix, use_rdma,
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
);
auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
self->cli = new cluster_client_t(NULL, self->tfd, cfg_json);
return self;
}
vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
{
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(512);
}
catch (std::exception & e)
{
return NULL;
}
json11::Json cfg_json = vitastor_c_common_config(
config_path, etcd_host, etcd_prefix, use_rdma,
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
);
auto self = vitastor_c_create_qemu_common(aio_set_fd_handler, aio_context);
self->ringloop = ringloop;
self->cli = new cluster_client_t(self->ringloop, self->tfd, cfg_json);
return self;
}
vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level)
{
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(512);
}
catch (std::exception & e)
{
return NULL;
}
json11::Json cfg_json = vitastor_c_common_config(
config_path, etcd_host, etcd_prefix, use_rdma,
rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu, log_level
);
vitastor_c *self = new vitastor_c;
self->ringloop = ringloop;
self->ringloop = new ring_loop_t(512);
self->epmgr = new epoll_manager_t(self->ringloop);
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
return self;
}
int vitastor_c_uring_register_eventfd(vitastor_c *client)
{
if (!client->ringloop)
{
return -EINVAL;
}
return client->ringloop->register_eventfd();
}
vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
{
json11::Json::object cfg;
@@ -215,9 +166,12 @@ void vitastor_c_uring_wait_events(vitastor_c *client)
client->ringloop->wait();
}
int vitastor_c_uring_has_work(vitastor_c *client)
void vitastor_c_qemu_send_all(vitastor_c *client)
{
return client->ringloop->has_work();
while (client->cli->msgr.can_send())
{
client->cli->msgr.send_replies();
}
}
void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,

View File

@@ -34,19 +34,15 @@ typedef void QEMUSetFDHandler(void *ctx, int fd, int is_external, IOHandler *fd_
vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,
const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len);
void vitastor_c_destroy(vitastor_c *client);
int vitastor_c_is_ready(vitastor_c *client);
int vitastor_c_uring_register_eventfd(vitastor_c *client);
void vitastor_c_uring_wait_ready(vitastor_c *client);
void vitastor_c_uring_handle_events(vitastor_c *client);
void vitastor_c_uring_wait_events(vitastor_c *client);
int vitastor_c_uring_has_work(vitastor_c *client);
void vitastor_c_qemu_send_all(vitastor_c *client);
void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,

View File

@@ -82,9 +82,7 @@ wait_up()
done
}
if [[ $OSD_COUNT -gt 0 ]]; then
wait_up 60
fi
wait_up 60
try_reweight()
{