Compare commits

..

4 Commits

Author SHA1 Message Date
1f6c4c79d6 vmsplice+splice experiment in stub_osd to test it too 2021-11-22 01:20:12 +03:00
4936c42132 Splice via io_uring - bad result too
40% CPU according to perf is lost inside do_splice() -> unix_stream_sendpage()
without io_uring and in various exc_page_fault() with io_uring
2021-11-22 00:13:27 +03:00
6c3248a36c Experiment: vmsplice+splice "zero-copy" read in NBD 2021-11-22 00:12:39 +03:00
a863013cb2 Add a patch for qemu 6.1 and replace _ with - in qemu options 2021-11-21 16:16:46 +03:00
133 changed files with 1802 additions and 5986 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor) project(vitastor)
set(VERSION "0.6.12") set(VERSION "0.6.8")
add_subdirectory(src) add_subdirectory(src)

View File

@@ -51,14 +51,13 @@ Vitastor на данный момент находится в статусе п
- Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt - Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt
- Слияние снапшотов (vitastor-cli {snap-rm,flatten,merge}) - Слияние снапшотов (vitastor-cli {snap-rm,flatten,merge})
- Консольный интерфейс для управления образами (vitastor-cli {ls,create,modify}) - Консольный интерфейс для управления образами (vitastor-cli {ls,create,modify})
- Плагин для Proxmox
## Планы развития ## Планы развития
- Поддержка удаления снапшотов (слияния слоёв) - Поддержка удаления снапшотов (слияния слоёв)
- Более корректные скрипты разметки дисков и автоматического запуска OSD - Более корректные скрипты разметки дисков и автоматического запуска OSD
- Другие инструменты администрирования - Другие инструменты администрирования
- Плагины для OpenNebula и других облачных систем - Плагины для OpenNebula, Proxmox и других облачных систем
- iSCSI-прокси - iSCSI-прокси
- Более быстрое переключение при отказах - Более быстрое переключение при отказах
- Фоновая проверка целостности без контрольных сумм (сверка реплик) - Фоновая проверка целостности без контрольных сумм (сверка реплик)
@@ -404,21 +403,12 @@ Vitastor с однопоточной NBD прокси на том же стен
в этом случае пострадает. в этом случае пострадает.
- Быстрая сеть, минимум 10 гбит/с - Быстрая сеть, минимум 10 гбит/с
- Для наилучшей производительности нужно отключить энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`. - Для наилучшей производительности нужно отключить энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- На хостах мониторов: - Пропишите нужные вам значения вверху файлов `/usr/lib/vitastor/mon/make-units.sh` и `/usr/lib/vitastor/mon/make-osd.sh`.
- Пропишите нужные вам значения в файле `/usr/lib/vitastor/mon/make-units.sh` - Создайте юниты systemd для etcd и мониторов: `/usr/lib/vitastor/mon/make-units.sh`
- Создайте юниты systemd для etcd и мониторов: `/usr/lib/vitastor/mon/make-units.sh` - Создайте юниты для OSD: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
- Пропишите etcd_address и osd_network в `/etc/vitastor/vitastor.conf`. Например: - Вы можете поменять параметры OSD в юнитах systemd. Смысл некоторых параметров:
```
{
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
"osd_network": "10.200.1.0/24"
}
```
- Создайте юниты systemd для OSD: `/usr/lib/vitastor/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
- Вы можете менять параметры OSD в юнитах systemd или в `vitastor.conf`. Смысл некоторых параметров:
- `disable_data_fsync 1` - отключает fsync, используется с SSD с конденсаторами. - `disable_data_fsync 1` - отключает fsync, используется с SSD с конденсаторами.
- `immediate_commit all` - используется с SSD с конденсаторами. - `immediate_commit all` - используется с SSD с конденсаторами.
Внимание: если установлено, также нужно установить его в то же значение в etcd в /vitastor/config/global
- `disable_device_lock 1` - отключает блокировку файла устройства, нужно, только если вы запускаете - `disable_device_lock 1` - отключает блокировку файла устройства, нужно, только если вы запускаете
несколько OSD на одном блочном устройстве. несколько OSD на одном блочном устройстве.
- `flusher_count 256` - "flusher" - микропоток, удаляющий старые данные из журнала. - `flusher_count 256` - "flusher" - микропоток, удаляющий старые данные из журнала.
@@ -538,75 +528,6 @@ for i in ./???-*.yaml; do kubectl apply -f $i; done
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](csi/deploy/example-pvc.yaml). После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](csi/deploy/example-pvc.yaml).
### OpenStack
Чтобы подключить Vitastor к OpenStack:
- Установите пакеты vitastor-client, libvirt и QEMU из DEB или RPM репозитория Vitastor
- Примените патч `patches/nova-21.diff` или `patches/nova-23.diff` к вашей инсталляции Nova.
nova-21.diff подходит для Nova 21-22, nova-23.diff подходит для Nova 23-24.
- Скопируйте `patches/cinder-vitastor.py` в инсталляцию Cinder как `cinder/volume/drivers/vitastor.py`
- Создайте тип томов в cinder.conf (см. ниже)
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
- Перезапустите Cinder и Nova
Пример конфигурации Cinder:
```
[DEFAULT]
enabled_backends = lvmdriver-1, vitastor-testcluster
# ...
[vitastor-testcluster]
volume_driver = cinder.volume.drivers.vitastor.VitastorDriver
volume_backend_name = vitastor-testcluster
image_volume_cache_enabled = True
volume_clear = none
vitastor_etcd_address = 192.168.7.2:2379
vitastor_etcd_prefix =
vitastor_config_path = /etc/vitastor/vitastor.conf
vitastor_pool_id = 1
image_upload_use_cinder_backend = True
```
Чтобы помещать в Vitastor Glance-образы, нужно использовать
[https://docs.openstack.org/cinder/pike/admin/blockstorage-volume-backed-image.html](образы на основе томов Cinder),
однако, поддержка этой функции ещё не проверялась.
### Proxmox
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4 и 7.1):
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox
(buster для 6.4, bullseye для 7.1)
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
- Перезапустите демон Proxmox: `systemctl restart pvedaemon`
Пример `/etc/pve/storage.cfg` (единственная обязательная опция - vitastor_pool, все остальные
перечислены внизу для понимания значений по умолчанию):
```
vitastor: vitastor
# Пул, в который будут помещаться образы дисков
vitastor_pool testpool
# Путь к файлу конфигурации
vitastor_config_path /etc/vitastor/vitastor.conf
# Адрес(а) etcd, нужны, только если не указаны в vitastor.conf
vitastor_etcd_address 192.168.7.2:2379/v3
# Префикс ключей метаданных в etcd
vitastor_etcd_prefix /vitastor
# Префикс имён образов
vitastor_prefix pve/
# Монтировать образы через NBD прокси, через ядро (нужно только для контейнеров)
vitastor_nbd 0
```
\* Примечание: вместо установки пакета pve-storage-vitastor вы можете вручную скопировать файл
[patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) на хосты Proxmox как
`/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm`.
## Известные проблемы ## Известные проблемы
- Запросы удаления объектов могут в данный момент приводить к "неполным" объектам в EC-пулах, - Запросы удаления объектов могут в данный момент приводить к "неполным" объектам в EC-пулах,

View File

@@ -45,7 +45,6 @@ breaking changes in the future. However, the following is implemented:
- Basic OpenStack support: Cinder driver, Nova and libvirt patches - Basic OpenStack support: Cinder driver, Nova and libvirt patches
- Snapshot merge tool (vitastor-cli {snap-rm,flatten,merge}) - Snapshot merge tool (vitastor-cli {snap-rm,flatten,merge})
- Image management CLI (vitastor-cli {ls,create,modify}) - Image management CLI (vitastor-cli {ls,create,modify})
- Proxmox storage plugin
## Roadmap ## Roadmap
@@ -357,21 +356,13 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
with lazy fsync, but prepare for inferior single-thread latency. with lazy fsync, but prepare for inferior single-thread latency.
- Get a fast network (at least 10 Gbit/s). - Get a fast network (at least 10 Gbit/s).
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`. - Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- On the monitor hosts: - Check `/usr/lib/vitastor/mon/make-units.sh` and `/usr/lib/vitastor/mon/make-osd.sh` and
- Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values. put desired values into the variables at the top of these files.
- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh` - Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh`
- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example:
```
{
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"],
"osd_network": "10.200.1.0/24"
}
```
- Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]` - Create systemd units for your OSDs: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]`
- You can change OSD configuration in units or in `vitastor.conf`. Notable configuration variables: - You can edit the units and change OSD configuration. Notable configuration variables:
- `disable_data_fsync 1` - only safe with server-grade drives with capacitors. - `disable_data_fsync 1` - only safe with server-grade drives with capacitors.
- `immediate_commit all` - use this if all your drives are server-grade. - `immediate_commit all` - use this if all your drives are server-grade.
If all OSDs have it set to all then you should also put the same value in etcd into /vitastor/config/global
- `disable_device_lock 1` - only required if you run multiple OSDs on one block device. - `disable_device_lock 1` - only required if you run multiple OSDs on one block device.
- `flusher_count 256` - flusher is a micro-thread that removes old data from the journal. - `flusher_count 256` - flusher is a micro-thread that removes old data from the journal.
You don't have to worry about this parameter anymore, 256 is enough. You don't have to worry about this parameter anymore, 256 is enough.
@@ -487,73 +478,6 @@ for i in ./???-*.yaml; do kubectl apply -f $i; done
After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](csi/deploy/example-pvc.yaml). After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](csi/deploy/example-pvc.yaml).
### OpenStack
To enable Vitastor support in an OpenStack installation:
- Install vitastor-client, patched QEMU and libvirt packages from Vitastor DEB or RPM repository
- Use `patches/nova-21.diff` or `patches/nova-23.diff` to patch your Nova installation.
Patch 21 fits Nova 21-22, patch 23 fits Nova 23-24.
- Install `patches/cinder-vitastor.py` as `..../cinder/volume/drivers/vitastor.py`
- Define a volume type in cinder.conf (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd), because Vitastor doesn't support authentication (yet)
- Restart Cinder and Nova
Cinder volume type configuration example:
```
[DEFAULT]
enabled_backends = lvmdriver-1, vitastor-testcluster
# ...
[vitastor-testcluster]
volume_driver = cinder.volume.drivers.vitastor.VitastorDriver
volume_backend_name = vitastor-testcluster
image_volume_cache_enabled = True
volume_clear = none
vitastor_etcd_address = 192.168.7.2:2379
vitastor_etcd_prefix =
vitastor_config_path = /etc/vitastor/vitastor.conf
vitastor_pool_id = 1
image_upload_use_cinder_backend = True
```
To put Glance images in Vitastor, use [https://docs.openstack.org/cinder/pike/admin/blockstorage-volume-backed-image.html](volume-backed images),
although the support has not been verified yet.
### Proxmox
To enable Vitastor support in Proxmox Virtual Environment (6.4 and 7.1 are supported):
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts
(buster for 6.4, bullseye for 7.1)
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd), because Vitastor doesn't support authentication (yet)
- Restart pvedaemon: `systemctl restart pvedaemon`
`/etc/pve/storage.cfg` example (the only required option is vitastor_pool, all others
are listed below with their default values):
```
vitastor: vitastor
# pool to put new images into
vitastor_pool testpool
# path to the configuration file
vitastor_config_path /etc/vitastor/vitastor.conf
# etcd address(es), required only if missing in the configuration file
vitastor_etcd_address 192.168.7.2:2379/v3
# prefix for keys in etcd
vitastor_etcd_prefix /vitastor
# prefix for images
vitastor_prefix pve/
# use NBD mounter (only required for containers)
vitastor_nbd 0
```
\* Note: you can also manually copy [patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) to Proxmox hosts
as `/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm` instead of installing pve-storage-vitastor.
## Known Problems ## Known Problems
- Object deletion requests may currently lead to 'incomplete' objects in EC pools - Object deletion requests may currently lead to 'incomplete' objects in EC pools

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.6.12 VERSION ?= v0.6.8
all: build push all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.6.12 image: vitalif/vitastor-csi:v0.6.8
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true privileged: true
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.6.12 image: vitalif/vitastor-csi:v0.6.8
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const ( const (
vitastorCSIDriverName = "csi.vitastor.io" vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.6.12" vitastorCSIDriverVersion = "0.6.8"
) )
// Config struct fills the parameters of request or user input // Config struct fills the parameters of request or user input

2
debian/changelog vendored
View File

@@ -1,4 +1,4 @@
vitastor (0.6.12-1) unstable; urgency=medium vitastor (0.6.8-1) unstable; urgency=medium
* RDMA support * RDMA support
* Bugfixes * Bugfixes

10
debian/control vendored
View File

@@ -9,7 +9,7 @@ Rules-Requires-Root: no
Package: vitastor Package: vitastor
Architecture: amd64 Architecture: amd64
Depends: vitastor-osd, vitastor-mon, vitastor-client, vitastor-client-dev, vitastor-fio Depends: vitastor-osd, vitastor-mon, vitastor-client, vitastor-client-dev, vitastor-fio, vitastor-qemu
Description: Vitastor, a fast software-defined clustered block storage Description: Vitastor, a fast software-defined clustered block storage
Vitastor is a small, simple and fast clustered block storage (storage for VM drives), Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph which means strong consistency, primary-replication, architecturally similar to Ceph which means strong consistency, primary-replication,
@@ -48,8 +48,8 @@ Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client (= ${binary:Version
Description: Vitastor, a fast software-defined clustered block storage - fio drivers Description: Vitastor, a fast software-defined clustered block storage - fio drivers
Vitastor fio drivers for benchmarking. Vitastor fio drivers for benchmarking.
Package: pve-storage-vitastor Package: vitastor-qemu
Architecture: amd64 Architecture: amd64
Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client (= ${binary:Version}) Depends: ${shlibs:Depends}, ${misc:Depends}, vitastor-client (= ${binary:Version}), qemu (= ${dep:qemu})
Description: Vitastor Proxmox Virtual Environment storage plugin Description: Vitastor, a fast software-defined clustered block storage - QEMU driver
Vitastor storage plugin for Proxmox Virtual Environment. Vitastor QEMU block device driver.

View File

@@ -1,40 +0,0 @@
# Build patched libvirt for Debian Buster or Bullseye/Sid inside a container
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .
ARG REL=
FROM debian:$REL
ARG REL=
WORKDIR /root
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update; apt-get -y install devscripts
RUN apt-get -y build-dep libvirt0
RUN apt-get -y install libglusterfs-dev
RUN apt-get --download-only source libvirt
ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff /root
RUN set -e; \
mkdir -p /root/packages/libvirt-$REL; \
rm -rf /root/packages/libvirt-$REL/*; \
cd /root/packages/libvirt-$REL; \
dpkg-source -x /root/libvirt*.dsc; \
D=$(ls -d libvirt-*/); \
V=$(ls -d libvirt-*/ | perl -pe 's/libvirt-(\d+\.\d+).*/$1/'); \
cp /root/libvirt-$V-vitastor.diff $D/debian/patches; \
echo libvirt-$V-vitastor.diff >> $D/debian/patches/series; \
cd $D; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?(\+deb[u\d]+)?\).*$/$1/')+vitastor2; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Add Vitastor support'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/libvirt-$REL/$D

View File

@@ -21,18 +21,14 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
RUN apt-get update RUN apt-get update
RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio
# To build a custom version # To build a custom version
#RUN cp /root/packages/qemu-orig/* /root #RUN cp /root/packages/qemu-orig/* /root
RUN apt-get --download-only source qemu RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
ADD patches/qemu-5.0-vitastor.patch patches/qemu-5.1-vitastor.patch patches/qemu-6.1-vitastor.patch src/qemu_driver.c /root/vitastor/patches/ ADD patches/qemu-5.0-vitastor.patch patches/qemu-5.1-vitastor.patch patches/qemu-6.1-vitastor.patch /root/vitastor/patches/
RUN set -e; \ RUN set -e; \
apt-get install -y wget; \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
(echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
apt-get update; \
apt-get install -y vitastor-client vitastor-client-dev quilt; \
mkdir -p /root/packages/qemu-$REL; \ mkdir -p /root/packages/qemu-$REL; \
rm -rf /root/packages/qemu-$REL/*; \ rm -rf /root/packages/qemu-$REL/*; \
cd /root/packages/qemu-$REL; \ cd /root/packages/qemu-$REL; \
@@ -51,11 +47,7 @@ RUN set -e; \
echo qemu-5.1-vitastor.patch >> $P/series; \ echo qemu-5.1-vitastor.patch >> $P/series; \
fi; \ fi; \
cd /root/packages/qemu-$REL/qemu-*/; \ cd /root/packages/qemu-$REL/qemu-*/; \
quilt push -a; \
quilt add block/vitastor.c; \
cp /root/vitastor/patches/qemu_driver.c block/vitastor.c; \
quilt refresh; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \ DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/qemu-$REL/qemu-*/ rm -rf /root/packages/qemu-$REL/qemu-*/

View File

@@ -1 +0,0 @@
patches/PVE_VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm

1
debian/qemu_version vendored Normal file
View File

@@ -0,0 +1 @@
dep:qemu=1:5.2+dfsg-10+vitastor1

19
debian/raw.h vendored
View File

@@ -1,19 +0,0 @@
/* Removed in Linux 5.14 */
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef __LINUX_RAW_H
#define __LINUX_RAW_H
#include <linux/types.h>
#define RAW_SETBIND _IO( 0xac, 0 )
#define RAW_GETBIND _IO( 0xac, 1 )
struct raw_config_request
{
int raw_minor;
__u64 block_major;
__u64 block_minor;
};
#endif /* __LINUX_RAW_H */

2
debian/rules vendored
View File

@@ -6,5 +6,5 @@ export DH_VERBOSE = 1
override_dh_installdeb: override_dh_installdeb:
cat debian/fio_version >> debian/vitastor-fio.substvars cat debian/fio_version >> debian/vitastor-fio.substvars
[ -f debian/qemu_version ] && (cat debian/qemu_version >> debian/vitastor-qemu.substvars) || true cat debian/qemu_version >> debian/vitastor-qemu.substvars
dh_installdeb dh_installdeb

View File

@@ -3,4 +3,3 @@ usr/bin/vitastor-cli
usr/bin/vitastor-rm usr/bin/vitastor-rm
usr/bin/vitastor-nbd usr/bin/vitastor-nbd
usr/lib/*/libvitastor*.so* usr/lib/*/libvitastor*.so*
mon/make-osd.sh /usr/lib/vitastor

View File

@@ -1,2 +1,3 @@
usr/bin/vitastor-osd usr/bin/vitastor-osd
usr/bin/vitastor-dump-journal usr/bin/vitastor-dump-journal
mon/make-osd.sh /usr/lib/vitastor

1
debian/vitastor-qemu.install vendored Normal file
View File

@@ -0,0 +1 @@
usr/lib/*/qemu/*

View File

@@ -7,11 +7,11 @@ ARG REL=
WORKDIR /root WORKDIR /root
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \ RUN if [ "$REL" = "buster" ]; then \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \ echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \ echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \ echo 'Package: *' >> /etc/apt/preferences; \
echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \ echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \ echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \ fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \ grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
@@ -19,8 +19,10 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update RUN apt-get update
RUN apt-get -y install fio liburing1 liburing-dev libgoogle-perftools-dev devscripts RUN apt-get -y install qemu fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep qemu
RUN apt-get -y build-dep fio RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio RUN apt-get --download-only source fio
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev
@@ -30,25 +32,37 @@ RUN set -e -x; \
cd /root/fio-build/; \ cd /root/fio-build/; \
rm -rf /root/fio-build/*; \ rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \ dpkg-source -x /root/fio*.dsc; \
cd /root/packages/qemu-$REL/; \
rm -rf qemu*/; \
dpkg-source -x qemu*.dsc; \
cd /root/packages/qemu-$REL/qemu*/; \
debian/rules b/configure-stamp; \
cd b/qemu; \
make -j8 qapi/qapi-builtin-types.h; \
mkdir -p /root/packages/vitastor-$REL; \ mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \ rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \ cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.6.12; \ cp -r /root/vitastor vitastor-0.6.8; \
cd vitastor-0.6.12; \ ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.8/qemu; \
ln -s /root/fio-build/fio-*/ ./fio; \ ln -s /root/fio-build/fio-*/ vitastor-0.6.8/fio; \
cd vitastor-0.6.8; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh copy-qemu-includes.sh; \
sh copy-fio-includes.sh; \ sh copy-fio-includes.sh; \
rm fio; \ rm qemu fio; \
mkdir -p a b debian/patches; \ mkdir -p a b debian/patches; \
mv qemu-copy b/qemu; \
mv fio-copy b/fio; \ mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/fio-headers.patch || true; \ diff -NaurpbB a b > debian/patches/qemu-fio-headers.patch || true; \
echo fio-headers.patch >> debian/patches/series; \ echo qemu-fio-headers.patch >> debian/patches/series; \
rm -rf a b; \ rm -rf a b; \
rm -rf /root/packages/qemu-$REL/qemu*/; \
echo "dep:fio=$FIO" > debian/fio_version; \ echo "dep:fio=$FIO" > debian/fio_version; \
echo "dep:qemu=$QEMU" > debian/qemu_version; \
cd /root/packages/vitastor-$REL; \ cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.12.orig.tar.xz vitastor-0.6.12; \ tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.8.orig.tar.xz vitastor-0.6.8; \
cd vitastor-0.6.12; \ cd vitastor-0.6.8; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

2
json11

Submodule json11 updated: 52a3af664f...55363fc265

View File

@@ -50,7 +50,7 @@ async function lp_solve(text)
return { score, vars }; return { score, vars };
} }
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false }) async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, round_robin = false })
{ {
if (!pg_count || !osd_tree) if (!pg_count || !osd_tree)
{ {
@@ -92,7 +92,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
console.log(lp); console.log(lp);
throw new Error('Problem is infeasible or unbounded - is it a bug?'); throw new Error('Problem is infeasible or unbounded - is it a bug?');
} }
const int_pgs = make_int_pgs(lp_result.vars, pg_count, ordered); const int_pgs = make_int_pgs(lp_result.vars, pg_count, round_robin);
const eff = pg_list_space_efficiency(int_pgs, all_weights, pg_minsize, parity_space); const eff = pg_list_space_efficiency(int_pgs, all_weights, pg_minsize, parity_space);
const res = { const res = {
score: lp_result.score, score: lp_result.score,
@@ -140,20 +140,20 @@ function make_int_pgs(weights, pg_count, round_robin)
return int_pgs; return int_pgs;
} }
function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, all_pgs, ordered) function calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs)
{ {
const move_weights = {}; const move_weights = {};
if ((1 << old_pg_size) < pg_count) if ((1 << pg_size) < pg_count)
{ {
const intersect = {}; const intersect = {};
for (const pg_name in prev_weights) for (const pg_name in prev_weights)
{ {
const pg = pg_name.substr(3).split(/_/); const pg = pg_name.substr(3).split(/_/);
for (let omit = 1; omit < (1 << old_pg_size); omit++) for (let omit = 1; omit < (1 << pg_size); omit++)
{ {
let pg_omit = [ ...pg ]; let pg_omit = [ ...pg ];
let intersect_count = old_pg_size; let intersect_count = pg_size;
for (let i = 0; i < old_pg_size; i++) for (let i = 0; i < pg_size; i++)
{ {
if (omit & (1 << i)) if (omit & (1 << i))
{ {
@@ -161,8 +161,6 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
intersect_count--; intersect_count--;
} }
} }
if (!ordered)
pg_omit = pg_omit.filter(n => n).sort();
pg_omit = pg_omit.join(':'); pg_omit = pg_omit.join(':');
intersect[pg_omit] = Math.max(intersect[pg_omit] || 0, intersect_count); intersect[pg_omit] = Math.max(intersect[pg_omit] || 0, intersect_count);
} }
@@ -176,10 +174,10 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
for (let i = 0; i < pg_size; i++) for (let i = 0; i < pg_size; i++)
{ {
if (omit & (1 << i)) if (omit & (1 << i))
{
pg_omit[i] = ''; pg_omit[i] = '';
}
} }
if (!ordered)
pg_omit = pg_omit.filter(n => n).sort();
pg_omit = pg_omit.join(':'); pg_omit = pg_omit.join(':');
max_int = Math.max(max_int, intersect[pg_omit] || 0); max_int = Math.max(max_int, intersect[pg_omit] || 0);
} }
@@ -188,18 +186,15 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
} }
else else
{ {
const prev_pg_hashed = Object.keys(prev_weights).map(pg_name => pg_name const prev_pg_hashed = Object.keys(prev_weights).map(pg_name => pg_name.substr(3).split(/_/).reduce((a, c) => { a[c] = 1; return a; }, {}));
.substr(3).split(/_/).reduce((a, c, i) => { a[c] = i+1; return a; }, {}));
for (const pg of all_pgs) for (const pg of all_pgs)
{ {
if (!prev_weights['pg_'+pg.join('_')]) if (!prev_weights['pg_'+pg.join('_')])
{ {
let max_int = 0; let max_int = 0;
for (const prev_hash of prev_pg_hashed) for (const prev_hash in prev_pg_hashed)
{ {
const intersect_count = ordered const intersect_count = pg.reduce((a, osd) => a + (prev_hash[osd] ? 1 : 0), 0);
? pg.reduce((a, osd, i) => a + (prev_hash[osd] == 1+i ? 1 : 0), 0)
: pg.reduce((a, osd, i) => a + (prev_hash[osd] ? 1 : 0), 0);
if (max_int < intersect_count) if (max_int < intersect_count)
{ {
max_int = intersect_count; max_int = intersect_count;
@@ -248,7 +243,7 @@ function add_valid_previous(osd_tree, prev_weights, all_pgs)
} }
// Try to minimize data movement // Try to minimize data movement
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false }) async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1 })
{ {
if (!osd_tree) if (!osd_tree)
{ {
@@ -271,13 +266,9 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
prev_pg_per_osd[osd].push([ pg_name, (i >= pg_minsize ? parity_space : 1) ]); prev_pg_per_osd[osd].push([ pg_name, (i >= pg_minsize ? parity_space : 1) ]);
} }
} }
const old_pg_size = prev_int_pgs[0].length;
// Get all combinations // Get all combinations
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1); let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
if (old_pg_size == pg_size) add_valid_previous(osd_tree, prev_weights, all_pgs);
{
add_valid_previous(osd_tree, prev_weights, all_pgs);
}
all_pgs = Object.values(all_pgs); all_pgs = Object.values(all_pgs);
const pg_per_osd = {}; const pg_per_osd = {};
for (const pg of all_pgs) for (const pg of all_pgs)
@@ -291,7 +282,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
} }
} }
// Penalize PGs based on their similarity to old PGs // Penalize PGs based on their similarity to old PGs
const move_weights = calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, all_pgs, ordered); const move_weights = calc_intersect_weights(pg_size, pg_count, prev_weights, all_pgs);
// Calculate total weight - old PG weights // Calculate total weight - old PG weights
const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_')); const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
const all_pgs_hash = all_pg_names.reduce((a, c) => { a[c] = true; return a; }, {}); const all_pgs_hash = all_pg_names.reduce((a, c) => { a[c] = true; return a; }, {});
@@ -302,6 +293,11 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
lp += 'max: '+all_pg_names.map(pg_name => ( lp += 'max: '+all_pg_names.map(pg_name => (
prev_weights[pg_name] ? `${pg_size+1}*add_${pg_name} - ${pg_size+1}*del_${pg_name}` : `${pg_size+1-move_weights[pg_name]}*${pg_name}` prev_weights[pg_name] ? `${pg_size+1}*add_${pg_name} - ${pg_size+1}*del_${pg_name}` : `${pg_size+1-move_weights[pg_name]}*${pg_name}`
)).join(' + ')+';\n'; )).join(' + ')+';\n';
lp += all_pg_names
.map(pg_name => (prev_weights[pg_name] ? `add_${pg_name} - del_${pg_name}` : `${pg_name}`))
.join(' + ')+' = '+(pg_count
- Object.keys(prev_weights).reduce((a, old_pg_name) => (a + (all_pgs_hash[old_pg_name] ? prev_weights[old_pg_name] : 0)), 0)
)+';\n';
for (const osd in pg_per_osd) for (const osd in pg_per_osd)
{ {
if (osd !== NO_OSD) if (osd !== NO_OSD)
@@ -382,35 +378,11 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
{ {
differs++; differs++;
} }
} for (let j = 0; j < pg_size; j++)
if (ordered)
{
for (let i = 0; i < pg_count; i++)
{ {
for (let j = 0; j < pg_size; j++) if (new_pgs[i][j] != prev_int_pgs[i][j])
{ {
if (new_pgs[i][j] != prev_int_pgs[i][j]) osd_differs++;
{
osd_differs++;
}
}
}
}
else
{
for (let i = 0; i < pg_count; i++)
{
const old_map = prev_int_pgs[i].reduce((a, c) => { a[c] = (a[c]|0) + 1; return a; }, {});
for (let j = 0; j < pg_size; j++)
{
if ((0|old_map[new_pgs[i][j]]) > 0)
{
old_map[new_pgs[i][j]]--;
}
else
{
osd_differs++;
}
} }
} }
} }

View File

@@ -4,16 +4,18 @@
# Copyright (c) Vitaliy Filippov, 2019+ # Copyright (c) Vitaliy Filippov, 2019+
# License: MIT # License: MIT
# USAGE: # USAGE: ./make-osd.sh /dev/disk/by-partuuid/xxx [ /dev/disk/by-partuuid/yyy]...
# 1) Put etcd_address and osd_network into /etc/vitastor/vitastor.conf. Example:
# { IP_SUBSTR="10.200.1."
# "etcd_address":["http://10.200.1.10:2379/v3","http://10.200.1.11:2379/v3","http://10.200.1.12:2379/v3"], ETCD_HOSTS="etcd0=http://10.200.1.10:2380,etcd1=http://10.200.1.11:2380,etcd2=http://10.200.1.12:2380"
# "osd_network":"10.200.1.0/24"
# }
# 2) Run ./make-osd.sh /dev/disk/by-partuuid/xxx [ /dev/disk/by-partuuid/yyy]...
set -e -x set -e -x
IP=`ip -json a s | jq -r '.[].addr_info[] | select(.local | startswith("'$IP_SUBSTR'")) | .local'`
[ "$IP" != "" ] || exit 1
ETCD_MON=$(echo $ETCD_HOSTS | perl -pe 's/:2380/:2379/g; s/etcd\d*=//g;')
D=`dirname $0`
# Create OSDs on all passed devices # Create OSDs on all passed devices
for DEV in $*; do for DEV in $*; do
@@ -37,6 +39,8 @@ LimitNOFILE=1048576
LimitNPROC=1048576 LimitNPROC=1048576
LimitMEMLOCK=infinity LimitMEMLOCK=infinity
ExecStart=/usr/bin/vitastor-osd \\ ExecStart=/usr/bin/vitastor-osd \\
--etcd_address $IP:2379/v3 \\
--bind_address $IP \\
--osd_num $OSD_NUM \\ --osd_num $OSD_NUM \\
--disable_data_fsync 1 \\ --disable_data_fsync 1 \\
--immediate_commit all \\ --immediate_commit all \\

View File

@@ -9,18 +9,17 @@ const options = {};
for (let i = 2; i < process.argv.length; i++) for (let i = 2; i < process.argv.length; i++)
{ {
if (process.argv[i] === '-h' || process.argv[i] === '--help') if (process.argv[i].substr(0, 2) == '--')
{
console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' [--verbose 1]'+
' [--etcd_address "http://127.0.0.1:2379,..."] [--config_file /etc/vitastor/vitastor.conf]'+
' [--etcd_prefix "/vitastor"] [--etcd_start_timeout 5]');
process.exit();
}
else if (process.argv[i].substr(0, 2) == '--')
{ {
options[process.argv[i].substr(2)] = process.argv[i+1]; options[process.argv[i].substr(2)] = process.argv[i+1];
i++; i++;
} }
} }
new Mon(options).start().catch(e => { console.error(e); process.exit(1); }); if (!options.etcd_url)
{
console.error('USAGE: '+process.argv[0]+' '+process.argv[1]+' --etcd_url "http://127.0.0.1:2379,..." --etcd_prefix "/vitastor" --etcd_start_timeout 5 [--verbose 1]');
process.exit();
}
new Mon(options).start().catch(e => { console.error(e); process.exit(); });

View File

@@ -1,7 +1,6 @@
// Copyright (c) Vitaliy Filippov, 2019+ // Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
const fs = require('fs');
const http = require('http'); const http = require('http');
const crypto = require('crypto'); const crypto = require('crypto');
const os = require('os'); const os = require('os');
@@ -84,14 +83,8 @@ const etcd_tree = {
osd_ping_timeout: 5, // seconds. min: 1 osd_ping_timeout: 5, // seconds. min: 1
up_wait_retry_interval: 500, // ms. min: 50 up_wait_retry_interval: 500, // ms. min: 50
// osd // osd
etcd_report_interval: 5, // seconds etcd_report_interval: 5,
max_etcd_attempts: 5,
etcd_quick_timeout: 1000, // ms
etcd_slow_timeout: 5000, // ms
etcd_keepalive_timeout: 30, // seconds, default is min(30, etcd_report_interval*2)
etcd_ws_keepalive_interval: 30, // seconds
run_primary: true, run_primary: true,
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0", bind_address: "0.0.0.0",
bind_port: 0, bind_port: 0,
autosync_interval: 5, autosync_interval: 5,
@@ -329,56 +322,24 @@ class Mon
{ {
constructor(config) constructor(config)
{ {
this.die = (e) => this._die(e); // FIXME: Maybe prefer local etcd
if (fs.existsSync(config.config_path||'/etc/vitastor/vitastor.conf')) this.etcd_urls = [];
for (let url of config.etcd_url.split(/,/))
{ {
config = { let scheme = 'http';
...JSON.parse(fs.readFileSync(config.config_path||'/etc/vitastor/vitastor.conf', { encoding: 'utf-8' })), url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
...config, if (!/\/[^\/]/.exec(url))
}; url += '/v3';
this.etcd_urls.push(scheme+'://'+url);
} }
this.parse_etcd_addresses(config.etcd_address||config.etcd_url);
this.verbose = config.verbose || 0; this.verbose = config.verbose || 0;
this.initConfig = config;
this.config = {}; this.config = {};
this.etcd_prefix = config.etcd_prefix || '/vitastor'; this.etcd_prefix = config.etcd_prefix || '/vitastor';
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1'); this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000; this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree)); this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
this.signals_set = false; this.signals_set = false;
this.ws = null; this.on_stop_cb = () => this.on_stop().catch(console.error);
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
}
parse_etcd_addresses(addrs)
{
const is_local_ip = this.local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
this.etcd_local = [];
this.etcd_urls = [];
this.selected_etcd_url = null;
this.etcd_urls_to_try = [];
if (!(addrs instanceof Array))
addrs = addrs ? (''+(addrs||'')).split(/,/) : [];
if (!addrs.length)
{
console.error('Vitastor etcd address(es) not specified. Please set on the command line or in the config file');
process.exit(1);
}
for (let url of addrs)
{
let scheme = 'http';
url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
const slash = url.indexOf('/');
const colon = url.indexOf(':');
const is_local = is_local_ip[colon >= 0 ? url.substr(0, colon) : (slash >= 0 ? url.substr(0, slash) : url)];
url = scheme+'://'+(slash >= 0 ? url : url+'/v3');
if (is_local)
this.etcd_local.push(url);
else
this.etcd_urls.push(url);
}
} }
async start() async start()
@@ -391,7 +352,7 @@ class Mon
for (const pool_id in this.state.config.pools) for (const pool_id in this.state.config.pools)
{ {
if (!this.state.pool.stats[pool_id] || if (!this.state.pool.stats[pool_id] ||
!Number(this.state.pool.stats[pool_id].pg_real_size)) !this.state.pool.stats[pool_id].pg_real_size)
{ {
// Generate missing data in etcd // Generate missing data in etcd
this.state.config.pgs.hash = null; this.state.config.pgs.hash = null;
@@ -449,43 +410,6 @@ class Mon
} }
} }
pick_next_etcd()
{
if (this.selected_etcd_url)
return this.selected_etcd_url;
if (!this.etcd_urls_to_try || !this.etcd_urls_to_try.length)
{
this.etcd_urls_to_try = [ ...this.etcd_local ];
const others = [ ...this.etcd_urls ];
while (others.length)
{
const url = others.splice(0|(others.length*Math.random()), 1);
this.etcd_urls_to_try.push(url[0]);
}
}
this.selected_etcd_url = this.etcd_urls_to_try.shift();
return this.selected_etcd_url;
}
restart_watcher(cur_addr)
{
if (this.ws)
{
this.ws.close();
this.ws = null;
}
if (this.ws_keepalive_timer)
{
clearInterval(this.ws_keepalive_timer);
this.ws_keepalive_timer = null;
}
if (this.selected_etcd_url == cur_addr)
{
this.selected_etcd_url = null;
}
this.start_watcher(this.config.etcd_mon_retries).catch(this.die);
}
async start_watcher(retries) async start_watcher(retries)
{ {
let retry = 0; let retry = 0;
@@ -495,14 +419,12 @@ class Mon
} }
while (retries < 0 || retry < retries) while (retries < 0 || retry < retries)
{ {
const cur_addr = this.pick_next_etcd(); const base = 'ws'+this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)].substr(4);
const base = 'ws'+cur_addr.substr(4);
const ok = await new Promise((ok, no) => const ok = await new Promise((ok, no) =>
{ {
const timer_id = setTimeout(() => const timer_id = setTimeout(() =>
{ {
this.ws.close(); this.ws.close();
this.ws = null;
ok(false); ok(false);
}, this.config.etcd_mon_timeout); }, this.config.etcd_mon_timeout);
this.ws = new WebSocket(base+'/watch'); this.ws = new WebSocket(base+'/watch');
@@ -520,9 +442,9 @@ class Mon
}); });
}); });
if (ok) if (ok)
{
break; break;
if (this.selected_etcd_url == cur_addr) }
this.selected_etcd_url = null;
this.ws = null; this.ws = null;
retry++; retry++;
} }
@@ -530,22 +452,6 @@ class Mon
{ {
this.die('Failed to open etcd watch websocket'); this.die('Failed to open etcd watch websocket');
} }
const cur_addr = this.selected_etcd_url;
this.ws_alive = true;
this.ws_keepalive_timer = setInterval(() =>
{
if (this.ws_alive)
{
this.ws_alive = false;
this.ws.send(JSON.stringify({ progress_request: {} }));
}
else
{
console.log('etcd websocket timed out, restarting it');
this.restart_watcher(cur_addr);
}
}, (Number(this.config.etcd_keepalive_interval) || 30)*1000);
this.ws.on('error', () => this.restart_watcher(cur_addr));
this.ws.send(JSON.stringify({ this.ws.send(JSON.stringify({
create_request: { create_request: {
key: b64(this.etcd_prefix+'/'), key: b64(this.etcd_prefix+'/'),
@@ -557,7 +463,6 @@ class Mon
})); }));
this.ws.on('message', (msg) => this.ws.on('message', (msg) =>
{ {
this.ws_alive = true;
let data; let data;
try try
{ {
@@ -566,25 +471,12 @@ class Mon
catch (e) catch (e)
{ {
} }
if (!data || !data.result) if (!data || !data.result || !data.result.events)
{ {
console.error('Unknown message received from watch websocket: '+msg); if (!data || !data.result || !data.result.watch_id)
}
else if (data.result.canceled)
{
// etcd watch canceled
if (data.result.compact_revision)
{ {
// we may miss events if we proceed console.error('Garbage received from watch websocket: '+msg);
console.error('Revisions before '+data.result.compact_revision+' were compacted by etcd, exiting');
this.on_stop(1);
} }
console.error('Watch canceled by etcd, reason: '+data.result.cancel_reason+', exiting');
this.on_stop(1);
}
else if (data.result.created)
{
// etcd watch created
} }
else else
{ {
@@ -594,7 +486,7 @@ class Mon
console.log('Revision '+data.result.header.revision+' events: '); console.log('Revision '+data.result.header.revision+' events: ');
} }
this.etcd_watch_revision = BigInt(data.result.header.revision)+BigInt(1); this.etcd_watch_revision = BigInt(data.result.header.revision)+BigInt(1);
for (const e of data.result.events||[]) for (const e of data.result.events)
{ {
this.parse_kv(e.kv); this.parse_kv(e.kv);
const key = e.kv.key.substr(this.etcd_prefix.length); const key = e.kv.key.substr(this.etcd_prefix.length);
@@ -617,7 +509,7 @@ class Mon
} }
if (pg_states_changed) if (pg_states_changed)
{ {
this.save_last_clean().catch(this.die); this.save_last_clean().catch(console.error);
} }
if (stats_changed) if (stats_changed)
{ {
@@ -688,11 +580,11 @@ class Mon
} }
} }
async on_stop(status) async on_stop()
{ {
clearInterval(this.lease_timer); clearInterval(this.lease_timer);
await this.etcd_call('/lease/revoke', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries); await this.etcd_call('/lease/revoke', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
process.exit(status); process.exit(0);
} }
async become_master() async become_master()
@@ -745,13 +637,10 @@ class Mon
for (const node_id in this.state.config.node_placement||{}) for (const node_id in this.state.config.node_placement||{})
{ {
const node_cfg = this.state.config.node_placement[node_id]; const node_cfg = this.state.config.node_placement[node_id];
if (/^\d+$/.exec(node_id)) if (!node_id || /^\d/.exec(node_id) ||
!node_cfg.level || !levels[node_cfg.level])
{ {
node_cfg.level = 'osd'; // All nodes must have non-empty non-numeric IDs and valid levels
}
if (!node_id || !node_cfg.level || !levels[node_cfg.level])
{
// All nodes must have non-empty IDs and valid levels
continue; continue;
} }
tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] }; tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
@@ -784,10 +673,10 @@ class Mon
.reduce((a, c) => { a[c] = true; return a; }, {}); .reduce((a, c) => { a[c] = true; return a; }, {});
} }
delete tree[osd_num].children; delete tree[osd_num].children;
if (!tree[stat.host]) if (!tree[tree[osd_num].parent])
{ {
tree[stat.host] = { tree[tree[osd_num].parent] = {
id: stat.host, id: tree[osd_num].parent,
level: 'host', level: 'host',
parent: null, parent: null,
children: [], children: [],
@@ -1133,7 +1022,7 @@ class Mon
pg_size: pool_cfg.pg_size, pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize, pg_minsize: pool_cfg.pg_minsize,
max_combinations: pool_cfg.max_osd_combinations, max_combinations: pool_cfg.max_osd_combinations,
ordered: pool_cfg.scheme != 'replicated', round_robin: pool_cfg.scheme != 'replicated',
}; };
let optimize_result; let optimize_result;
if (old_pg_count > 0) if (old_pg_count > 0)
@@ -1156,6 +1045,10 @@ class Mon
{ {
pg.push(0); pg.push(0);
} }
while (pg.length > pool_cfg.pg_size)
{
pg.pop();
}
} }
if (!this.state.config.pgs.hash) if (!this.state.config.pgs.hash)
{ {
@@ -1191,8 +1084,8 @@ class Mon
this.state.pool.stats[pool_id] = { this.state.pool.stats[pool_id] = {
used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0, used_raw_tb: (this.state.pool.stats[pool_id]||{}).used_raw_tb || 0,
total_raw_tb: optimize_result.space, total_raw_tb: optimize_result.space,
pg_real_size: pg_effsize || pool_cfg.pg_size, pg_real_size: pg_effsize,
raw_to_usable: (pg_effsize || pool_cfg.pg_size) / (pool_cfg.scheme === 'replicated' raw_to_usable: pg_effsize / (pool_cfg.scheme === 'replicated'
? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))), ? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))),
space_efficiency: optimize_result.space/(optimize_result.total_space||1), space_efficiency: optimize_result.space/(optimize_result.total_space||1),
}; };
@@ -1302,7 +1195,7 @@ class Mon
this.recheck_timer = setTimeout(() => this.recheck_timer = setTimeout(() =>
{ {
this.recheck_timer = null; this.recheck_timer = null;
this.recheck_pgs().catch(this.die); this.recheck_pgs().catch(console.error);
}, this.config.mon_change_timeout || 1000); }, this.config.mon_change_timeout || 1000);
} }
@@ -1449,7 +1342,6 @@ class Mon
{ {
for (const inode_num in inode_stats[pool_id]) for (const inode_num in inode_stats[pool_id])
{ {
let nonzero = inode_stats[pool_id][inode_num].raw_used > 0;
for (const op of [ 'read', 'write', 'delete' ]) for (const op of [ 'read', 'write', 'delete' ])
{ {
const op_st = inode_stats[pool_id][inode_num][op]; const op_st = inode_stats[pool_id][inode_num][op];
@@ -1457,13 +1349,6 @@ class Mon
op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0; op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0; op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0; op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
nonzero = true;
}
if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
{
// Deleted inode (no data, no I/O, no config)
delete inode_stats[pool_id][inode_num];
} }
} }
} }
@@ -1512,18 +1397,6 @@ class Mon
} }); } });
} }
} }
for (const pool_id in this.state.inode.stats)
{
for (const inode_num in this.state.inode.stats[pool_id])
{
if (!inode_stats[pool_id] || !inode_stats[pool_id][inode_num])
{
txn.push({ requestDeleteRange: {
key: b64(this.etcd_prefix+'/inode/stats/'+pool_id+'/'+inode_num),
} });
}
}
}
for (const pool_id in this.state.pool.stats) for (const pool_id in this.state.pool.stats)
{ {
const pool_stats = { ...this.state.pool.stats[pool_id] }; const pool_stats = { ...this.state.pool.stats[pool_id] };
@@ -1589,7 +1462,7 @@ class Mon
cur[key_parts[key_parts.length-1]] = kv.value; cur[key_parts[key_parts.length-1]] = kv.value;
if (key === 'config/global') if (key === 'config/global')
{ {
this.config = { ...this.initConfig, ...this.state.config.global }; this.config = this.state.config.global;
this.check_config(); this.check_config();
for (const osd_num in this.state.osd.stats) for (const osd_num in this.state.osd.stats)
{ {
@@ -1626,15 +1499,12 @@ class Mon
} }
while (retries < 0 || retry < retries) while (retries < 0 || retry < retries)
{ {
retry++; const base = this.etcd_urls[Math.floor(Math.random()*this.etcd_urls.length)];
const base = this.pick_next_etcd();
const res = await POST(base+path, body, timeout); const res = await POST(base+path, body, timeout);
if (res.error) if (res.error)
{ {
if (this.selected_etcd_url == base) console.error('etcd returned error: '+res.error);
this.selected_etcd_url = null; break;
console.error('failed to query etcd: '+res.error);
continue;
} }
if (res.json) if (res.json)
{ {
@@ -1643,20 +1513,26 @@ class Mon
console.error('etcd returned error: '+res.json.error); console.error('etcd returned error: '+res.json.error);
break; break;
} }
if (this.etcd_urls.length > 1)
{
// Stick to the same etcd for the rest of calls
this.etcd_urls = [ base ];
}
return res.json; return res.json;
} }
retry++;
} }
this.die(); this.die();
} }
_die(err) die(err)
{ {
// In fact we can just try to rejoin // In fact we can just try to rejoin
console.error(new Error(err || 'Cluster connection failed')); console.error(new Error(err || 'Cluster connection failed'));
process.exit(1); process.exit(1);
} }
local_ips(all) local_ips()
{ {
const ips = []; const ips = [];
const ifaces = os.networkInterfaces(); const ifaces = os.networkInterfaces();
@@ -1664,7 +1540,7 @@ class Mon
{ {
for (const iface of ifaces[ifname]) for (const iface of ifaces[ifname])
{ {
if (iface.family == 'IPv4' && !iface.internal || all) if (iface.family == 'IPv4' && !iface.internal)
{ {
ips.push(iface.address); ips.push(iface.address);
} }

View File

@@ -5,45 +5,21 @@ const LPOptimizer = require('./lp-optimizer.js');
async function run() async function run()
{ {
const osd_tree = { const osd_tree = { a: { 1: 1 }, b: { 2: 1 }, c: { 3: 1 } };
100: { 1: 1 },
200: { 2: 1 },
300: { 3: 1 },
};
let res; let res;
console.log('16 PGs, size=3'); console.log('16 PGs, size=3');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: false }); res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 3, 'Initial distribution');
console.log('\nChange size to 2'); console.log('\nReduce PG size to 2');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: false }); res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs.map(pg => pg.slice(0, 2)), osd_tree, pg_size: 2 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space >= 3*14/16 && res.osd_differs == 0, 'Redistribution');
console.log('\nRemove OSD 3'); console.log('\nRemove OSD 3');
const no3_tree = { ...osd_tree }; delete osd_tree['c'];
delete no3_tree['300']; res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: no3_tree, pg_size: 2, ordered: false });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 2, 'Redistribution after OSD removal');
console.log('\n16 PGs, size=3, ordered');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: true });
LPOptimizer.print_change_stats(res, false);
assert(res.space == 3, 'Initial distribution');
console.log('\nChange size to 2, ordered');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: true });
LPOptimizer.print_change_stats(res, false);
assert(res.space >= 3*14/16 && res.osd_differs < 8, 'Redistribution');
}
function assert(cond, txt)
{
if (!cond)
{
throw new Error((txt||'test')+' failed');
}
} }
run().catch(console.error); run().catch(console.error);

View File

@@ -45,45 +45,30 @@ async function run()
console.log('Empty tree:'); console.log('Empty tree:');
let res = await LPOptimizer.optimize_initial({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 }); let res = await LPOptimizer.optimize_initial({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);
console.log('\nAdding 1st failure domain:'); console.log('\nAdding 1st failure domain:');
cur_tree['dom1'] = osd_tree['dom1']; cur_tree['dom1'] = osd_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nAdding 2nd failure domain:'); console.log('\nAdding 2nd failure domain:');
cur_tree['dom2'] = osd_tree['dom2']; cur_tree['dom2'] = osd_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nAdding 3rd failure domain:'); console.log('\nAdding 3rd failure domain:');
cur_tree['dom3'] = osd_tree['dom3']; cur_tree['dom3'] = osd_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 36 && res.total_space == 36);
console.log('\nRemoving 3rd failure domain:'); console.log('\nRemoving 3rd failure domain:');
delete cur_tree['dom3']; delete cur_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nRemoving 2nd failure domain:'); console.log('\nRemoving 2nd failure domain:');
delete cur_tree['dom2']; delete cur_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nRemoving 1st failure domain:'); console.log('\nRemoving 1st failure domain:');
delete cur_tree['dom1']; delete cur_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }); res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
LPOptimizer.print_change_stats(res, false); LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);
}
function assert(cond, txt)
{
if (!cond)
{
throw new Error((txt||'test')+' failed');
}
} }
run().catch(console.error); run().catch(console.error);

View File

@@ -1,33 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const LPOptimizer = require('./lp-optimizer.js');
const osd_tree = {
100: {
1: 0.1,
2: 0.1,
3: 0.1,
},
200: {
4: 0.1,
5: 0.1,
6: 0.1,
},
};
async function run()
{
let res;
console.log('256 PGs, 3+3 OSDs, size=2');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 2, pg_count: 256 });
LPOptimizer.print_change_stats(res, false);
// Should NOT fail with the "unfeasible or unbounded" exception
console.log('\nRemoving osd.2');
delete osd_tree[100][2];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
LPOptimizer.print_change_stats(res, false);
}
run().catch(console.error);

View File

@@ -1,503 +0,0 @@
# Install as /usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm
# Proxmox Vitastor Driver
# Copyright (c) Vitaliy Filippov, 2021+
# License: VNPL-1.1 or GNU AGPLv3.0
package PVE::Storage::Custom::VitastorPlugin;
use strict;
use warnings;
use JSON;
use PVE::Storage::Plugin;
use PVE::Tools qw(run_command);
use base qw(PVE::Storage::Plugin);
sub api
{
# Trick it :)
return PVE::Storage->APIVER;
}
sub run_cli
{
my ($scfg, $cmd, %args) = @_;
my $retval;
my $stderr = '';
my $errmsg = $args{errmsg} ? $args{errmsg}.": " : "vitastor-cli error: ";
my $json = delete $args{json};
$json = 1 if !defined $json;
my $binary = delete $args{binary};
$binary = '/usr/bin/vitastor-cli' if !defined $binary;
if (!exists($args{errfunc}))
{
$args{errfunc} = sub
{
my $line = shift;
print STDERR $line;
*STDERR->flush();
$stderr .= $line;
};
}
if (!exists($args{outfunc}))
{
$retval = '';
$args{outfunc} = sub { $retval .= shift };
if ($json)
{
unshift @$cmd, '--json';
}
}
if ($scfg->{vitastor_etcd_address})
{
unshift @$cmd, '--etcd_address', $scfg->{vitastor_etcd_address};
}
if ($scfg->{vitastor_config_path})
{
unshift @$cmd, '--config_path', $scfg->{vitastor_config_path};
}
unshift @$cmd, $binary;
eval { run_command($cmd, %args); };
if (my $err = $@)
{
die "Error invoking vitastor-cli: $err";
}
if (defined $retval)
{
# untaint
$retval =~ /^(.*)$/s;
if ($json)
{
eval { $retval = JSON::decode_json($1); };
if ($@)
{
die "vitastor-cli returned bad JSON: $@";
}
}
else
{
$retval = $1;
}
}
return $retval;
}
# Configuration
sub type
{
return 'vitastor';
}
sub plugindata
{
return {
content => [ { images => 1, rootdir => 1 }, { images => 1 } ],
};
}
sub properties
{
return {
vitastor_etcd_address => {
description => 'IP address(es) of etcd.',
type => 'string',
format => 'pve-storage-portal-dns-list',
},
vitastor_etcd_prefix => {
description => 'Prefix for Vitastor etcd metadata',
type => 'string',
},
vitastor_config_path => {
description => 'Path to Vitastor configuration file',
type => 'string',
},
vitastor_prefix => {
description => 'Image name prefix',
type => 'string',
},
vitastor_pool => {
description => 'Default pool to use for images',
type => 'string',
},
vitastor_nbd => {
description => 'Use kernel NBD devices (slower)',
type => 'boolean',
},
};
}
sub options
{
return {
nodes => { optional => 1 },
disable => { optional => 1 },
vitastor_etcd_address => { optional => 1},
vitastor_etcd_prefix => { optional => 1 },
vitastor_config_path => { optional => 1 },
vitastor_prefix => { optional => 1 },
vitastor_pool => {},
vitastor_nbd => { optional => 1 },
};
}
# Storage implementation
sub parse_volname
{
my ($class, $volname) = @_;
if ($volname =~ m/^((base-(\d+)-\S+)\/)?((?:(base)|(vm))-(\d+)-\S+)$/)
{
# ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format)
return ('images', $4, $7, $2, $3, $5, 'raw');
}
die "unable to parse vitastor volume name '$volname'\n";
}
sub _qemu_option
{
my ($k, $v) = @_;
if (defined $v && $v ne "")
{
$v =~ s/:/\\:/gso;
return ":$k=$v";
}
return "";
}
sub path
{
my ($class, $scfg, $volname, $storeid, $snapname) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
$name .= '@'.$snapname if $snapname;
if ($scfg->{vitastor_nbd})
{
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
die "Image not mapped via NBD" if !$kerneldev;
return ($kerneldev, $vmid, $vtype);
}
my $path = "vitastor";
$path .= _qemu_option('config_path', $scfg->{vitastor_config_path});
# FIXME This is the only exception: etcd_address -> etcd_host for qemu
$path .= _qemu_option('etcd_host', $scfg->{vitastor_etcd_address});
$path .= _qemu_option('etcd_prefix', $scfg->{vitastor_etcd_prefix});
$path .= _qemu_option('image', $prefix.$name);
return ($path, $vmid, $vtype);
}
sub _find_free_diskname
{
my ($class, $storeid, $scfg, $vmid, $fmt, $add_fmt_suffix) = @_;
my $list = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls' ]));
$list = [ map { $_->{name} } @$list ];
return PVE::Storage::Plugin::get_next_vm_diskname($list, $storeid, $vmid, undef, $scfg);
}
# Used only in "Create Template" and, in fact, converts a VM into a template
# As a consequence, this is always invoked with the VM powered off
# So we just rename vm-xxx to base-xxx and make it a readonly base layer
sub create_base
{
my ($class, $storeid, $scfg, $volname) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid, $basename, $basevmid, $isBase) = $class->parse_volname($volname);
die "create_base not possible with base image\n" if $isBase;
my $info = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls', $prefix.$name ]))->[0];
die "image $name does not exist\n" if !$info;
die "volname '$volname' contains wrong information about parent {$info->{parent}} $basename\n"
if $basename && (!$info->{parent} || $info->{parent} ne $basename);
my $newname = $name;
$newname =~ s/^vm-/base-/;
my $newvolname = $basename ? "$basename/$newname" : "$newname";
run_cli($scfg, [ 'modify', '--rename', $prefix.$newname, '--readonly', $prefix.$name ], json => 0);
return $newvolname;
}
sub clone_image
{
my ($class, $scfg, $storeid, $volname, $vmid, $snapname) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my $snap = '';
$snap = '@'.$snapname if length $snapname;
my ($vtype, $basename, $basevmid, undef, undef, $isBase) = $class->parse_volname($volname);
die "$volname is not a base image and snapname is not provided\n" if !$isBase && !length($snapname);
my $name = $class->find_free_diskname($storeid, $scfg, $vmid);
warn "clone $volname: $basename snapname $snap to $name\n";
my $newvol = "$basename/$name";
$newvol = $name if length($snapname);
run_cli($scfg, [ 'create', '--parent', $prefix.$basename.$snap, $prefix.$name ], json => 0);
return $newvol;
}
sub alloc_image
{
# $size is in kb in this method
my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
die "illegal name '$name' - should be 'vm-$vmid-*'\n" if $name && $name !~ m/^vm-$vmid-/;
$name = $class->find_free_diskname($storeid, $scfg, $vmid) if !$name;
run_cli($scfg, [ 'create', '--size', (int(($size+3)/4)*4).'k', '--pool', $scfg->{vitastor_pool}, $prefix.$name ], json => 0);
return $name;
}
sub free_image
{
my ($class, $storeid, $scfg, $volname, $isBase) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid, undef, undef, undef) = $class->parse_volname($volname);
$class->deactivate_volume($storeid, $scfg, $volname);
my $full_list = run_cli($scfg, [ 'ls', '-l' ]);
my $list = _process_list($scfg, $storeid, $full_list);
# Remove image and all its snapshots
my $rm_names = {
map { ($prefix.$_->{name} => 1) }
grep { $_->{name} eq $name || substr($_->{name}, 0, length($name)+1) eq ($name.'@') }
@$list
};
my $children = [ grep { $_->{parent_name} && $rm_names->{$_->{parent_name}} } @$full_list ];
die "Image has children: ".join(', ', map {
substr($_->{name}, 0, length $prefix) eq $prefix
? substr($_->name, length $prefix)
: $_->{name}
} @$children)."\n" if @$children;
my $to_remove = [ grep { $rm_names->{$_->{name}} } @$full_list ];
for my $rmi (@$to_remove)
{
run_cli($scfg, [ 'rm-data', '--pool', $rmi->{pool_id}, '--inode', $rmi->{inode_num} ], json => 0);
}
for my $rmi (@$to_remove)
{
run_cli($scfg, [ 'rm', $rmi->{name} ], json => 0);
}
return undef;
}
sub _process_list
{
my ($scfg, $storeid, $result) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my $list = [];
foreach my $el (@$result)
{
next if !$el->{name} || length($prefix) && substr($el->{name}, 0, length $prefix) ne $prefix;
my $name = substr($el->{name}, length $prefix);
next if $name =~ /@/;
my ($owner) = $name =~ /^(?:vm|base)-(\d+)-/s;
next if !defined $owner;
my $parent = !defined $el->{parent_name}
? undef
: ($prefix eq '' || substr($el->{parent_name}, 0, length $prefix) eq $prefix
? substr($el->{parent_name}, length $prefix) : '');
my $volid = $parent && $parent =~ /^(base-\d+-\S+)$/s
? "$storeid:$1/$name" : "$storeid:$name";
push @$list, {
format => 'raw',
volid => $volid,
name => $name,
size => $el->{size},
parent => $parent,
vmid => $owner,
};
}
return $list;
}
sub list_images
{
my ($class, $storeid, $scfg, $vmid, $vollist, $cache) = @_;
my $list = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls', '-l' ]));
if ($vollist)
{
my $h = { map { ($_ => 1) } @$vollist };
$list = [ grep { $h->{$_->{volid}} } @$list ]
}
elsif (defined $vmid)
{
$list = [ grep { $_->{vmid} eq $vmid } @$list ];
}
return $list;
}
sub status
{
my ($class, $storeid, $scfg, $cache) = @_;
my $stats = [ grep { $_->{name} eq $scfg->{vitastor_pool} } @{ run_cli($scfg, [ 'df' ]) } ]->[0];
my $free = $stats ? $stats->{max_available} : 0;
my $used = $stats ? $stats->{used_raw}/($stats->{raw_to_usable}||1) : 0;
my $total = $free+$used;
my $active = $stats ? 1 : 0;
return ($total, $free, $used, $active);
}
sub activate_storage
{
my ($class, $storeid, $scfg, $cache) = @_;
return 1;
}
sub deactivate_storage
{
my ($class, $storeid, $scfg, $cache) = @_;
return 1;
}
sub map_volume
{
my ($class, $storeid, $scfg, $volname, $snapname) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $img_name, $vmid) = $class->parse_volname($volname);
my $name = $img_name;
$name .= '@'.$snapname if $snapname;
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
return $kerneldev if $kerneldev && -b $kerneldev; # already mapped
$kerneldev = run_cli($scfg, [ 'map', '--image', $prefix.$name ], binary => '/usr/bin/vitastor-nbd', json => 0);
return $kerneldev;
}
sub unmap_volume
{
my ($class, $storeid, $scfg, $volname, $snapname) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
return 1 if !$scfg->{vitastor_nbd};
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
$name .= '@'.$snapname if $snapname;
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
if ($kerneldev && -b $kerneldev)
{
run_cli($scfg, [ 'unmap', $kerneldev ], binary => '/usr/bin/vitastor-nbd', json => 0);
}
return 1;
}
sub activate_volume
{
my ($class, $storeid, $scfg, $volname, $snapname, $cache) = @_;
$class->map_volume($storeid, $scfg, $volname, $snapname) if $scfg->{vitastor_nbd};
return 1;
}
sub deactivate_volume
{
my ($class, $storeid, $scfg, $volname, $snapname, $cache) = @_;
$class->unmap_volume($storeid, $scfg, $volname, $snapname);
return 1;
}
sub volume_size_info
{
my ($class, $scfg, $storeid, $volname, $timeout) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
my $info = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls', $prefix.$name ]))->[0];
#return wantarray ? ($size, $format, $used, $parent, $st->ctime) : $size;
return $info->{size};
}
sub volume_resize
{
my ($class, $scfg, $storeid, $volname, $size, $running) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
# $size is in bytes in this method
run_cli($scfg, [ 'modify', '--resize', (int(($size+4095)/4096)*4).'k', $prefix.$name ], json => 0);
return undef;
}
sub volume_snapshot
{
my ($class, $scfg, $storeid, $volname, $snap) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
run_cli($scfg, [ 'create', '--snapshot', $snap, $prefix.$name ], json => 0);
return undef;
}
sub volume_snapshot_rollback
{
my ($class, $scfg, $storeid, $volname, $snap) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
run_cli($scfg, [ 'rm', $prefix.$name ], json => 0);
run_cli($scfg, [ 'create', '--parent', $prefix.$name.'@'.$snap, $prefix.$name ], json => 0);
return undef;
}
sub volume_snapshot_delete
{
my ($class, $scfg, $storeid, $volname, $snap, $running) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
run_cli($scfg, [ 'rm', $prefix.$name.'@'.$snap ], json => 0);
return undef;
}
sub volume_snapshot_needs_fsfreeze
{
return 1;
}
sub volume_has_feature
{
my ($class, $scfg, $feature, $storeid, $volname, $snapname, $running) = @_;
my $features = {
snapshot => { current => 1, snap => 1 },
clone => { base => 1, snap => 1 },
template => { current => 1 },
copy => { base => 1, current => 1, snap => 1 },
sparseinit => { base => 1, current => 1 },
rename => { current => 1 },
};
my ($vtype, $name, $vmid, $basename, $basevmid, $isBase) = $class->parse_volname($volname);
my $key = undef;
if ($snapname)
{
$key = 'snap';
}
else
{
$key = $isBase ? 'base' : 'current';
}
return 1 if $features->{$feature}->{$key};
return undef;
}
sub rename_volume
{
my ($class, $scfg, $storeid, $source_volname, $target_vmid, $target_volname) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my (undef, $source_image, $source_vmid, $base_name, $base_vmid, undef, $format) =
$class->parse_volname($source_volname);
$target_volname = $class->find_free_diskname($storeid, $scfg, $target_vmid, $format) if !$target_volname;
run_cli($scfg, [ 'modify', '--rename', $prefix.$target_volname, $prefix.$source_image ], json => 0);
$base_name = $base_name ? "${base_name}/" : '';
return "${storeid}:${base_name}${target_volname}";
}
1;

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver from cinder.volume import driver
from cinder.volume import volume_utils from cinder.volume import volume_utils
VERSION = '0.6.12' VERSION = '0.6.8'
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
@@ -658,8 +658,8 @@ class VitastorDriver(driver.CloneableImageVD,
'etcd_address': self.configuration.vitastor_etcd_address, 'etcd_address': self.configuration.vitastor_etcd_address,
'etcd_prefix': self.configuration.vitastor_etcd_prefix, 'etcd_prefix': self.configuration.vitastor_etcd_prefix,
'name': volume.name, 'name': volume.name,
'logical_block_size': '512', 'logical_block_size': 512,
'physical_block_size': '4096', 'physical_block_size': 4096,
} }
} }
LOG.debug('connection data: %s', data) LOG.debug('connection data: %s', data)

View File

@@ -1,4 +1,4 @@
commit 7f01510ef207940b07fac4f5fc8b9f1580b443aa commit bd283191b3e7a4c6d1c100d3d96e348a1ebffe55
Author: Vitaliy Filippov <vitalif@yourcmc.ru> Author: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Sun Jun 27 12:52:40 2021 +0300 Date: Sun Jun 27 12:52:40 2021 +0300
@@ -65,38 +65,10 @@ index 4bf2b5f..dbc011b 100644
int virConnectListAllStoragePools(virConnectPtr conn, int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 222bb8c..2c30c55 100644 index 222bb8c..685d255 100644
--- a/src/conf/domain_conf.c --- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c
@@ -4667,8 +4667,7 @@ virDomainDeviceDefPostParseCommon(virDomainDeviceDefPtr dev, @@ -8653,6 +8653,10 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
if (dev->type == VIR_DOMAIN_DEVICE_DISK) {
virDomainDiskDefPtr disk = dev->data.disk;
- /* internal snapshots and config files are currently supported
- * only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(disk->src) != VIR_STORAGE_TYPE_NETWORK &&
disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (disk->src->snapshot) {
@@ -4677,11 +4676,15 @@ virDomainDeviceDefPostParseCommon(virDomainDeviceDefPtr dev,
"only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(disk->src) != VIR_STORAGE_TYPE_NETWORK &&
+ disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (disk->src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
_("<config> element is currently supported "
- "only with 'rbd' disks"));
+ "only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
@@ -8653,6 +8656,10 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
goto cleanup; goto cleanup;
} }
@@ -107,7 +79,7 @@ index 222bb8c..2c30c55 100644
if ((haveTLS = virXMLPropString(node, "tls")) && if ((haveTLS = virXMLPropString(node, "tls")) &&
(src->haveTLS = virTristateBoolTypeFromString(haveTLS)) <= 0) { (src->haveTLS = virTristateBoolTypeFromString(haveTLS)) <= 0) {
virReportError(VIR_ERR_XML_ERROR, virReportError(VIR_ERR_XML_ERROR,
@@ -23849,6 +23856,10 @@ virDomainDiskSourceFormatNetwork(virBufferPtr attrBuf, @@ -23849,6 +23853,10 @@ virDomainDiskSourceFormatNetwork(virBufferPtr attrBuf,
virBufferEscapeString(attrBuf, " name='%s'", path ? path : src->path); virBufferEscapeString(attrBuf, " name='%s'", path ? path : src->path);
@@ -118,7 +90,7 @@ index 222bb8c..2c30c55 100644
VIR_FREE(path); VIR_FREE(path);
if (src->haveTLS != VIR_TRISTATE_BOOL_ABSENT && if (src->haveTLS != VIR_TRISTATE_BOOL_ABSENT &&
@@ -30930,6 +30941,7 @@ virDomainDiskTranslateSourcePool(virDomainDiskDefPtr def) @@ -30930,6 +30938,7 @@ virDomainDiskTranslateSourcePool(virDomainDiskDefPtr def)
case VIR_STORAGE_POOL_MPATH: case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD: case VIR_STORAGE_POOL_RBD:
@@ -244,7 +216,7 @@ index 73e988a..ab7bb81 100644
case VIR_STORAGE_NET_PROTOCOL_NONE: case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT, virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index cbf0aa4..f0ca9e7 100644 index cbf0aa4..096700d 100644
--- a/src/qemu/qemu_block.c --- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c +++ b/src/qemu/qemu_block.c
@@ -959,6 +959,42 @@ qemuBlockStorageSourceGetRBDProps(virStorageSourcePtr src) @@ -959,6 +959,42 @@ qemuBlockStorageSourceGetRBDProps(virStorageSourcePtr src)
@@ -303,7 +275,7 @@ index cbf0aa4..f0ca9e7 100644
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src))) if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
return NULL; return NULL;
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 822d5f8..abec34e 100644 index 822d5f8..e375cef 100644
--- a/src/qemu/qemu_command.c --- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c
@@ -975,6 +975,43 @@ qemuBuildNetworkDriveStr(virStorageSourcePtr src, @@ -975,6 +975,43 @@ qemuBuildNetworkDriveStr(virStorageSourcePtr src,
@@ -415,7 +387,7 @@ index 4a13e90..33301c7 100644
ignore_value(VIR_STRDUP(stable_path, data->path)); ignore_value(VIR_STRDUP(stable_path, data->path));
break; break;
diff --git a/src/util/virstoragefile.c b/src/util/virstoragefile.c diff --git a/src/util/virstoragefile.c b/src/util/virstoragefile.c
index bd4b027..8454906 100644 index bd4b027..b323cd6 100644
--- a/src/util/virstoragefile.c --- a/src/util/virstoragefile.c
+++ b/src/util/virstoragefile.c +++ b/src/util/virstoragefile.c
@@ -84,7 +84,8 @@ VIR_ENUM_IMPL(virStorageNetProtocol, VIR_STORAGE_NET_PROTOCOL_LAST, @@ -84,7 +84,8 @@ VIR_ENUM_IMPL(virStorageNetProtocol, VIR_STORAGE_NET_PROTOCOL_LAST,

View File

@@ -1,4 +1,4 @@
commit 4e74c622884e2585b2cfcdf322fbd2bff6de41ca commit 41cdfe8317d98f70aadedfdbb381effed2641bdd
Author: Vitaliy Filippov <vitalif@yourcmc.ru> Author: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Fri Jul 9 01:31:57 2021 +0300 Date: Fri Jul 9 01:31:57 2021 +0300
@@ -65,38 +65,10 @@ index 089e1e0..d7e7ef4 100644
int virConnectListAllStoragePools(virConnectPtr conn, int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 01b7187..645c758 100644 index 01b7187..c6e9702 100644
--- a/src/conf/domain_conf.c --- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c
@@ -5230,8 +5230,7 @@ virDomainDiskDefPostParse(virDomainDiskDefPtr disk, @@ -8261,7 +8261,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
const virDomainDef *def,
virDomainXMLOptionPtr xmlopt)
{
- /* internal snapshots and config files are currently supported
- * only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(disk->src) != VIR_STORAGE_TYPE_NETWORK &&
disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (disk->src->snapshot) {
@@ -5240,11 +5239,15 @@ virDomainDiskDefPostParse(virDomainDiskDefPtr disk,
"only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(disk->src) != VIR_STORAGE_TYPE_NETWORK &&
+ disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (disk->src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
_("<config> element is currently supported "
- "only with 'rbd' disks"));
+ "only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
@@ -8261,7 +8264,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
src->configFile = virXPathString("string(./config/@file)", ctxt); src->configFile = virXPathString("string(./config/@file)", ctxt);
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP || if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
@@ -106,7 +78,7 @@ index 01b7187..645c758 100644
src->query = virXMLPropString(node, "query"); src->query = virXMLPropString(node, "query");
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0) if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
@@ -31392,6 +31396,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSourcePtr src, @@ -31392,6 +31393,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSourcePtr src,
case VIR_STORAGE_POOL_MPATH: case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD: case VIR_STORAGE_POOL_RBD:
@@ -244,7 +216,7 @@ index 17b93d0..c5a0084 100644
case VIR_STORAGE_NET_PROTOCOL_NONE: case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT, virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index f9c6da2..d837a05 100644 index f9c6da2..922dde5 100644
--- a/src/qemu/qemu_block.c --- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c +++ b/src/qemu/qemu_block.c
@@ -938,6 +938,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSourcePtr src, @@ -938,6 +938,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSourcePtr src,
@@ -321,7 +293,7 @@ index f9c6da2..d837a05 100644
driver = "sheepdog"; driver = "sheepdog";
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src))) if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 6f970a3..4c03fb8 100644 index 6f970a3..10b39ca 100644
--- a/src/qemu/qemu_command.c --- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c
@@ -1034,6 +1034,43 @@ qemuBuildNetworkDriveStr(virStorageSourcePtr src, @@ -1034,6 +1034,43 @@ qemuBuildNetworkDriveStr(virStorageSourcePtr src,
@@ -443,7 +415,7 @@ index 29c4c86..a27ad94 100644
case VIR_STORAGE_POOL_LOGICAL: case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK: case VIR_STORAGE_POOL_DISK:
diff --git a/src/util/virstoragefile.c b/src/util/virstoragefile.c diff --git a/src/util/virstoragefile.c b/src/util/virstoragefile.c
index 0d3c2af..edb7f9e 100644 index 0d3c2af..36e3afc 100644
--- a/src/util/virstoragefile.c --- a/src/util/virstoragefile.c
+++ b/src/util/virstoragefile.c +++ b/src/util/virstoragefile.c
@@ -91,6 +91,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol, @@ -91,6 +91,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,

View File

@@ -1,4 +1,4 @@
commit c97d7f2bfb7798f0d68bdba2646245dcfb940efa commit c6e1958a1b4974828e8e5852beb252ce6594e670
Author: Vitaliy Filippov <vitalif@yourcmc.ru> Author: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Mon Jun 28 01:20:19 2021 +0300 Date: Mon Jun 28 01:20:19 2021 +0300
@@ -86,37 +86,6 @@ index d78f846..f7222e3 100644
case VIR_STORAGE_POOL_SHEEPDOG: case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER: case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST: case VIR_STORAGE_POOL_LAST:
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
index 2124d25..6acc6fa 100644
--- a/src/conf/domain_validate.c
+++ b/src/conf/domain_validate.c
@@ -470,7 +470,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
}
}
- /* internal snapshots and config files are currently supported only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (src->snapshot) {
@@ -479,11 +479,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
"only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
_("<config> element is currently supported "
- "only with 'rbd' disks"));
+ "only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
index 2aa9a3d..166ca1f 100644 index 2aa9a3d..166ca1f 100644
--- a/src/conf/storage_conf.c --- a/src/conf/storage_conf.c
@@ -279,7 +248,7 @@ index c0905b0..c172378 100644
case VIR_STORAGE_NET_PROTOCOL_NONE: case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT, virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index 6627d04..f769d24 100644 index 6627d04..c33f428 100644
--- a/src/qemu/qemu_block.c --- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c +++ b/src/qemu/qemu_block.c
@@ -928,6 +928,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src, @@ -928,6 +928,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
@@ -356,7 +325,7 @@ index 6627d04..f769d24 100644
driver = "sheepdog"; driver = "sheepdog";
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src))) if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index ea51369..d714597 100644 index ea51369..8258632 100644
--- a/src/qemu/qemu_command.c --- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c
@@ -1074,6 +1074,43 @@ qemuBuildNetworkDriveStr(virStorageSource *src, @@ -1074,6 +1074,43 @@ qemuBuildNetworkDriveStr(virStorageSource *src,
@@ -466,7 +435,7 @@ index c2ff4b8..70d0689 100644
case VIR_STORAGE_POOL_ZFS: case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_LAST: case VIR_STORAGE_POOL_LAST:
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
index e48ae72..2017ccc 100644 index e48ae72..d7a9b72 100644
--- a/src/storage_file/storage_source_backingstore.c --- a/src/storage_file/storage_source_backingstore.c
+++ b/src/storage_file/storage_source_backingstore.c +++ b/src/storage_file/storage_source_backingstore.c
@@ -284,6 +284,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr, @@ -284,6 +284,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,

View File

@@ -1,692 +0,0 @@
commit c97d7f2bfb7798f0d68bdba2646245dcfb940efa
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Mon Jun 28 01:20:19 2021 +0300
Add Vitastor support
Index: libvirt-7.6.0/docs/schemas/domaincommon.rng
===================================================================
--- libvirt-7.6.0.orig/docs/schemas/domaincommon.rng
+++ libvirt-7.6.0/docs/schemas/domaincommon.rng
@@ -1877,6 +1877,35 @@
</element>
</define>
+ <define name="diskSourceNetworkProtocolVitastor">
+ <element name="source">
+ <interleave>
+ <attribute name="protocol">
+ <value>vitastor</value>
+ </attribute>
+ <ref name="diskSourceCommon"/>
+ <optional>
+ <attribute name="name"/>
+ </optional>
+ <optional>
+ <attribute name="query"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="diskSourceNetworkHost"/>
+ </zeroOrMore>
+ <optional>
+ <element name="config">
+ <attribute name="file">
+ <ref name="absFilePath"/>
+ </attribute>
+ <empty/>
+ </element>
+ </optional>
+ <empty/>
+ </interleave>
+ </element>
+ </define>
+
<define name="diskSourceNetworkProtocolISCSI">
<element name="source">
<attribute name="protocol">
@@ -2133,6 +2162,7 @@
<ref name="diskSourceNetworkProtocolSimple"/>
<ref name="diskSourceNetworkProtocolVxHS"/>
<ref name="diskSourceNetworkProtocolNFS"/>
+ <ref name="diskSourceNetworkProtocolVitastor"/>
</choice>
</define>
Index: libvirt-7.6.0/include/libvirt/libvirt-storage.h
===================================================================
--- libvirt-7.6.0.orig/include/libvirt/libvirt-storage.h
+++ libvirt-7.6.0/include/libvirt/libvirt-storage.h
@@ -245,6 +245,7 @@ typedef enum {
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17,
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18,
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19,
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20,
} virConnectListAllStoragePoolsFlags;
int virConnectListAllStoragePools(virConnectPtr conn,
Index: libvirt-7.6.0/src/conf/domain_conf.c
===================================================================
--- libvirt-7.6.0.orig/src/conf/domain_conf.c
+++ libvirt-7.6.0/src/conf/domain_conf.c
@@ -8268,7 +8268,8 @@ virDomainDiskSourceNetworkParse(xmlNodeP
src->configFile = virXPathString("string(./config/@file)", ctxt);
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
src->query = virXMLPropString(node, "query");
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
@@ -30831,6 +30832,7 @@ virDomainStorageSourceTranslateSourcePoo
case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST:
Index: libvirt-7.6.0/src/conf/domain_validate.c
===================================================================
--- libvirt-7.6.0.orig/src/conf/domain_validate.c
+++ libvirt-7.6.0/src/conf/domain_validate.c
@@ -470,7 +470,7 @@ virDomainDiskDefValidateSourceChainOne(c
}
}
- /* internal snapshots and config files are currently supported only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (src->snapshot) {
@@ -479,11 +479,15 @@ virDomainDiskDefValidateSourceChainOne(c
"only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
_("<config> element is currently supported "
- "only with 'rbd' disks"));
+ "only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
Index: libvirt-7.6.0/src/conf/storage_conf.c
===================================================================
--- libvirt-7.6.0.orig/src/conf/storage_conf.c
+++ libvirt-7.6.0/src/conf/storage_conf.c
@@ -60,7 +60,7 @@ VIR_ENUM_IMPL(virStoragePool,
"logical", "disk", "iscsi",
"iscsi-direct", "scsi", "mpath",
"rbd", "sheepdog", "gluster",
- "zfs", "vstorage",
+ "zfs", "vstorage", "vitastor",
);
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
@@ -246,6 +246,18 @@ static virStoragePoolTypeInfo poolTypeIn
.formatToString = virStorageFileFormatTypeToString,
}
},
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
+ .poolOptions = {
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
+ VIR_STORAGE_POOL_SOURCE_NAME),
+ },
+ .volOptions = {
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
+ .formatFromString = virStorageVolumeFormatFromString,
+ .formatToString = virStorageFileFormatTypeToString,
+ }
+ },
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
.poolOptions = {
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
@@ -546,6 +558,11 @@ virStoragePoolDefParseSource(xmlXPathCon
_("element 'name' is mandatory for RBD pool"));
return -1;
}
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+ _("element 'name' is mandatory for Vitastor pool"));
+ return -1;
+ }
if (options->formatFromString) {
g_autofree char *format = NULL;
@@ -1182,6 +1199,7 @@ virStoragePoolDefFormatBuf(virBuffer *bu
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
* files, so they don't have a target */
if (def->type != VIR_STORAGE_POOL_RBD &&
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
def->type != VIR_STORAGE_POOL_GLUSTER &&
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
Index: libvirt-7.6.0/src/conf/storage_conf.h
===================================================================
--- libvirt-7.6.0.orig/src/conf/storage_conf.h
+++ libvirt-7.6.0/src/conf/storage_conf.h
@@ -106,6 +106,7 @@ typedef enum {
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
VIR_STORAGE_POOL_ZFS, /* ZFS */
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
VIR_STORAGE_POOL_LAST,
} virStoragePoolType;
@@ -465,6 +466,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
Index: libvirt-7.6.0/src/conf/storage_source_conf.c
===================================================================
--- libvirt-7.6.0.orig/src/conf/storage_source_conf.c
+++ libvirt-7.6.0/src/conf/storage_source_conf.c
@@ -85,6 +85,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
"ssh",
"vxhs",
"nfs",
+ "vitastor",
);
@@ -1262,6 +1263,7 @@ virStorageSourceNetworkDefaultPort(virSt
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
return 24007;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_RBD:
/* we don't provide a default for RBD */
return 0;
Index: libvirt-7.6.0/src/conf/storage_source_conf.h
===================================================================
--- libvirt-7.6.0.orig/src/conf/storage_source_conf.h
+++ libvirt-7.6.0/src/conf/storage_source_conf.h
@@ -127,6 +127,7 @@ typedef enum {
VIR_STORAGE_NET_PROTOCOL_SSH,
VIR_STORAGE_NET_PROTOCOL_VXHS,
VIR_STORAGE_NET_PROTOCOL_NFS,
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
VIR_STORAGE_NET_PROTOCOL_LAST
} virStorageNetProtocol;
Index: libvirt-7.6.0/src/conf/virstorageobj.c
===================================================================
--- libvirt-7.6.0.orig/src/conf/virstorageobj.c
+++ libvirt-7.6.0/src/conf/virstorageobj.c
@@ -1481,6 +1481,7 @@ virStoragePoolObjSourceFindDuplicateCb(c
return 1;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_LAST:
break;
@@ -1980,6 +1981,8 @@ virStoragePoolObjMatch(virStoragePoolObj
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
Index: libvirt-7.6.0/src/libvirt-storage.c
===================================================================
--- libvirt-7.6.0.orig/src/libvirt-storage.c
+++ libvirt-7.6.0/src/libvirt-storage.c
@@ -92,6 +92,7 @@ virStoragePoolGetConnect(virStoragePoolP
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
Index: libvirt-7.6.0/src/libxl/libxl_conf.c
===================================================================
--- libvirt-7.6.0.orig/src/libxl/libxl_conf.c
+++ libvirt-7.6.0/src/libxl/libxl_conf.c
@@ -972,6 +972,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSou
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
Index: libvirt-7.6.0/src/libxl/xen_xl.c
===================================================================
--- libvirt-7.6.0.orig/src/libxl/xen_xl.c
+++ libvirt-7.6.0/src/libxl/xen_xl.c
@@ -1540,6 +1540,7 @@ xenFormatXLDiskSrcNet(virStorageSource *
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
Index: libvirt-7.6.0/src/qemu/qemu_block.c
===================================================================
--- libvirt-7.6.0.orig/src/qemu/qemu_block.c
+++ libvirt-7.6.0/src/qemu/qemu_block.c
@@ -916,6 +916,38 @@ qemuBlockStorageSourceGetRBDProps(virSto
static virJSONValue *
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
+{
+ virJSONValue *ret = NULL;
+ virStorageNetHostDef *host;
+ size_t i;
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
+ g_autofree char *etcd = NULL;
+
+ for (i = 0; i < src->nhosts; i++) {
+ host = src->hosts + i;
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
+ return NULL;
+ }
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
+ }
+ if (src->nhosts > 0) {
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectCreate(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
+
+ return ret;
+}
+
+
+static virJSONValue *
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
{
g_autoptr(virJSONValue) serverprops = NULL;
@@ -1205,6 +1237,12 @@ qemuBlockStorageSourceGetBackendProps(vi
return NULL;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
+ return NULL;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
@@ -2219,6 +2257,7 @@ qemuBlockGetBackingStoreString(virStorag
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
case VIR_STORAGE_NET_PROTOCOL_SSH:
@@ -2596,6 +2635,12 @@ qemuBlockStorageSourceCreateGetStoragePr
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
Index: libvirt-7.6.0/src/qemu/qemu_command.c
===================================================================
--- libvirt-7.6.0.orig/src/qemu/qemu_command.c
+++ libvirt-7.6.0/src/qemu/qemu_command.c
@@ -1074,6 +1074,43 @@ qemuBuildNetworkDriveStr(virStorageSourc
ret = virBufferContentAndReset(&buf);
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (strchr(src->path, ':')) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+ _("':' not allowed in Vitastor source volume name '%s'"),
+ src->path);
+ return NULL;
+ }
+
+ virBufferStrcat(&buf, "vitastor:image=", src->path, NULL);
+
+ if (src->nhosts > 0) {
+ virBufferAddLit(&buf, ":etcd-host=");
+ for (i = 0; i < src->nhosts; i++) {
+ if (i)
+ virBufferAddLit(&buf, ",");
+
+ /* assume host containing : is ipv6 */
+ if (strchr(src->hosts[i].name, ':'))
+ virBufferEscape(&buf, '\\', ":", "[%s]",
+ src->hosts[i].name);
+ else
+ virBufferAsprintf(&buf, "%s", src->hosts[i].name);
+
+ if (src->hosts[i].port)
+ virBufferAsprintf(&buf, "\\:%u", src->hosts[i].port);
+ }
+ }
+
+ if (src->configFile)
+ virBufferEscape(&buf, '\\', ":", ":config-path=%s", src->configFile);
+
+ if (src->query)
+ virBufferEscape(&buf, '\\', ":", ":etcd-prefix=%s", src->query);
+
+ ret = virBufferContentAndReset(&buf);
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_VXHS:
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("VxHS protocol does not support URI syntax"));
Index: libvirt-7.6.0/src/qemu/qemu_domain.c
===================================================================
--- libvirt-7.6.0.orig/src/qemu/qemu_domain.c
+++ libvirt-7.6.0/src/qemu/qemu_domain.c
@@ -4900,7 +4900,8 @@ qemuDomainValidateStorageSource(virStora
if (src->query &&
(actualType != VIR_STORAGE_TYPE_NETWORK ||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("query is supported only with HTTP(S) protocols"));
return -1;
@@ -10102,6 +10103,7 @@ qemuDomainPrepareStorageSourceTLS(virSto
break;
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
Index: libvirt-7.6.0/src/qemu/qemu_snapshot.c
===================================================================
--- libvirt-7.6.0.orig/src/qemu/qemu_snapshot.c
+++ libvirt-7.6.0/src/qemu/qemu_snapshot.c
@@ -402,6 +402,7 @@ qemuSnapshotPrepareDiskExternalInactive(
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
@@ -494,6 +495,7 @@ qemuSnapshotPrepareDiskExternalActive(vi
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
case VIR_STORAGE_NET_PROTOCOL_HTTP:
@@ -647,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomai
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
Index: libvirt-7.6.0/src/storage/storage_driver.c
===================================================================
--- libvirt-7.6.0.orig/src/storage/storage_driver.c
+++ libvirt-7.6.0/src/storage/storage_driver.c
@@ -1644,6 +1644,7 @@ storageVolLookupByPathCallback(virStorag
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_LAST:
Index: libvirt-7.6.0/src/storage_file/storage_source_backingstore.c
===================================================================
--- libvirt-7.6.0.orig/src/storage_file/storage_source_backingstore.c
+++ libvirt-7.6.0/src/storage_file/storage_source_backingstore.c
@@ -285,6 +285,75 @@ virStorageSourceParseRBDColonString(cons
static int
+virStorageSourceParseVitastorColonString(const char *colonstr,
+ virStorageSource *src)
+{
+ char *p, *e, *next;
+ g_autofree char *options = NULL;
+
+ /* optionally skip the "vitastor:" prefix if provided */
+ if (STRPREFIX(colonstr, "vitastor:"))
+ colonstr += strlen("vitastor:");
+
+ options = g_strdup(colonstr);
+
+ p = options;
+ while (*p) {
+ /* find : delimiter or end of string */
+ for (e = p; *e && *e != ':'; ++e) {
+ if (*e == '\\') {
+ e++;
+ if (*e == '\0')
+ break;
+ }
+ }
+ if (*e == '\0') {
+ next = e; /* last kv pair */
+ } else {
+ next = e + 1;
+ *e = '\0';
+ }
+
+ if (STRPREFIX(p, "image=")) {
+ src->path = g_strdup(p + strlen("image="));
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ src->query = g_strdup(p + strlen("etcd-prefix="));
+ } else if (STRPREFIX(p, "config-path=")) {
+ src->configFile = g_strdup(p + strlen("config-path="));
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
+ sep[1] == ';' ||
+ sep[1] == ' ')) {
+ *sep = '\0';
+ sep += 2;
+ break;
+ }
+ }
+
+ if (virStorageSourceRBDAddHost(src, h) < 0)
+ return -1;
+
+ h = sep;
+ }
+ }
+
+ p = next;
+ }
+
+ if (!src->path) {
+ return -1;
+ }
+
+ return 0;
+}
+
+
+static int
virStorageSourceParseNBDColonString(const char *nbdstr,
virStorageSource *src)
{
@@ -396,6 +465,11 @@ virStorageSourceParseBackingColon(virSto
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
@@ -985,6 +1059,54 @@ virStorageSourceParseBackingJSONRBD(virS
}
static int
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
+ virJSONValue *json,
+ const char *jsonstr G_GNUC_UNUSED,
+ int opaque G_GNUC_UNUSED)
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;
+
+ src->type = VIR_STORAGE_TYPE_NETWORK;
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
+
+ /* legacy syntax passed via 'filename' option */
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
+ return virStorageSourceParseVitastorColonString(filename, src);
+
+ if (!image) {
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
+ _("missing image name in Vitastor backing volume "
+ "JSON specification"));
+ return -1;
+ }
+
+ src->path = g_strdup(image);
+ src->configFile = g_strdup(conf);
+ src->query = g_strdup(etcd_prefix);
+
+ if (servers) {
+ nservers = virJSONValueArraySize(servers);
+
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
+ src->nhosts = nservers;
+
+ for (i = 0; i < nservers; i++) {
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
+ virJSONValueArrayGet(servers, i)) < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
virJSONValue *json,
const char *jsonstr,
@@ -1162,6 +1284,7 @@ static const struct virStorageSourceJSON
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
Index: libvirt-7.6.0/src/test/test_driver.c
===================================================================
--- libvirt-7.6.0.orig/src/test/test_driver.c
+++ libvirt-7.6.0/src/test/test_driver.c
@@ -7193,6 +7193,7 @@ testStorageVolumeTypeForPool(int pooltyp
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
return VIR_STORAGE_VOL_NETWORK;
case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK:
Index: libvirt-7.6.0/tests/storagepoolcapsschemadata/poolcaps-fs.xml
===================================================================
--- libvirt-7.6.0.orig/tests/storagepoolcapsschemadata/poolcaps-fs.xml
+++ libvirt-7.6.0/tests/storagepoolcapsschemadata/poolcaps-fs.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='no'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
Index: libvirt-7.6.0/tests/storagepoolcapsschemadata/poolcaps-full.xml
===================================================================
--- libvirt-7.6.0.orig/tests/storagepoolcapsschemadata/poolcaps-full.xml
+++ libvirt-7.6.0/tests/storagepoolcapsschemadata/poolcaps-full.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='yes'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
Index: libvirt-7.6.0/tests/storagepoolxml2argvtest.c
===================================================================
--- libvirt-7.6.0.orig/tests/storagepoolxml2argvtest.c
+++ libvirt-7.6.0/tests/storagepoolxml2argvtest.c
@@ -68,6 +68,7 @@ testCompareXMLToArgvFiles(bool shouldFai
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_VSTORAGE:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_LAST:
default:
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
Index: libvirt-7.6.0/tools/virsh-pool.c
===================================================================
--- libvirt-7.6.0.orig/tools/virsh-pool.c
+++ libvirt-7.6.0/tools/virsh-pool.c
@@ -1231,6 +1231,9 @@ cmdPoolList(vshControl *ctl, const vshCm
case VIR_STORAGE_POOL_VSTORAGE:
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
+ break;
case VIR_STORAGE_POOL_LAST:
break;
}

View File

@@ -15,10 +15,9 @@
<target dev='vda' bus='virtio' /> <target dev='vda' bus='virtio' />
<driver name='qemu' type='raw' /> <driver name='qemu' type='raw' />
<!-- name is Vitastor image name --> <!-- name is Vitastor image name -->
<!-- config (optional) is the path to Vitastor's configuration file -->
<!-- query (optional) is Vitastor's etcd_prefix --> <!-- query (optional) is Vitastor's etcd_prefix -->
<source protocol='vitastor' name='debian9' query='/vitastor'> <source protocol='vitastor' name='debian9' query='/vitastor' config='/etc/vitastor/vitastor.conf'>
<!-- config (optional) is the path to Vitastor's configuration file -->
<config file='/etc/vitastor/vitastor.conf' />
<!-- hosts = etcd addresses --> <!-- hosts = etcd addresses -->
<host name='192.168.7.2' port='2379' /> <host name='192.168.7.2' port='2379' />
</source> </source>

View File

@@ -1,5 +1,5 @@
diff --git a/nova/virt/image/model.py b/nova/virt/image/model.py diff --git a/nova/virt/image/model.py b/nova/virt/image/model.py
index 971f7e9c07..ec3fca72cb 100644 index 971f7e9c07..70ed70d5e2 100644
--- a/nova/virt/image/model.py --- a/nova/virt/image/model.py
+++ b/nova/virt/image/model.py +++ b/nova/virt/image/model.py
@@ -129,3 +129,22 @@ class RBDImage(Image): @@ -129,3 +129,22 @@ class RBDImage(Image):
@@ -19,7 +19,7 @@ index 971f7e9c07..ec3fca72cb 100644
+ :param etcd_prefix: etcd prefix (optional) + :param etcd_prefix: etcd prefix (optional)
+ :param config_path: path to the configuration (optional) + :param config_path: path to the configuration (optional)
+ """ + """
+ super(VitastorImage, self).__init__(FORMAT_RAW) + super(RBDImage, self).__init__(FORMAT_RAW)
+ +
+ self.name = name + self.name = name
+ self.etcd_address = etcd_address + self.etcd_address = etcd_address
@@ -48,7 +48,7 @@ index 5358f3766a..ebe3d6effb 100644
info = nova.privsep.qemu.privileged_qemu_img_info(path, format=format) info = nova.privsep.qemu.privileged_qemu_img_info(path, format=format)
diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py
index f9475776b3..a2e18aab67 100644 index f9475776b3..51573fe41d 100644
--- a/nova/virt/libvirt/config.py --- a/nova/virt/libvirt/config.py
+++ b/nova/virt/libvirt/config.py +++ b/nova/virt/libvirt/config.py
@@ -1060,6 +1060,8 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice): @@ -1060,6 +1060,8 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
@@ -60,19 +60,18 @@ index f9475776b3..a2e18aab67 100644
self.source_name = None self.source_name = None
self.source_hosts = [] self.source_hosts = []
self.source_ports = [] self.source_ports = []
@@ -1189,6 +1191,10 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice): @@ -1186,7 +1188,8 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
source = etree.Element("source", protocol=self.source_protocol) elif self.source_type == "mount":
dev.append(etree.Element("source", dir=self.source_path))
elif self.source_type == "network" and self.source_protocol:
- source = etree.Element("source", protocol=self.source_protocol)
+ source = etree.Element("source", protocol=self.source_protocol,
+ query=self.source_query, config=self.source_config)
if self.source_name is not None: if self.source_name is not None:
source.set('name', self.source_name) source.set('name', self.source_name)
+ if self.source_query is not None:
+ source.set('query', self.source_query)
+ if self.source_config is not None:
+ source.append(etree.Element('config', file=self.source_config))
hosts_info = zip(self.source_hosts, self.source_ports) hosts_info = zip(self.source_hosts, self.source_ports)
for name, port in hosts_info:
host = etree.Element('host', name=name)
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 391231c527..f38faa1608 100644 index 391231c527..34dc60dcdd 100644
--- a/nova/virt/libvirt/driver.py --- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py
@@ -179,6 +179,7 @@ VOLUME_DRIVERS = { @@ -179,6 +179,7 @@ VOLUME_DRIVERS = {

View File

@@ -1,288 +0,0 @@
diff --git a/nova/virt/image/model.py b/nova/virt/image/model.py
index 971f7e9c07..ec3fca72cb 100644
--- a/nova/virt/image/model.py
+++ b/nova/virt/image/model.py
@@ -129,3 +129,22 @@ class RBDImage(Image):
self.user = user
self.password = password
self.servers = servers
+
+
+class VitastorImage(Image):
+ """Class for images in a remote Vitastor cluster"""
+
+ def __init__(self, name, etcd_address = None, etcd_prefix = None, config_path = None):
+ """Create a new Vitastor image object
+
+ :param name: name of the image
+ :param etcd_address: etcd URL(s) (optional)
+ :param etcd_prefix: etcd prefix (optional)
+ :param config_path: path to the configuration (optional)
+ """
+ super(VitastorImage, self).__init__(FORMAT_RAW)
+
+ self.name = name
+ self.etcd_address = etcd_address
+ self.etcd_prefix = etcd_prefix
+ self.config_path = config_path
diff --git a/nova/virt/images.py b/nova/virt/images.py
index 5358f3766a..ebe3d6effb 100644
--- a/nova/virt/images.py
+++ b/nova/virt/images.py
@@ -41,7 +41,7 @@ IMAGE_API = glance.API()
def qemu_img_info(path, format=None):
"""Return an object containing the parsed output from qemu-img info."""
- if not os.path.exists(path) and not path.startswith('rbd:'):
+ if not os.path.exists(path) and not path.startswith('rbd:') and not path.startswith('vitastor:'):
raise exception.DiskNotFound(location=path)
info = nova.privsep.qemu.unprivileged_qemu_img_info(path, format=format)
@@ -50,7 +50,7 @@ def qemu_img_info(path, format=None):
def privileged_qemu_img_info(path, format=None, output_format='json'):
"""Return an object containing the parsed output from qemu-img info."""
- if not os.path.exists(path) and not path.startswith('rbd:'):
+ if not os.path.exists(path) and not path.startswith('rbd:') and not path.startswith('vitastor:'):
raise exception.DiskNotFound(location=path)
info = nova.privsep.qemu.privileged_qemu_img_info(path, format=format)
diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py
index ea525648b3..d7aa798954 100644
--- a/nova/virt/libvirt/config.py
+++ b/nova/virt/libvirt/config.py
@@ -1005,6 +1005,8 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
self.driver_iommu = False
self.source_path = None
self.source_protocol = None
+ self.source_query = None
+ self.source_config = None
self.source_name = None
self.source_hosts = []
self.source_ports = []
@@ -1133,6 +1135,10 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
source = etree.Element("source", protocol=self.source_protocol)
if self.source_name is not None:
source.set('name', self.source_name)
+ if self.source_query is not None:
+ source.set('query', self.source_query)
+ if self.source_config is not None:
+ source.append(etree.Element('config', file=self.source_config))
hosts_info = zip(self.source_hosts, self.source_ports)
for name, port in hosts_info:
host = etree.Element('host', name=name)
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index fbd033690a..74dc59ce87 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -180,6 +180,7 @@ libvirt_volume_drivers = [
'local=nova.virt.libvirt.volume.volume.LibvirtVolumeDriver',
'fake=nova.virt.libvirt.volume.volume.LibvirtFakeVolumeDriver',
'rbd=nova.virt.libvirt.volume.net.LibvirtNetVolumeDriver',
+ 'vitastor=nova.virt.libvirt.volume.vitastor.LibvirtVitastorVolumeDriver',
'nfs=nova.virt.libvirt.volume.nfs.LibvirtNFSVolumeDriver',
'smbfs=nova.virt.libvirt.volume.smbfs.LibvirtSMBFSVolumeDriver',
'fibre_channel='
@@ -287,10 +288,10 @@ class LibvirtDriver(driver.ComputeDriver):
# This prevents the risk of one test setting a capability
# which bleeds over into other tests.
- # LVM and RBD require raw images. If we are not configured to
+ # LVM, RBD, Vitastor require raw images. If we are not configured to
# force convert images into raw format, then we _require_ raw
# images only.
- raw_only = ('rbd', 'lvm')
+ raw_only = ('rbd', 'lvm', 'vitastor')
requires_raw_image = (CONF.libvirt.images_type in raw_only and
not CONF.force_raw_images)
requires_ploop_image = CONF.libvirt.virt_type == 'parallels'
@@ -703,12 +704,12 @@ class LibvirtDriver(driver.ComputeDriver):
# Some imagebackends are only able to import raw disk images,
# and will fail if given any other format. See the bug
# https://bugs.launchpad.net/nova/+bug/1816686 for more details.
- if CONF.libvirt.images_type in ('rbd',):
+ if CONF.libvirt.images_type in ('rbd', 'vitastor'):
if not CONF.force_raw_images:
msg = _("'[DEFAULT]/force_raw_images = False' is not "
- "allowed with '[libvirt]/images_type = rbd'. "
+ "allowed with '[libvirt]/images_type = rbd' or 'vitastor'. "
"Please check the two configs and if you really "
- "do want to use rbd as images_type, set "
+ "do want to use rbd or vitastor as images_type, set "
"force_raw_images to True.")
raise exception.InvalidConfiguration(msg)
@@ -2165,6 +2166,16 @@ class LibvirtDriver(driver.ComputeDriver):
if connection_info['data'].get('auth_enabled'):
username = connection_info['data']['auth_username']
path = f"rbd:{volume_name}:id={username}"
+ elif connection_info['driver_volume_type'] == 'vitastor':
+ volume_name = connection_info['data']['name']
+ path = 'vitastor:image='+volume_name.replace(':', '\\:')
+ for k in [ 'config_path', 'etcd_address', 'etcd_prefix' ]:
+ if k in connection_info['data']:
+ kk = k
+ if kk == 'etcd_address':
+ # FIXME use etcd_address in qemu driver
+ kk = 'etcd_host'
+ path += ":"+kk.replace('_', '-')+"="+connection_info['data'][k].replace(':', '\\:')
else:
path = 'unknown'
raise exception.DiskNotFound(location='unknown')
@@ -2440,8 +2451,8 @@ class LibvirtDriver(driver.ComputeDriver):
image_format = CONF.libvirt.snapshot_image_format or source_type
- # NOTE(bfilippov): save lvm and rbd as raw
- if image_format == 'lvm' or image_format == 'rbd':
+ # NOTE(bfilippov): save lvm and rbd and vitastor as raw
+ if image_format == 'lvm' or image_format == 'rbd' or image_format == 'vitastor':
image_format = 'raw'
metadata = self._create_snapshot_metadata(instance.image_meta,
@@ -2512,7 +2523,7 @@ class LibvirtDriver(driver.ComputeDriver):
expected_state=task_states.IMAGE_UPLOADING)
# TODO(nic): possibly abstract this out to the root_disk
- if source_type == 'rbd' and live_snapshot:
+ if (source_type == 'rbd' or source_type == 'vitastor') and live_snapshot:
# Standard snapshot uses qemu-img convert from RBD which is
# not safe to run with live_snapshot.
live_snapshot = False
@@ -3715,7 +3726,7 @@ class LibvirtDriver(driver.ComputeDriver):
# cleanup rescue volume
lvm.remove_volumes([lvmdisk for lvmdisk in self._lvm_disks(instance)
if lvmdisk.endswith('.rescue')])
- if CONF.libvirt.images_type == 'rbd':
+ if CONF.libvirt.images_type == 'rbd' or CONF.libvirt.images_type == 'vitastor':
filter_fn = lambda disk: (disk.startswith(instance.uuid) and
disk.endswith('.rescue'))
rbd_utils.RBDDriver().cleanup_volumes(filter_fn)
@@ -3972,6 +3983,8 @@ class LibvirtDriver(driver.ComputeDriver):
# TODO(mikal): there is a bug here if images_type has
# changed since creation of the instance, but I am pretty
# sure that this bug already exists.
+ if CONF.libvirt.images_type == 'vitastor':
+ return 'vitastor'
return 'rbd' if CONF.libvirt.images_type == 'rbd' else 'raw'
@staticmethod
@@ -4370,10 +4383,10 @@ class LibvirtDriver(driver.ComputeDriver):
finally:
# NOTE(mikal): if the config drive was imported into RBD,
# then we no longer need the local copy
- if CONF.libvirt.images_type == 'rbd':
+ if CONF.libvirt.images_type == 'rbd' or CONF.libvirt.images_type == 'vitastor':
LOG.info('Deleting local config drive %(path)s '
- 'because it was imported into RBD.',
- {'path': config_disk_local_path},
+ 'because it was imported into %(type).',
+ {'path': config_disk_local_path, 'type': CONF.libvirt.images_type},
instance=instance)
os.unlink(config_disk_local_path)
diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py
index c1dc34daf4..263965912f 100644
--- a/nova/virt/libvirt/utils.py
+++ b/nova/virt/libvirt/utils.py
@@ -399,6 +399,10 @@ def find_disk(guest: libvirt_guest.Guest) -> ty.Tuple[str, ty.Optional[str]]:
disk_path = disk.source_name
if disk_path:
disk_path = 'rbd:' + disk_path
+ elif not disk_path and disk.source_protocol == 'vitastor':
+ disk_path = disk.source_name
+ if disk_path:
+ disk_path = 'vitastor:' + disk_path
if not disk_path:
raise RuntimeError(_("Can't retrieve root device path "
@@ -417,6 +421,8 @@ def get_disk_type_from_path(path: str) -> ty.Optional[str]:
return 'lvm'
elif path.startswith('rbd:'):
return 'rbd'
+ elif path.startswith('vitastor:'):
+ return 'vitastor'
elif (os.path.isdir(path) and
os.path.exists(os.path.join(path, "DiskDescriptor.xml"))):
return 'ploop'
diff --git a/nova/virt/libvirt/volume/vitastor.py b/nova/virt/libvirt/volume/vitastor.py
new file mode 100644
index 0000000000..0256df62c1
--- /dev/null
+++ b/nova/virt/libvirt/volume/vitastor.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2021+, Vitaliy Filippov <vitalif@yourcmc.ru>
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from os_brick import exception as os_brick_exception
+from os_brick import initiator
+from os_brick.initiator import connector
+from oslo_log import log as logging
+
+import nova.conf
+from nova import utils
+from nova.virt.libvirt.volume import volume as libvirt_volume
+
+
+CONF = nova.conf.CONF
+LOG = logging.getLogger(__name__)
+
+
+class LibvirtVitastorVolumeDriver(libvirt_volume.LibvirtBaseVolumeDriver):
+ """Driver to attach Vitastor volumes to libvirt."""
+ def __init__(self, host):
+ super(LibvirtVitastorVolumeDriver, self).__init__(host, is_block_dev=False)
+
+ def connect_volume(self, connection_info, instance):
+ pass
+
+ def disconnect_volume(self, connection_info, instance):
+ pass
+
+ def get_config(self, connection_info, disk_info):
+ """Returns xml for libvirt."""
+ conf = super(LibvirtVitastorVolumeDriver, self).get_config(connection_info, disk_info)
+ conf.source_type = 'network'
+ conf.source_protocol = 'vitastor'
+ conf.source_name = connection_info['data'].get('name')
+ conf.source_query = connection_info['data'].get('etcd_prefix') or None
+ conf.source_config = connection_info['data'].get('config_path') or None
+ conf.source_hosts = []
+ conf.source_ports = []
+ addresses = connection_info['data'].get('etcd_address', '')
+ if addresses:
+ if not isinstance(addresses, list):
+ addresses = addresses.split(',')
+ for addr in addresses:
+ if addr.startswith('https://'):
+ raise NotImplementedError('Vitastor block driver does not support SSL for etcd communication yet')
+ if addr.startswith('http://'):
+ addr = addr[7:]
+ addr = addr.rstrip('/')
+ if addr.endswith('/v3'):
+ addr = addr[0:-3]
+ p = addr.find('/')
+ if p > 0:
+ raise NotImplementedError('libvirt does not support custom URL paths for Vitastor etcd yet. Use /etc/vitastor/vitastor.conf')
+ p = addr.find(':')
+ port = '2379'
+ if p > 0:
+ port = addr[p+1:]
+ addr = addr[0:p]
+ conf.source_hosts.append(addr)
+ conf.source_ports.append(port)
+ return conf
+
+ def extend_volume(self, connection_info, instance, requested_size):
+ raise NotImplementedError

View File

@@ -1,175 +0,0 @@
Index: pve-qemu-kvm-5.1.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-5.1.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-5.1.0/qapi/block-core.json
@@ -3041,7 +3041,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'sheepdog', 'pbs',
+ 'sheepdog', 'pbs', 'vitastor',
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
##
@@ -3889,6 +3889,28 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4234,6 +4256,7 @@
'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4623,6 +4646,17 @@
'*cluster-size' : 'size' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4884,6 +4918,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: pve-qemu-kvm-5.1.0/configure
===================================================================
--- pve-qemu-kvm-5.1.0.orig/configure
+++ pve-qemu-kvm-5.1.0/configure
@@ -446,6 +446,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
libusb=""
usb_redir=""
@@ -1383,6 +1384,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1901,6 +1906,7 @@ disabled with --disable-FEATURE, default
vhost-vdpa vhost-vdpa kernel backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -4234,6 +4240,27 @@ EOF
fi
##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
+##########################################
# libssh probe
if test "$libssh" != "no" ; then
if $pkg_config --exists libssh; then
@@ -6969,6 +6996,7 @@ echo "Trace output file $trace_file-<pid
fi
echo "spice support $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
echo "rbd support $rbd"
+echo "vitastor support $vitastor"
echo "xfsctl support $xfs"
echo "smartcard support $smartcard"
echo "libusb $libusb"
@@ -7644,6 +7672,10 @@ if test "$rbd" = "yes" ; then
echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=y" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
Index: pve-qemu-kvm-5.1.0/block/Makefile.objs
===================================================================
--- pve-qemu-kvm-5.1.0.orig/block/Makefile.objs
+++ pve-qemu-kvm-5.1.0/block/Makefile.objs
@@ -32,6 +32,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) +
block-obj-$(CONFIG_LIBNFS) += nfs.o
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
block-obj-$(CONFIG_LIBSSH) += ssh.o
block-obj-y += backup-dump.o
@@ -61,6 +62,8 @@ curl.o-cflags := $(CURL_CFLAGS)
curl.o-libs := $(CURL_LIBS)
rbd.o-cflags := $(RBD_CFLAGS)
rbd.o-libs := $(RBD_LIBS)
+vitastor.o-cflags := $(VITASTOR_CFLAGS)
+vitastor.o-libs := $(VITASTOR_LIBS)
gluster.o-cflags := $(GLUSTERFS_CFLAGS)
gluster.o-libs := $(GLUSTERFS_LIBS)
ssh.o-cflags := $(LIBSSH_CFLAGS)

View File

@@ -1,181 +0,0 @@
Index: pve-qemu-kvm-5.2.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-5.2.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-5.2.0/qapi/block-core.json
@@ -3076,7 +3076,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'sheepdog', 'pbs',
+ 'sheepdog', 'pbs', 'vitastor',
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
##
@@ -3924,6 +3924,28 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4272,6 +4294,7 @@
'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4662,6 +4685,17 @@
'*cluster-size' : 'size' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4923,6 +4957,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: pve-qemu-kvm-5.2.0/block/meson.build
===================================================================
--- pve-qemu-kvm-5.2.0.orig/block/meson.build
+++ pve-qemu-kvm-5.2.0/block/meson.build
@@ -89,6 +89,7 @@ foreach m : [
['CONFIG_LIBNFS', 'nfs', libnfs, 'nfs.c'],
['CONFIG_LIBSSH', 'ssh', libssh, 'ssh.c'],
['CONFIG_RBD', 'rbd', rbd, 'rbd.c'],
+ ['CONFIG_VITASTOR', 'vitastor', vitastor, 'vitastor.c'],
]
if config_host.has_key(m[0])
if enable_modules
Index: pve-qemu-kvm-5.2.0/configure
===================================================================
--- pve-qemu-kvm-5.2.0.orig/configure
+++ pve-qemu-kvm-5.2.0/configure
@@ -372,6 +372,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
u2f="auto"
libusb=""
@@ -1264,6 +1265,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1807,6 +1812,7 @@ disabled with --disable-FEATURE, default
vhost-vdpa vhost-vdpa kernel backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3700,6 +3706,27 @@ EOF
fi
##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
+##########################################
# libssh probe
if test "$libssh" != "no" ; then
if $pkg_config --exists libssh; then
@@ -6437,6 +6464,10 @@ if test "$rbd" = "yes" ; then
echo "CONFIG_RBD=y" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=y" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
Index: pve-qemu-kvm-5.2.0/meson.build
===================================================================
--- pve-qemu-kvm-5.2.0.orig/meson.build
+++ pve-qemu-kvm-5.2.0/meson.build
@@ -596,6 +596,10 @@ rbd = not_found
if 'CONFIG_RBD' in config_host
rbd = declare_dependency(link_args: config_host['RBD_LIBS'].split())
endif
+vitastor = not_found
+if 'CONFIG_VITASTOR' in config_host
+ vitastor = declare_dependency(link_args: config_host['VITASTOR_LIBS'].split())
+endif
glusterfs = not_found
if 'CONFIG_GLUSTERFS' in config_host
glusterfs = declare_dependency(compile_args: config_host['GLUSTERFS_CFLAGS'].split(),
@@ -2151,6 +2155,7 @@ endif
# TODO: add back protocol and server version
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
summary_info += {'rbd support': config_host.has_key('CONFIG_RBD')}
+summary_info += {'vitastor support': config_host.has_key('CONFIG_VITASTOR')}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': config_host.has_key('CONFIG_SMARTCARD')}
summary_info += {'U2F support': u2f.found()}

View File

@@ -1,188 +0,0 @@
Index: pve-qemu-kvm-6.1.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-6.1.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-6.1.0/qapi/block-core.json
@@ -3084,7 +3084,7 @@
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -4020,6 +4020,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4392,6 +4414,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4782,6 +4805,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4977,6 +5011,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-6.1.0/block/meson.build
===================================================================
--- pve-qemu-kvm-6.1.0.orig/block/meson.build
+++ pve-qemu-kvm-6.1.0/block/meson.build
@@ -91,6 +91,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-6.1.0/configure
===================================================================
--- pve-qemu-kvm-6.1.0.orig/configure
+++ pve-qemu-kvm-6.1.0/configure
@@ -375,6 +375,7 @@ trace_file="trace"
spice="$default_feature"
spice_protocol="auto"
rbd="auto"
+vitastor="auto"
smartcard="auto"
u2f="auto"
libusb="auto"
@@ -1293,6 +1294,10 @@ for opt do
;;
--enable-rbd) rbd="enabled"
;;
+ --disable-vitastor) vitastor="disabled"
+ ;;
+ --enable-vitastor) vitastor="enabled"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1921,6 +1926,7 @@ disabled with --disable-FEATURE, default
spice spice
spice-protocol spice-protocol
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -5211,7 +5217,7 @@ if test "$skip_meson" = no; then
-Dcapstone=$capstone -Dslirp=$slirp -Dfdt=$fdt -Dbrlapi=$brlapi \
-Dcurl=$curl -Dglusterfs=$glusterfs -Dbzip2=$bzip2 -Dlibiscsi=$libiscsi \
-Dlibnfs=$libnfs -Diconv=$iconv -Dcurses=$curses -Dlibudev=$libudev\
- -Drbd=$rbd -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse -Dlibxml2=$libxml2 \
+ -Drbd=$rbd -Dvitastor=$vitastor -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse -Dlibxml2=$libxml2 \
-Dlibdaxctl=$libdaxctl -Dlibpmem=$libpmem -Dlinux_io_uring=$linux_io_uring \
-Dgnutls=$gnutls -Dnettle=$nettle -Dgcrypt=$gcrypt -Dauth_pam=$auth_pam \
-Dzstd=$zstd -Dseccomp=$seccomp -Dvirtfs=$virtfs -Dcap_ng=$cap_ng \
Index: pve-qemu-kvm-6.1.0/meson.build
===================================================================
--- pve-qemu-kvm-6.1.0.orig/meson.build
+++ pve-qemu-kvm-6.1.0/meson.build
@@ -729,6 +729,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1268,6 +1288,7 @@ config_host_data.set('CONFIG_LIBNFS', li
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
@@ -3087,6 +3108,7 @@ summary_info += {'bpf support': libbpf.f
# TODO: add back protocol and server version
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
summary_info += {'rbd support': rbd.found()}
+summary_info += {'vitastor support': vitastor.found()}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': cacard.found()}
summary_info += {'U2F support': u2f.found()}
Index: pve-qemu-kvm-6.1.0/meson_options.txt
===================================================================
--- pve-qemu-kvm-6.1.0.orig/meson_options.txt
+++ pve-qemu-kvm-6.1.0/meson_options.txt
@@ -102,6 +102,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('gtk', type : 'feature', value : 'auto',
description: 'GTK+ user interface')
option('sdl', type : 'feature', value : 'auto',

View File

@@ -1,107 +1,7 @@
diff --git a/block/Makefile.objs b/block/Makefile.objs Index: qemu-3.1+dfsg/qapi/block-core.json
index 46d585cfd0..62222f25fe 100644 ===================================================================
--- a/block/Makefile.objs --- qemu-3.1+dfsg.orig/qapi/block-core.json
+++ b/block/Makefile.objs +++ qemu-3.1+dfsg/qapi/block-core.json
@@ -29,6 +29,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
block-obj-$(CONFIG_LIBNFS) += nfs.o
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
block-obj-$(CONFIG_VXHS) += vxhs.o
block-obj-$(CONFIG_LIBSSH2) += ssh.o
@@ -49,6 +50,8 @@ curl.o-cflags := $(CURL_CFLAGS)
curl.o-libs := $(CURL_LIBS)
rbd.o-cflags := $(RBD_CFLAGS)
rbd.o-libs := $(RBD_LIBS)
+vitastor.o-cflags := $(VITASTOR_CFLAGS)
+vitastor.o-libs := $(VITASTOR_LIBS)
gluster.o-cflags := $(GLUSTERFS_CFLAGS)
gluster.o-libs := $(GLUSTERFS_LIBS)
vxhs.o-libs := $(VXHS_LIBS)
diff --git a/configure b/configure
index 1c9f6090e8..25ef89c33a 100755
--- a/configure
+++ b/configure
@@ -422,6 +422,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
libusb=""
usb_redir=""
@@ -1282,6 +1283,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1737,6 +1742,7 @@ disabled with --disable-FEATURE, default is enabled if available:
vhost-crypto vhost-crypto acceleration support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3722,6 +3728,27 @@ EOF
fi
fi
+##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
##########################################
# libssh2 probe
min_libssh2_version=1.2.8
@@ -6109,6 +6136,7 @@ echo "Trace output file $trace_file-<pid>"
fi
echo "spice support $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
echo "rbd support $rbd"
+echo "vitastor support $vitastor"
echo "xfsctl support $xfs"
echo "smartcard support $smartcard"
echo "libusb $libusb"
@@ -6694,6 +6722,11 @@ if test "$rbd" = "yes" ; then
echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=m" >> $config_host_mak
+ echo "VITASTOR_CFLAGS=$vitastor_cflags" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
diff --git a/qapi/block-core.json b/qapi/block-core.json
index d4fe710836..dbad3327b3 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2617,7 +2617,7 @@ @@ -2617,7 +2617,7 @@
## ##
{ 'enum': 'BlockdevDriver', { 'enum': 'BlockdevDriver',
@@ -111,11 +11,10 @@ index d4fe710836..dbad3327b3 100644
'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks',
'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'replication', 'sheepdog',
@@ -3366,6 +3366,28 @@ @@ -3367,6 +3367,28 @@
'*snap-id': 'uint32',
'*tag': 'str' } } '*tag': 'str' } }
+## ##
+# @BlockdevOptionsVitastor: +# @BlockdevOptionsVitastor:
+# +#
+# Driver specific block device options for vitastor +# Driver specific block device options for vitastor
@@ -137,10 +36,11 @@ index d4fe710836..dbad3327b3 100644
+ '*etcd-host': 'str', + '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } } + '*etcd-prefix': 'str' } }
+ +
## +##
# @ReplicationMode: # @ReplicationMode:
# #
@@ -3713,6 +3735,7 @@ # An enumeration of replication modes.
@@ -3713,6 +3731,7 @@
'rbd': 'BlockdevOptionsRbd', 'rbd': 'BlockdevOptionsRbd',
'replication':'BlockdevOptionsReplication', 'replication':'BlockdevOptionsReplication',
'sheepdog': 'BlockdevOptionsSheepdog', 'sheepdog': 'BlockdevOptionsSheepdog',
@@ -148,11 +48,10 @@ index d4fe710836..dbad3327b3 100644
'ssh': 'BlockdevOptionsSsh', 'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle', 'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat', 'vdi': 'BlockdevOptionsGenericFormat',
@@ -4157,6 +4180,17 @@ @@ -4158,6 +4177,17 @@
'*subformat': 'BlockdevVhdxSubformat',
'*block-state-zero': 'bool' } } '*block-state-zero': 'bool' } }
+## ##
+# @BlockdevCreateOptionsVitastor: +# @BlockdevCreateOptionsVitastor:
+# +#
+# Driver specific image creation options for Vitastor. +# Driver specific image creation options for Vitastor.
@@ -163,10 +62,11 @@ index d4fe710836..dbad3327b3 100644
+ 'data': { 'location': 'BlockdevOptionsVitastor', + 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } } + 'size': 'size' } }
+ +
## +##
# @BlockdevVpcSubformat: # @BlockdevVpcSubformat:
# #
@@ -4212,6 +4246,7 @@ # @dynamic: Growing image file
@@ -4212,6 +4242,7 @@
'qed': 'BlockdevCreateOptionsQed', 'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd', 'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog', 'sheepdog': 'BlockdevCreateOptionsSheepdog',
@@ -174,3 +74,15 @@ index d4fe710836..dbad3327b3 100644
'ssh': 'BlockdevCreateOptionsSsh', 'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi', 'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx', 'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu-3.1+dfsg/scripts/modules/module_block.py
===================================================================
--- qemu-3.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-3.1+dfsg/scripts/modules/module_block.py
@@ -88,6 +88,7 @@ def print_bottom(fheader):
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

View File

@@ -1,105 +1,8 @@
diff -NaurpbB qemu-4.2.0/block/Makefile.objs qemu-4.2.0-vitastor/block/Makefile.objs Index: qemu/qapi/block-core.json
--- qemu-4.2.0/block/Makefile.objs 2019-12-12 18:20:47.000000000 +0000 ===================================================================
+++ qemu-4.2.0-vitastor/block/Makefile.objs 2021-12-01 21:28:47.342341760 +0000 --- qemu.orig/qapi/block-core.json 2020-11-07 22:57:38.932613674 +0000
@@ -29,6 +29,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) + +++ qemu.orig/qapi/block-core.json 2020-11-07 22:59:49.890722862 +0000
block-obj-$(CONFIG_LIBNFS) += nfs.o @@ -2907,7 +2907,7 @@
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
block-obj-$(CONFIG_VXHS) += vxhs.o
block-obj-$(CONFIG_LIBSSH) += ssh.o
@@ -53,6 +54,8 @@ curl.o-cflags := $(CURL_CFLAGS)
curl.o-libs := $(CURL_LIBS)
rbd.o-cflags := $(RBD_CFLAGS)
rbd.o-libs := $(RBD_LIBS)
+vitastor.o-cflags := $(VITASTOR_CFLAGS)
+vitastor.o-libs := $(VITASTOR_LIBS)
gluster.o-cflags := $(GLUSTERFS_CFLAGS)
gluster.o-libs := $(GLUSTERFS_LIBS)
vxhs.o-libs := $(VXHS_LIBS)
diff -NaurpbB qemu-4.2.0/configure qemu-4.2.0-vitastor/configure
--- qemu-4.2.0/configure 2019-12-12 18:20:47.000000000 +0000
+++ qemu-4.2.0-vitastor/configure 2021-12-01 21:30:24.605237562 +0000
@@ -435,6 +435,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
libusb=""
usb_redir=""
@@ -1312,6 +1313,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1782,6 +1787,7 @@ disabled with --disable-FEATURE, default
vhost-user vhost-user backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3980,6 +3986,27 @@ EOF
fi
##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
+##########################################
# libssh probe
if test "$libssh" != "no" ; then
if $pkg_config --exists libssh; then
@@ -6549,6 +6576,7 @@ echo "Trace output file $trace_file-<pid
fi
echo "spice support $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
echo "rbd support $rbd"
+echo "vitastor support $vitastor"
echo "xfsctl support $xfs"
echo "smartcard support $smartcard"
echo "libusb $libusb"
@@ -7182,6 +7210,11 @@ if test "$rbd" = "yes" ; then
echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=m" >> $config_host_mak
+ echo "VITASTOR_CFLAGS=$vitastor_cflags" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
diff -NaurpbB qemu-4.2.0/qapi/block-core.json qemu-4.2.0-vitastor/qapi/block-core.json
--- qemu-4.2.0/qapi/block-core.json 2019-12-12 18:20:48.000000000 +0000
+++ qemu-4.2.0-vitastor/qapi/block-core.json 2021-12-01 21:27:49.213574396 +0000
@@ -2894,7 +2894,7 @@
'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' }, { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
@@ -108,7 +11,7 @@ diff -NaurpbB qemu-4.2.0/qapi/block-core.json qemu-4.2.0-vitastor/qapi/block-cor
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] } 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
## ##
@@ -3712,6 +3712,28 @@ @@ -3725,6 +3725,28 @@
'*tag': 'str' } } '*tag': 'str' } }
## ##
@@ -137,7 +40,7 @@ diff -NaurpbB qemu-4.2.0/qapi/block-core.json qemu-4.2.0-vitastor/qapi/block-cor
# @ReplicationMode: # @ReplicationMode:
# #
# An enumeration of replication modes. # An enumeration of replication modes.
@@ -4071,6 +4093,7 @@ @@ -4084,6 +4102,7 @@
'replication': { 'type': 'BlockdevOptionsReplication', 'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' }, 'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog', 'sheepdog': 'BlockdevOptionsSheepdog',
@@ -145,7 +48,7 @@ diff -NaurpbB qemu-4.2.0/qapi/block-core.json qemu-4.2.0-vitastor/qapi/block-cor
'ssh': 'BlockdevOptionsSsh', 'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle', 'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat', 'vdi': 'BlockdevOptionsGenericFormat',
@@ -4441,6 +4464,17 @@ @@ -4461,6 +4480,17 @@
'*cluster-size' : 'size' } } '*cluster-size' : 'size' } }
## ##
@@ -163,7 +66,7 @@ diff -NaurpbB qemu-4.2.0/qapi/block-core.json qemu-4.2.0-vitastor/qapi/block-cor
# @BlockdevVmdkSubformat: # @BlockdevVmdkSubformat:
# #
# Subformat options for VMDK images # Subformat options for VMDK images
@@ -4702,6 +4736,7 @@ @@ -4722,6 +4752,7 @@
'qed': 'BlockdevCreateOptionsQed', 'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd', 'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog', 'sheepdog': 'BlockdevCreateOptionsSheepdog',
@@ -171,3 +74,15 @@ diff -NaurpbB qemu-4.2.0/qapi/block-core.json qemu-4.2.0-vitastor/qapi/block-cor
'ssh': 'BlockdevCreateOptionsSsh', 'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi', 'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx', 'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu/scripts/modules/module_block.py
===================================================================
--- qemu.orig/scripts/modules/module_block.py 2020-11-07 22:57:38.936613739 +0000
+++ qemu/scripts/modules/module_block.py 2020-11-07 22:59:49.890722862 +0000
@@ -86,6 +86,7 @@ def print_bottom(fheader):
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

View File

@@ -1,107 +1,7 @@
diff --git a/block/Makefile.objs b/block/Makefile.objs Index: qemu/qapi/block-core.json
index 3635b6b4c1..6cdf6df6ff 100644 ===================================================================
--- a/block/Makefile.objs --- qemu.orig/qapi/block-core.json
+++ b/block/Makefile.objs +++ qemu/qapi/block-core.json
@@ -30,6 +30,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
block-obj-$(CONFIG_LIBNFS) += nfs.o
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
block-obj-$(CONFIG_VXHS) += vxhs.o
block-obj-$(CONFIG_LIBSSH) += ssh.o
@@ -58,6 +59,8 @@ curl.o-cflags := $(CURL_CFLAGS)
curl.o-libs := $(CURL_LIBS)
rbd.o-cflags := $(RBD_CFLAGS)
rbd.o-libs := $(RBD_LIBS)
+vitastor.o-cflags := $(VITASTOR_CFLAGS)
+vitastor.o-libs := $(VITASTOR_LIBS)
gluster.o-cflags := $(GLUSTERFS_CFLAGS)
gluster.o-libs := $(GLUSTERFS_LIBS)
vxhs.o-libs := $(VXHS_LIBS)
diff --git a/configure b/configure
index 23b5e93752..7400cb9546 100755
--- a/configure
+++ b/configure
@@ -438,6 +438,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
libusb=""
usb_redir=""
@@ -1355,6 +1356,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1848,6 +1853,7 @@ disabled with --disable-FEATURE, default is enabled if available:
vhost-user vhost-user backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -4088,6 +4094,27 @@ EOF
fi
fi
+##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
##########################################
# libssh probe
if test "$libssh" != "no" ; then
@@ -6679,6 +6706,7 @@ echo "Trace output file $trace_file-<pid>"
fi
echo "spice support $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
echo "rbd support $rbd"
+echo "vitastor support $vitastor"
echo "xfsctl support $xfs"
echo "smartcard support $smartcard"
echo "libusb $libusb"
@@ -7329,6 +7357,11 @@ if test "$rbd" = "yes" ; then
echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=m" >> $config_host_mak
+ echo "VITASTOR_CFLAGS=$vitastor_cflags" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 943df1926a..c4f23230a3 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2798,7 +2798,7 @@ @@ -2798,7 +2798,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
@@ -111,11 +11,10 @@ index 943df1926a..c4f23230a3 100644
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] } 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
## ##
@@ -3634,6 +3634,28 @@ @@ -3635,6 +3635,28 @@
'*snap-id': 'uint32',
'*tag': 'str' } } '*tag': 'str' } }
+## ##
+# @BlockdevOptionsVitastor: +# @BlockdevOptionsVitastor:
+# +#
+# Driver specific block device options for vitastor +# Driver specific block device options for vitastor
@@ -137,10 +36,11 @@ index 943df1926a..c4f23230a3 100644
+ '*etcd-host': 'str', + '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } } + '*etcd-prefix': 'str' } }
+ +
## +##
# @ReplicationMode: # @ReplicationMode:
# #
@@ -3995,6 +4017,7 @@ # An enumeration of replication modes.
@@ -3995,6 +4013,7 @@
'replication': { 'type': 'BlockdevOptionsReplication', 'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' }, 'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog', 'sheepdog': 'BlockdevOptionsSheepdog',
@@ -148,11 +48,10 @@ index 943df1926a..c4f23230a3 100644
'ssh': 'BlockdevOptionsSsh', 'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle', 'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat', 'vdi': 'BlockdevOptionsGenericFormat',
@@ -4364,6 +4387,17 @@ @@ -4365,6 +4384,17 @@
'size': 'size',
'*cluster-size' : 'size' } } '*cluster-size' : 'size' } }
+## ##
+# @BlockdevCreateOptionsVitastor: +# @BlockdevCreateOptionsVitastor:
+# +#
+# Driver specific image creation options for Vitastor. +# Driver specific image creation options for Vitastor.
@@ -163,10 +62,11 @@ index 943df1926a..c4f23230a3 100644
+ 'data': { 'location': 'BlockdevOptionsVitastor', + 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } } + 'size': 'size' } }
+ +
## +##
# @BlockdevVmdkSubformat: # @BlockdevVmdkSubformat:
# #
@@ -4626,6 +4660,7 @@ # Subformat options for VMDK images
@@ -4626,6 +4656,7 @@
'qed': 'BlockdevCreateOptionsQed', 'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd', 'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog', 'sheepdog': 'BlockdevCreateOptionsSheepdog',
@@ -174,3 +74,15 @@ index 943df1926a..c4f23230a3 100644
'ssh': 'BlockdevCreateOptionsSsh', 'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi', 'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx', 'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu/scripts/modules/module_block.py
===================================================================
--- qemu.orig/scripts/modules/module_block.py
+++ qemu/scripts/modules/module_block.py
@@ -85,6 +85,7 @@ def print_bottom(fheader):
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

View File

@@ -1,8 +1,8 @@
Index: qemu-5.2+dfsg/qapi/block-core.json Index: qemu-5.1+dfsg/qapi/block-core.json
=================================================================== ===================================================================
--- qemu-5.2+dfsg.orig/qapi/block-core.json --- qemu-5.1+dfsg.orig/qapi/block-core.json
+++ qemu-5.2+dfsg/qapi/block-core.json +++ qemu-5.1+dfsg/qapi/block-core.json
@@ -2831,7 +2831,7 @@ @@ -2807,7 +2807,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' }, { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
@@ -11,7 +11,7 @@ Index: qemu-5.2+dfsg/qapi/block-core.json
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
## ##
@@ -3668,6 +3668,28 @@ @@ -3644,6 +3644,28 @@
'*tag': 'str' } } '*tag': 'str' } }
## ##
@@ -40,7 +40,7 @@ Index: qemu-5.2+dfsg/qapi/block-core.json
# @ReplicationMode: # @ReplicationMode:
# #
# An enumeration of replication modes. # An enumeration of replication modes.
@@ -4015,6 +4037,7 @@ @@ -3988,6 +4006,7 @@
'replication': { 'type': 'BlockdevOptionsReplication', 'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' }, 'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog', 'sheepdog': 'BlockdevOptionsSheepdog',
@@ -48,7 +48,7 @@ Index: qemu-5.2+dfsg/qapi/block-core.json
'ssh': 'BlockdevOptionsSsh', 'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle', 'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat', 'vdi': 'BlockdevOptionsGenericFormat',
@@ -4404,6 +4427,17 @@ @@ -4376,6 +4395,17 @@
'*cluster-size' : 'size' } } '*cluster-size' : 'size' } }
## ##
@@ -66,7 +66,7 @@ Index: qemu-5.2+dfsg/qapi/block-core.json
# @BlockdevVmdkSubformat: # @BlockdevVmdkSubformat:
# #
# Subformat options for VMDK images # Subformat options for VMDK images
@@ -4665,6 +4699,7 @@ @@ -4637,6 +4667,7 @@
'qed': 'BlockdevCreateOptionsQed', 'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd', 'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog', 'sheepdog': 'BlockdevCreateOptionsSheepdog',
@@ -74,108 +74,15 @@ Index: qemu-5.2+dfsg/qapi/block-core.json
'ssh': 'BlockdevCreateOptionsSsh', 'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi', 'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx', 'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu-5.2+dfsg/block/meson.build Index: qemu-5.1+dfsg/scripts/modules/module_block.py
=================================================================== ===================================================================
--- qemu-5.2+dfsg.orig/block/meson.build --- qemu-5.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-5.2+dfsg/block/meson.build +++ qemu-5.1+dfsg/scripts/modules/module_block.py
@@ -76,6 +76,7 @@ foreach m : [ @@ -86,6 +86,7 @@ if __name__ == '__main__':
['CONFIG_LIBNFS', 'nfs', libnfs, 'nfs.c'], output_file = sys.argv[1]
['CONFIG_LIBSSH', 'ssh', libssh, 'ssh.c'], with open(output_file, 'w') as fheader:
['CONFIG_RBD', 'rbd', rbd, 'rbd.c'], print_top(fheader)
+ ['CONFIG_VITASTOR', 'vitastor', vitastor, 'vitastor.c'], + add_module(fheader, "vitastor", "vitastor", "vitastor")
]
if config_host.has_key(m[0])
if enable_modules
Index: qemu-5.2+dfsg/configure
===================================================================
--- qemu-5.2+dfsg.orig/configure
+++ qemu-5.2+dfsg/configure
@@ -372,6 +372,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
u2f="auto"
libusb=""
@@ -1263,6 +1264,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1827,6 +1832,7 @@ disabled with --disable-FEATURE, default
vhost-vdpa vhost-vdpa kernel backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3719,6 +3725,27 @@ EOF
fi
########################################## for filename in sys.argv[2:]:
+# vitastor probe if os.path.isfile(filename):
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
+##########################################
# libssh probe
if test "$libssh" != "no" ; then
if $pkg_config --exists libssh; then
@@ -6456,6 +6483,10 @@ if test "$rbd" = "yes" ; then
echo "CONFIG_RBD=y" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=y" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
Index: qemu-5.2+dfsg/meson.build
===================================================================
--- qemu-5.2+dfsg.orig/meson.build
+++ qemu-5.2+dfsg/meson.build
@@ -596,6 +596,10 @@ rbd = not_found
if 'CONFIG_RBD' in config_host
rbd = declare_dependency(link_args: config_host['RBD_LIBS'].split())
endif
+vitastor = not_found
+if 'CONFIG_VITASTOR' in config_host
+ vitastor = declare_dependency(link_args: config_host['VITASTOR_LIBS'].split())
+endif
glusterfs = not_found
if 'CONFIG_GLUSTERFS' in config_host
glusterfs = declare_dependency(compile_args: config_host['GLUSTERFS_CFLAGS'].split(),
@@ -2145,6 +2149,7 @@ endif
# TODO: add back protocol and server version
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
summary_info += {'rbd support': config_host.has_key('CONFIG_RBD')}
+summary_info += {'vitastor support': config_host.has_key('CONFIG_VITASTOR')}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': config_host.has_key('CONFIG_SMARTCARD')}
summary_info += {'U2F support': u2f.found()}

View File

@@ -1,188 +0,0 @@
diff --git a/block/meson.build b/block/meson.build
index d21990ec95..385ac37732 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -78,6 +78,7 @@ foreach m : [
[libnfs, 'nfs', libnfs, 'nfs.c'],
[libssh, 'ssh', libssh, 'ssh.c'],
[rbd, 'rbd', rbd, 'rbd.c'],
+ [vitastor, 'vitastor', vitastor, 'vitastor.c'],
]
if m[0].found()
if enable_modules
diff --git a/configure b/configure
index c77f7b1020..5f534e8484 100755
--- a/configure
+++ b/configure
@@ -389,6 +389,7 @@ trace_backends="log"
trace_file="trace"
spice="$default_feature"
rbd="auto"
+vitastor="auto"
smartcard="$default_feature"
u2f="auto"
libusb="$default_feature"
@@ -1280,6 +1281,10 @@ for opt do
;;
--enable-rbd) rbd="enabled"
;;
+ --disable-vitastor) vitastor="disabled"
+ ;;
+ --enable-vitastor) vitastor="enabled"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1867,6 +1872,7 @@ disabled with --disable-FEATURE, default is enabled if available
vhost-vdpa vhost-vdpa kernel backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -6423,7 +6429,7 @@ NINJA=$ninja $meson setup \
-Dcapstone=$capstone -Dslirp=$slirp -Dfdt=$fdt -Dbrlapi=$brlapi \
-Dcurl=$curl -Dglusterfs=$glusterfs -Dbzip2=$bzip2 -Dlibiscsi=$libiscsi \
-Dlibnfs=$libnfs -Diconv=$iconv -Dcurses=$curses -Dlibudev=$libudev\
- -Drbd=$rbd -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse \
+ -Drbd=$rbd -Dvitastor=$vitastor -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse \
-Dzstd=$zstd -Dseccomp=$seccomp -Dvirtfs=$virtfs -Dcap_ng=$cap_ng \
-Dattr=$attr -Ddefault_devices=$default_devices \
-Ddocs=$docs -Dsphinx_build=$sphinx_build -Dinstall_blobs=$blobs \
diff --git a/meson.build b/meson.build
index c6f4b0cf5e..3dc7f7b463 100644
--- a/meson.build
+++ b/meson.build
@@ -720,6 +720,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1118,6 +1138,7 @@ config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
config_host_data.set('CONFIG_LIBNFS', libnfs.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
@@ -2683,6 +2704,7 @@ summary_info += {'libcap-ng support': libcap_ng.found()}
# TODO: add back protocol and server version
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
summary_info += {'rbd support': rbd.found()}
+summary_info += {'vitastor support': vitastor.found()}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': config_host.has_key('CONFIG_SMARTCARD')}
summary_info += {'U2F support': u2f.found()}
diff --git a/meson_options.txt b/meson_options.txt
index 9734019995..bc93963b27 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -80,6 +80,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('gtk', type : 'feature', value : 'auto',
description: 'GTK+ user interface')
option('sdl', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 6d227924d0..d14b29aa43 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2819,7 +2819,7 @@
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog',
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -3671,6 +3671,28 @@
'*snap-id': 'uint32',
'*tag': 'str' } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4042,6 +4064,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4426,6 +4449,17 @@
'size': 'size',
'*cluster-size' : 'size' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -4691,6 +4725,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }

View File

@@ -74,115 +74,15 @@ Index: qemu-6.1+dfsg/qapi/block-core.json
'vmdk': 'BlockdevCreateOptionsVmdk', 'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc' 'vpc': 'BlockdevCreateOptionsVpc'
} } } }
Index: qemu-6.1+dfsg/block/meson.build Index: qemu-6.1+dfsg/scripts/modules/module_block.py
=================================================================== ===================================================================
--- qemu-6.1+dfsg.orig/block/meson.build --- qemu-6.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-6.1+dfsg/block/meson.build +++ qemu-6.1+dfsg/scripts/modules/module_block.py
@@ -78,6 +78,7 @@ foreach m : [ @@ -86,6 +86,7 @@ if __name__ == '__main__':
[libnfs, 'nfs', files('nfs.c')], output_file = sys.argv[1]
[libssh, 'ssh', files('ssh.c')], with open(output_file, 'w') as fheader:
[rbd, 'rbd', files('rbd.c')], print_top(fheader)
+ [vitastor, 'vitastor', files('vitastor.c')], + add_module(fheader, "vitastor", "vitastor", "vitastor")
]
if m[0].found()
module_ss = ss.source_set()
Index: qemu-6.1+dfsg/configure
===================================================================
--- qemu-6.1+dfsg.orig/configure
+++ qemu-6.1+dfsg/configure
@@ -375,6 +375,7 @@ trace_file="trace"
spice="$default_feature"
spice_protocol="auto"
rbd="auto"
+vitastor="auto"
smartcard="auto"
u2f="auto"
libusb="auto"
@@ -1292,6 +1293,10 @@ for opt do
;;
--enable-rbd) rbd="enabled"
;;
+ --disable-vitastor) vitastor="disabled"
+ ;;
+ --enable-vitastor) vitastor="enabled"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1916,6 +1921,7 @@ disabled with --disable-FEATURE, default
spice spice
spice-protocol spice-protocol
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -5202,7 +5208,7 @@ if test "$skip_meson" = no; then
-Dcapstone=$capstone -Dslirp=$slirp -Dfdt=$fdt -Dbrlapi=$brlapi \
-Dcurl=$curl -Dglusterfs=$glusterfs -Dbzip2=$bzip2 -Dlibiscsi=$libiscsi \
-Dlibnfs=$libnfs -Diconv=$iconv -Dcurses=$curses -Dlibudev=$libudev\
- -Drbd=$rbd -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse -Dlibxml2=$libxml2 \
+ -Drbd=$rbd -Dvitastor=$vitastor -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse -Dlibxml2=$libxml2 \
-Dlibdaxctl=$libdaxctl -Dlibpmem=$libpmem -Dlinux_io_uring=$linux_io_uring \
-Dgnutls=$gnutls -Dnettle=$nettle -Dgcrypt=$gcrypt -Dauth_pam=$auth_pam \
-Dzstd=$zstd -Dseccomp=$seccomp -Dvirtfs=$virtfs -Dcap_ng=$cap_ng \
Index: qemu-6.1+dfsg/meson.build
===================================================================
--- qemu-6.1+dfsg.orig/meson.build
+++ qemu-6.1+dfsg/meson.build
@@ -729,6 +729,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found for filename in sys.argv[2:]:
+if not get_option('vitastor').auto() or have_block if os.path.isfile(filename):
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1264,6 +1284,7 @@ config_host_data.set('CONFIG_LIBNFS', li
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
@@ -3075,6 +3096,7 @@ summary_info += {'bpf support': libbpf.f
# TODO: add back protocol and server version
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
summary_info += {'rbd support': rbd.found()}
+summary_info += {'vitastor support': vitastor.found()}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': cacard.found()}
summary_info += {'U2F support': u2f.found()}
Index: qemu-6.1+dfsg/meson_options.txt
===================================================================
--- qemu-6.1+dfsg.orig/meson_options.txt
+++ qemu-6.1+dfsg/meson_options.txt
@@ -102,6 +102,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('gtk', type : 'feature', value : 'auto',
description: 'GTK+ user interface')
option('sdl', type : 'feature', value : 'auto',

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# QEMU patches don't include the `block/vitastor.c` file to not duplicate it in sources
# Run this script to append its creation to all QEMU patches
DIR=$(dirname $0)
for i in "$DIR"/qemu-*-vitastor.patch "$DIR"/pve-qemu-*-vitastor.patch; do
if ! grep -qP '^\+\+\+ .*block/vitastor\.c' $i; then
echo 'Index: a/block/vitastor.c' >> $i
echo '===================================================================' >> $i
echo '--- /dev/null' >> $i
echo '+++ a/block/vitastor.c' >> $i
echo '@@ -0,0 +1,'$(wc -l "$DIR"/../src/qemu_driver.c)' @@' >> $i
cat "$DIR"/../src/qemu_driver.c | sed 's/^/+/' >> $i
fi
done

View File

@@ -1,5 +1,5 @@
#!/bin/bash #!/bin/bash
# Vitastor depends on QEMU and/or FIO headers, but QEMU and FIO don't have -devel packages # Vitastor depends on QEMU and FIO headers, but QEMU and FIO don't have -devel packages
# So we have to copy their headers into the source tarball # So we have to copy their headers into the source tarball
set -e set -e
@@ -18,11 +18,34 @@ else
fi fi
cd ~/rpmbuild/SPECS cd ~/rpmbuild/SPECS
rpmbuild -bp fio.spec rpmbuild -bp fio.spec
perl -i -pe 's/^make V=1/exit 0; make V=1/' qemu*.spec
rpmbuild -bc qemu*.spec
perl -i -pe 's/^exit 0; make V=1/make V=1/' qemu*.spec
cd ~/rpmbuild/BUILD/qemu*/
rm -rf $VITASTOR/qemu $VITASTOR/fio
mkdir -p $VITASTOR/qemu/b/qemu
make -j8 config-host.h
cp config-host.h $VITASTOR/qemu/b/qemu
cp -r include $VITASTOR/qemu
if [ -f qapi-schema.json ]; then
# QEMU 2.0
make qapi-types.h
cp qapi-types.h $VITASTOR/qemu/b/qemu
else
# QEMU 3.0+
make qapi
cp -r qapi $VITASTOR/qemu/b/qemu
fi
cd $VITASTOR cd $VITASTOR
sh copy-qemu-includes.sh
rm -rf qemu
mv qemu-copy qemu
ln -s ~/rpmbuild/BUILD/fio*/ fio ln -s ~/rpmbuild/BUILD/fio*/ fio
sh copy-fio-includes.sh sh copy-fio-includes.sh
rm fio rm fio
mv fio-copy fio mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'` FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.6.12/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.12$(rpm --eval '%dist').tar.gz * perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.6.8/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.8$(rpm --eval '%dist').tar.gz *

View File

@@ -1,6 +1,3 @@
# This is an attempt to automatically build patched RPM specs
# More or less broken, better use *.spec.patch for now (and copy src/qemu_driver.c to SOURCES/qemu-vitastor.c)
# Build packages for CentOS 8 inside a container # Build packages for CentOS 8 inside a container
# cd ..; podman build -t qemu-el8 -v `pwd`/packages:/root/packages -f rpm/qemu-el8.Dockerfile . # cd ..; podman build -t qemu-el8 -v `pwd`/packages:/root/packages -f rpm/qemu-el8.Dockerfile .

View File

@@ -1,112 +0,0 @@
--- qemu-kvm.spec.orig 2021-12-01 22:10:58.967935539 +0000
+++ qemu-kvm.spec 2021-12-01 22:14:38.530117175 +0000
@@ -57,6 +57,7 @@ Requires: %{name}-block-gluster = %{epoc
%endif \
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
# Macro to properly setup RHEL/RHEV conflict handling
@@ -67,7 +68,7 @@ Obsoletes: %1-rhev
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 4.2.0
-Release: 29%{?dist}.6
+Release: 32.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
@@ -102,6 +103,7 @@ Source33: qemu-pr-helper.socket
Source34: 81-kvm-rhel.rules
Source35: udev-kvm-check.c
Source36: README.tests
+Source37: qemu-vitastor.c
Patch0005: 0005-Initial-redhat-build.patch
@@ -825,6 +827,7 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
# For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
+Patch335: qemu-4.2-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build
@@ -861,6 +864,7 @@ BuildRequires: libcurl-devel
BuildRequires: libssh-devel
BuildRequires: librados-devel
BuildRequires: librbd-devel
+BuildRequires: vitastor-client-devel
%if %{have_gluster}
# For gluster block driver
BuildRequires: glusterfs-api-devel
@@ -1095,6 +1099,14 @@ Install this package if you want to acce
using the rbd protocol.
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package block-ssh
Summary: QEMU SSH block driver
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -1109,6 +1121,7 @@ the Secure Shell (SSH) protocol.
%prep
%setup -n qemu-%{version}
%autopatch -p1
+cp %{SOURCE37} ./block/vitastor.c
%build
%global buildarch %{kvm_target}-softmmu
@@ -1116,7 +1129,7 @@ the Secure Shell (SSH) protocol.
# --build-id option is used for giving info to the debug packages.
buildldflags="VL_LDFLAGS=-Wl,--build-id"
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle
%if 0%{have_gluster}
%global block_drivers_list %{block_drivers_list},gluster
@@ -1131,7 +1144,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
--docdir="%{qemudocdir}" \
--libexecdir="%{_libexecdir}" \
--extra-ldflags="-Wl,--build-id -Wl,-z,relro -Wl,-z,now" \
- --extra-cflags="%{optflags}" \
+ --extra-cflags="%{optflags} -DRHEL_BDRV_CO_TRUNCATE_FLAGS" \
--with-pkgversion="%{name}-%{version}-%{release}" \
--with-confsuffix=/"%{name}" \
--firmwarepath=%{_prefix}/share/qemu-firmware \
@@ -1152,6 +1165,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
--disable-numa \
%endif
--enable-rbd \
+ --enable-vitastor \
%if 0%{have_librdma}
--enable-rdma \
%else
@@ -1192,9 +1206,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
%endif
--python=%{__python3} \
--target-list="%{buildarch}" \
- --block-drv-rw-whitelist=%{block_drivers_list} \
--audio-drv-list= \
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
--with-coroutine=ucontext \
--tls-priority=NORMAL \
--disable-bluez \
@@ -1750,6 +1762,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%files block-rbd
%{_libdir}/qemu-kvm/block-rbd.so
+%files block-vitastor
+%{_libdir}/qemu-kvm/block-vitastor.so
+
%files block-ssh
%{_libdir}/qemu-kvm/block-ssh.so

View File

@@ -1,103 +0,0 @@
--- qemu-kvm_6.0.spec.orig 2021-10-22 13:22:07.000000000 +0000
+++ qemu-kvm_6.0.spec 2021-12-01 22:43:26.095508618 +0000
@@ -67,6 +67,7 @@ Requires: %{name}-hw-usbredir = %{epoch}
%endif \
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
# Macro to properly setup RHEL/RHEV conflict handling
@@ -77,7 +78,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 6.0.0
-Release: 33%{?dist}
+Release: 33.vitastor%{?dist}
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
@@ -112,6 +113,7 @@ Source33: qemu-pr-helper.socket
Source34: 81-kvm-rhel.rules
Source35: udev-kvm-check.c
Source36: README.tests
+Source37: qemu-vitastor.c
Patch0001: 0001-redhat-Adding-slirp-to-the-exploded-tree.patch
@@ -342,6 +344,7 @@ Patch109: kvm-virtio-balloon-Fix-page-po
Patch110: kvm-virtio-net-fix-use-after-unmap-free-for-sg.patch
# For bz#1999141 - migration fails with: "qemu-kvm: get_pci_config_device: Bad config data: i=0x9a read: 3 device: 2 cmask: ff wmask: 0 w1cmask:0"
Patch111: kvm-Fix-virtio-net-pci-vectors-compat.patch
+Patch112: qemu-6.0-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build
@@ -379,6 +382,7 @@ BuildRequires: libcurl-devel
BuildRequires: libssh-devel
BuildRequires: librados-devel
BuildRequires: librbd-devel
+BuildRequires: vitastor-client-devel
%if %{have_gluster}
# For gluster block driver
BuildRequires: glusterfs-api-devel
@@ -625,6 +629,14 @@ Install this package if you want to acce
using the rbd protocol.
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package block-ssh
Summary: QEMU SSH block driver
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -678,6 +690,7 @@ This package provides usbredir support.
rm -fr slirp
mkdir slirp
%autopatch -p1
+cp %{SOURCE37} ./block/vitastor.c
%global qemu_kvm_build qemu_kvm_build
%global qemu_kiwi_build qemu_kiwi_src/build
@@ -701,7 +714,7 @@ mkdir -p %{qemu_kvm_build}
# --build-id option is used for giving info to the debug packages.
buildldflags="VL_LDFLAGS=-Wl,--build-id"
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle
%if 0%{have_gluster}
%global block_drivers_list %{block_drivers_list},gluster
@@ -894,6 +907,7 @@ pushd %{qemu_kvm_build}
%endif
--enable-pie \
--enable-rbd \
+ --enable-vitastor \
%if 0%{have_librdma}
--enable-rdma \
%endif
@@ -977,9 +991,7 @@ find ../default-configs -name "*-rh-devi
--firmwarepath=%{_prefix}/share/qemu-firmware \
--meson="%{__meson}" \
--target-list="%{buildarch}" \
- --block-drv-rw-whitelist=%{block_drivers_list} \
--audio-drv-list= \
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
--with-coroutine=ucontext \
--with-git=git \
--tls-priority=@QEMU,SYSTEM \
@@ -1584,6 +1596,9 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.
%files block-rbd
%{_libdir}/qemu-kvm/block-rbd.so
+%files block-vitastor
+%{_libdir}/qemu-kvm/block-vitastor.so
+
%files block-ssh
%{_libdir}/qemu-kvm/block-ssh.so

View File

@@ -1,5 +1,5 @@
--- qemu-kvm.spec.orig 2020-11-09 23:41:03.000000000 +0000 --- qemu-kvm.spec.orig 2020-11-09 23:41:03.000000000 +0000
+++ qemu-kvm.spec 2021-12-01 21:53:30.895747529 +0000 +++ qemu-kvm.spec 2020-12-06 10:44:24.207640963 +0000
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
%global SLOF_gittagcommit 899d9883 %global SLOF_gittagcommit 899d9883
@@ -9,24 +9,25 @@
%global have_opengl 1 %global have_opengl 1
%global have_fdt 0 %global have_fdt 0
%global have_gluster 1 %global have_gluster 1
@@ -57,6 +57,7 @@ Requires: %{name}-block-gluster = %{epoc @@ -56,7 +56,7 @@ Requires: %{name}-block-curl = %{epoch}:
Requires: %{name}-block-gluster = %{epoch}:%{version}-%{release} \
%endif \ %endif \
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \ Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \ -Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\ +#Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release} Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
# Macro to properly setup RHEL/RHEV conflict handling # Macro to properly setup RHEL/RHEV conflict handling
@@ -67,7 +68,7 @@ Obsoletes: %1-rhev @@ -67,7 +67,7 @@ Obsoletes: %1-rhev
Summary: QEMU is a machine emulator and virtualizer Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm Name: qemu-kvm
Version: 4.2.0 Version: 4.2.0
-Release: 29.vitastor%{?dist}.6 -Release: 29.vitastor%{?dist}.6
+Release: 32.vitastor%{?dist}.6 +Release: 30.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15 Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY License: GPLv2 and GPLv2+ and CC-BY
@@ -99,9 +100,10 @@ Source30: kvm-s390x.conf @@ -99,8 +99,8 @@ Source30: kvm-s390x.conf
Source31: kvm-x86.conf Source31: kvm-x86.conf
Source32: qemu-pr-helper.service Source32: qemu-pr-helper.service
Source33: qemu-pr-helper.socket Source33: qemu-pr-helper.socket
@@ -35,11 +36,9 @@
+#Source34: 81-kvm-rhel.rules +#Source34: 81-kvm-rhel.rules
+#Source35: udev-kvm-check.c +#Source35: udev-kvm-check.c
Source36: README.tests Source36: README.tests
+Source37: qemu-vitastor.c
Patch0005: 0005-Initial-redhat-build.patch @@ -825,7 +825,9 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
@@ -825,7 +827,9 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
Patch333: kvm-virtiofsd-Whitelist-fchmod.patch Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
# For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z] # For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
@@ -50,7 +49,7 @@
BuildRequires: wget BuildRequires: wget
BuildRequires: rpm-build BuildRequires: rpm-build
@@ -842,7 +846,8 @@ BuildRequires: pciutils-devel @@ -842,7 +844,8 @@ BuildRequires: pciutils-devel
BuildRequires: libiscsi-devel BuildRequires: libiscsi-devel
BuildRequires: ncurses-devel BuildRequires: ncurses-devel
BuildRequires: libattr-devel BuildRequires: libattr-devel
@@ -60,7 +59,7 @@
%if %{have_usbredir} %if %{have_usbredir}
BuildRequires: usbredir-devel >= 0.7.1 BuildRequires: usbredir-devel >= 0.7.1
%endif %endif
@@ -856,12 +861,13 @@ BuildRequires: virglrenderer-devel @@ -856,12 +859,12 @@ BuildRequires: virglrenderer-devel
# For smartcard NSS support # For smartcard NSS support
BuildRequires: nss-devel BuildRequires: nss-devel
%endif %endif
@@ -71,13 +70,12 @@
BuildRequires: libssh-devel BuildRequires: libssh-devel
-BuildRequires: librados-devel -BuildRequires: librados-devel
-BuildRequires: librbd-devel -BuildRequires: librbd-devel
+BuildRequires: librados2-devel +#BuildRequires: librados-devel
+BuildRequires: librbd1-devel +#BuildRequires: librbd-devel
+BuildRequires: vitastor-client-devel
%if %{have_gluster} %if %{have_gluster}
# For gluster block driver # For gluster block driver
BuildRequires: glusterfs-api-devel BuildRequires: glusterfs-api-devel
@@ -955,25 +961,25 @@ hardware for a full system such as a PC @@ -955,25 +958,25 @@ hardware for a full system such as a PC
%package -n qemu-kvm-core %package -n qemu-kvm-core
Summary: qemu-kvm core components Summary: qemu-kvm core components
@@ -107,35 +105,38 @@
# For compressed guest memory dumps # For compressed guest memory dumps
Requires: lzo snappy Requires: lzo snappy
%if %{have_kvm_setup} %if %{have_kvm_setup}
@@ -1096,6 +1102,14 @@ Install this package if you want to acce @@ -1085,15 +1088,15 @@ This package provides the additional iSC
using the rbd protocol. Install this package if you want to access iSCSI volumes.
-%package block-rbd
-Summary: QEMU Ceph/RBD block driver
-Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
-
-%description block-rbd
-This package provides the additional Ceph/RBD block driver for QEMU.
-
-Install this package if you want to access remote Ceph volumes
-using the rbd protocol.
+#%package block-rbd
+#Summary: QEMU Ceph/RBD block driver
+#Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+#
+#%description block-rbd
+#This package provides the additional Ceph/RBD block driver for QEMU.
+#
+#Install this package if you want to access remote Ceph volumes
+#using the rbd protocol.
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package block-ssh %package block-ssh
Summary: QEMU SSH block driver @@ -1117,12 +1120,14 @@ the Secure Shell (SSH) protocol.
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -1110,6 +1124,7 @@ the Secure Shell (SSH) protocol.
%prep
%setup -n qemu-%{version}
%autopatch -p1
+cp %{SOURCE37} ./block/vitastor.c
%build
%global buildarch %{kvm_target}-softmmu
@@ -1117,12 +1132,13 @@ the Secure Shell (SSH) protocol.
# --build-id option is used for giving info to the debug packages. # --build-id option is used for giving info to the debug packages.
buildldflags="VL_LDFLAGS=-Wl,--build-id" buildldflags="VL_LDFLAGS=-Wl,--build-id"
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle -%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle +#%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,blkdebug,luks,null-co,nvme,copy-on-read,throttle
%if 0%{have_gluster} %if 0%{have_gluster}
%global block_drivers_list %{block_drivers_list},gluster %global block_drivers_list %{block_drivers_list},gluster
@@ -145,20 +146,12 @@
./configure \ ./configure \
--prefix="%{_prefix}" \ --prefix="%{_prefix}" \
--libdir="%{_libdir}" \ --libdir="%{_libdir}" \
@@ -1132,7 +1148,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id" @@ -1152,15 +1157,15 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
--docdir="%{qemudocdir}" \ %else
--libexecdir="%{_libexecdir}" \
--extra-ldflags="-Wl,--build-id -Wl,-z,relro -Wl,-z,now" \
- --extra-cflags="%{optflags}" \
+ --extra-cflags="%{optflags} -DRHEL_BDRV_CO_TRUNCATE_FLAGS" \
--with-pkgversion="%{name}-%{version}-%{release}" \
--with-confsuffix=/"%{name}" \
--firmwarepath=%{_prefix}/share/qemu-firmware \
@@ -1153,14 +1169,15 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
--disable-numa \ --disable-numa \
%endif %endif
--enable-rbd \ - --enable-rbd \
+ --enable-vitastor \ + --disable-rbd \
%if 0%{have_librdma} %if 0%{have_librdma}
--enable-rdma \ --enable-rdma \
%else %else
@@ -172,7 +165,7 @@
--enable-spice \ --enable-spice \
--enable-smartcard \ --enable-smartcard \
--enable-virglrenderer \ --enable-virglrenderer \
@@ -1179,7 +1196,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id" @@ -1179,7 +1184,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
%else %else
--disable-usb-redir \ --disable-usb-redir \
%endif %endif
@@ -181,7 +174,7 @@
%ifarch x86_64 %ifarch x86_64
--enable-libpmem \ --enable-libpmem \
%else %else
@@ -1193,9 +1210,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id" @@ -1193,9 +1198,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
%endif %endif
--python=%{__python3} \ --python=%{__python3} \
--target-list="%{buildarch}" \ --target-list="%{buildarch}" \
@@ -191,7 +184,7 @@
--with-coroutine=ucontext \ --with-coroutine=ucontext \
--tls-priority=NORMAL \ --tls-priority=NORMAL \
--disable-bluez \ --disable-bluez \
@@ -1262,7 +1277,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id" @@ -1262,7 +1265,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
--disable-sanitizers \ --disable-sanitizers \
--disable-hvf \ --disable-hvf \
--disable-whpx \ --disable-whpx \
@@ -200,7 +193,7 @@
--disable-membarrier \ --disable-membarrier \
--disable-vhost-crypto \ --disable-vhost-crypto \
--disable-libxml2 \ --disable-libxml2 \
@@ -1308,7 +1323,7 @@ make V=1 %{?_smp_mflags} $buildldflags @@ -1308,7 +1311,7 @@ make V=1 %{?_smp_mflags} $buildldflags
cp -a %{kvm_target}-softmmu/qemu-system-%{kvm_target} qemu-kvm cp -a %{kvm_target}-softmmu/qemu-system-%{kvm_target} qemu-kvm
gcc %{SOURCE6} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o ksmctl gcc %{SOURCE6} $RPM_OPT_FLAGS $RPM_LD_FLAGS -o ksmctl
@@ -209,7 +202,7 @@
%install %install
%define _udevdir %(pkg-config --variable=udevdir udev) %define _udevdir %(pkg-config --variable=udevdir udev)
@@ -1343,8 +1358,8 @@ mkdir -p $RPM_BUILD_ROOT%{testsdir}/test @@ -1343,8 +1346,8 @@ mkdir -p $RPM_BUILD_ROOT%{testsdir}/test
mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests mkdir -p $RPM_BUILD_ROOT%{testsdir}/tests/qemu-iotests
mkdir -p $RPM_BUILD_ROOT%{testsdir}/scripts/qmp mkdir -p $RPM_BUILD_ROOT%{testsdir}/scripts/qmp
@@ -220,7 +213,7 @@
install -m 0644 scripts/dump-guest-memory.py \ install -m 0644 scripts/dump-guest-memory.py \
$RPM_BUILD_ROOT%{_datadir}/%{name} $RPM_BUILD_ROOT%{_datadir}/%{name}
@@ -1562,6 +1577,8 @@ rm -rf $RPM_BUILD_ROOT%{qemudocdir}/inte @@ -1562,6 +1565,8 @@ rm -rf $RPM_BUILD_ROOT%{qemudocdir}/inte
# Remove spec # Remove spec
rm -rf $RPM_BUILD_ROOT%{qemudocdir}/specs rm -rf $RPM_BUILD_ROOT%{qemudocdir}/specs
@@ -229,7 +222,7 @@
%check %check
export DIFF=diff; make check V=1 export DIFF=diff; make check V=1
@@ -1645,8 +1662,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s @@ -1645,8 +1650,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%config(noreplace) %{_sysconfdir}/sysconfig/ksm %config(noreplace) %{_sysconfdir}/sysconfig/ksm
%{_unitdir}/ksmtuned.service %{_unitdir}/ksmtuned.service
%{_sbindir}/ksmtuned %{_sbindir}/ksmtuned
@@ -240,7 +233,7 @@
%ghost %{_sysconfdir}/kvm %ghost %{_sysconfdir}/kvm
%config(noreplace) %{_sysconfdir}/ksmtuned.conf %config(noreplace) %{_sysconfdir}/ksmtuned.conf
%dir %{_sysconfdir}/%{name} %dir %{_sysconfdir}/%{name}
@@ -1711,8 +1728,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s @@ -1711,8 +1716,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%{_libexecdir}/vhost-user-gpu %{_libexecdir}/vhost-user-gpu
%{_datadir}/%{name}/vhost-user/50-qemu-gpu.json %{_datadir}/%{name}/vhost-user/50-qemu-gpu.json
%endif %endif
@@ -251,13 +244,14 @@
%files -n qemu-img %files -n qemu-img
%defattr(-,root,root) %defattr(-,root,root)
@@ -1751,6 +1768,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s @@ -1748,8 +1753,8 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%files block-rbd %files block-iscsi
%{_libdir}/qemu-kvm/block-rbd.so %{_libdir}/qemu-kvm/block-iscsi.so
-%files block-rbd
-%{_libdir}/qemu-kvm/block-rbd.so
+#%files block-rbd
+#%{_libdir}/qemu-kvm/block-rbd.so
+%files block-vitastor
+%{_libdir}/qemu-kvm/block-vitastor.so
+
%files block-ssh %files block-ssh
%{_libdir}/qemu-kvm/block-ssh.so %{_libdir}/qemu-kvm/block-ssh.so

29
rpm/qemu-kvm.spec.patch Normal file
View File

@@ -0,0 +1,29 @@
--- qemu-kvm.spec 2020-12-05 13:13:54.388623517 +0000
+++ qemu-kvm.spec 2020-12-05 13:13:58.728696598 +0000
@@ -67,7 +67,7 @@ Obsoletes: %1-rhev
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 4.2.0
-Release: 29%{?dist}.6
+Release: 29.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
@@ -825,6 +825,7 @@ Patch331: kvm-Drop-bogus-IPv6-messages.p
Patch333: kvm-virtiofsd-Whitelist-fchmod.patch
# For bz#1883869 - virtiofsd core dump in KATA Container [rhel-8.2.1.z]
Patch334: kvm-virtiofsd-avoid-proc-self-fd-tempdir.patch
+Patch335: qemu-4.2-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build
@@ -1192,9 +1193,7 @@ buildldflags="VL_LDFLAGS=-Wl,--build-id"
%endif
--python=%{__python3} \
--target-list="%{buildarch}" \
- --block-drv-rw-whitelist=%{block_drivers_list} \
--audio-drv-list= \
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
--with-coroutine=ucontext \
--tls-priority=NORMAL \
--disable-bluez \

View File

@@ -9,11 +9,15 @@ WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel fio rh-nodejs12 jerasure-devel gf-complete-devel rdma-core-devel RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gperftools-devel qemu-kvm fio rh-nodejs12 jerasure-devel gf-complete-devel
RUN yumdownloader --disablerepo=centos-sclo-rh --source qemu-kvm
RUN yumdownloader --disablerepo=centos-sclo-rh --source fio RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
RUN rpm --nomd5 -i qemu*.src.rpm
RUN rpm --nomd5 -i fio*.src.rpm RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN cd ~/rpmbuild/SPECS && yum-builddep -y qemu-kvm.spec
RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
RUN yum -y install rdma-core-devel
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
@@ -34,7 +38,7 @@ ADD . /root/vitastor
RUN set -e; \ RUN set -e; \
cd /root/vitastor/rpm; \ cd /root/vitastor/rpm; \
sh build-tarball.sh; \ sh build-tarball.sh; \
cp /root/vitastor-0.6.12.el7.tar.gz ~/rpmbuild/SOURCES; \ cp /root/vitastor-0.6.8.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \ cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \ rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 0.6.12 Version: 0.6.8
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-0.6.12.el7.tar.gz Source0: vitastor-0.6.8.el7.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel
@@ -21,6 +21,7 @@ Requires: vitastor-mon = %{version}-%{release}
Requires: vitastor-client = %{version}-%{release} Requires: vitastor-client = %{version}-%{release}
Requires: vitastor-client-devel = %{version}-%{release} Requires: vitastor-client-devel = %{version}-%{release}
Requires: vitastor-fio = %{version}-%{release} Requires: vitastor-fio = %{version}-%{release}
Requires: vitastor-qemu = %{version}-%{release}
%description %description
Vitastor is a small, simple and fast clustered block storage (storage for VM drives), Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
@@ -82,13 +83,24 @@ Requires: fio = 3.7-1.el7
Vitastor fio drivers for benchmarking. Vitastor fio drivers for benchmarking.
%package -n vitastor-qemu
Summary: Vitastor - QEMU driver
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: qemu-kvm = 2.0.0-1.el7.6
%description -n vitastor-qemu
Vitastor QEMU block device driver.
%prep %prep
%setup -q %setup -q
%build %build
. /opt/rh/devtoolset-9/enable . /opt/rh/devtoolset-9/enable
%cmake . %cmake . -DQEMU_PLUGINDIR=qemu-kvm
%make_build %make_build
@@ -111,6 +123,7 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd %files -n vitastor-osd
%_bindir/vitastor-osd %_bindir/vitastor-osd
%_bindir/vitastor-dump-journal %_bindir/vitastor-dump-journal
/usr/lib/vitastor/make-osd.sh
%files -n vitastor-mon %files -n vitastor-mon
@@ -124,7 +137,6 @@ cp -r mon %buildroot/usr/lib/vitastor
%_bindir/vita %_bindir/vita
%_libdir/libvitastor_blk.so* %_libdir/libvitastor_blk.so*
%_libdir/libvitastor_client.so* %_libdir/libvitastor_client.so*
/usr/lib/vitastor/make-osd.sh
%files -n vitastor-client-devel %files -n vitastor-client-devel
@@ -138,4 +150,8 @@ cp -r mon %buildroot/usr/lib/vitastor
%_libdir/libfio_vitastor_sec.so %_libdir/libfio_vitastor_sec.so
%files -n vitastor-qemu
%_libdir/qemu-kvm/block-vitastor.so
%changelog %changelog

View File

@@ -8,11 +8,14 @@ WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core
RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \ RUN dnf --enablerepo='centos-advanced-virtualization' -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel qemu-kvm fio nodejs rpm-build jerasure-devel gf-complete-devel
fio nodejs rpm-build jerasure-devel gf-complete-devel libibverbs-devel libarchive cmake RUN rm -rf /var/lib/dnf/*; dnf download --disablerepo='*' --enablerepo='vitastor' --source qemu-kvm
RUN dnf download --source fio RUN dnf download --source fio
RUN rpm --nomd5 -i qemu*.src.rpm
RUN rpm --nomd5 -i fio*.src.rpm RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec qemu-kvm.spec
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec && dnf install -y cmake
RUN yum -y install libibverbs-devel libarchive
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
@@ -26,14 +29,14 @@ RUN set -e; \
cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el8/; \ cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el8/; \
cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el8/ cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el8/
RUN rpm -i `ls /root/packages/liburing-el8/liburing-*.x86_64.rpm | grep -v debug` RUN rpm -i `ls /root/packages/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
ADD . /root/vitastor ADD . /root/vitastor
RUN set -e; \ RUN set -e; \
cd /root/vitastor/rpm; \ cd /root/vitastor/rpm; \
sh build-tarball.sh; \ sh build-tarball.sh; \
cp /root/vitastor-0.6.12.el8.tar.gz ~/rpmbuild/SOURCES; \ cp /root/vitastor-0.6.8.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \ cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \ rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 0.6.12 Version: 0.6.8
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-0.6.12.el8.tar.gz Source0: vitastor-0.6.8.el8.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel
@@ -20,6 +20,7 @@ Requires: vitastor-mon = %{version}-%{release}
Requires: vitastor-client = %{version}-%{release} Requires: vitastor-client = %{version}-%{release}
Requires: vitastor-client-devel = %{version}-%{release} Requires: vitastor-client-devel = %{version}-%{release}
Requires: vitastor-fio = %{version}-%{release} Requires: vitastor-fio = %{version}-%{release}
Requires: vitastor-qemu = %{version}-%{release}
%description %description
Vitastor is a small, simple and fast clustered block storage (storage for VM drives), Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
@@ -80,13 +81,24 @@ Requires: fio = 3.7-3.el8
Vitastor fio drivers for benchmarking. Vitastor fio drivers for benchmarking.
%package -n vitastor-qemu
Summary: Vitastor - QEMU driver
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: qemu-kvm = 4.2.0-29.el8.6
%description -n vitastor-qemu
Vitastor QEMU block device driver.
%prep %prep
%setup -q %setup -q
%build %build
. /opt/rh/gcc-toolset-9/enable . /opt/rh/gcc-toolset-9/enable
%cmake . %cmake . -DQEMU_PLUGINDIR=qemu-kvm
%make_build %make_build
@@ -108,6 +120,7 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd %files -n vitastor-osd
%_bindir/vitastor-osd %_bindir/vitastor-osd
%_bindir/vitastor-dump-journal %_bindir/vitastor-dump-journal
/usr/lib/vitastor/make-osd.sh
%files -n vitastor-mon %files -n vitastor-mon
@@ -121,7 +134,6 @@ cp -r mon %buildroot/usr/lib/vitastor
%_bindir/vita %_bindir/vita
%_libdir/libvitastor_blk.so* %_libdir/libvitastor_blk.so*
%_libdir/libvitastor_client.so* %_libdir/libvitastor_client.so*
/usr/lib/vitastor/make-osd.sh
%files -n vitastor-client-devel %files -n vitastor-client-devel
@@ -135,4 +147,8 @@ cp -r mon %buildroot/usr/lib/vitastor
%_libdir/libfio_vitastor_sec.so %_libdir/libfio_vitastor_sec.so
%files -n vitastor-qemu
%_libdir/qemu-kvm/block-vitastor.so
%changelog %changelog

View File

@@ -4,7 +4,7 @@ project(vitastor)
include(GNUInstallDirs) include(GNUInstallDirs)
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree") set(WITH_QEMU true CACHE BOOL "Build QEMU driver")
set(WITH_FIO true CACHE BOOL "Build FIO driver") set(WITH_FIO true CACHE BOOL "Build FIO driver")
set(QEMU_PLUGINDIR qemu CACHE STRING "QEMU plugin directory suffix (qemu-kvm on RHEL)") set(QEMU_PLUGINDIR qemu CACHE STRING "QEMU plugin directory suffix (qemu-kvm on RHEL)")
set(WITH_ASAN false CACHE BOOL "Build with AddressSanitizer") set(WITH_ASAN false CACHE BOOL "Build with AddressSanitizer")
@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif() endif()
add_definitions(-DVERSION="0.6.12") add_definitions(-DVERSION="0.6.8")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src) add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN}) if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer) add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -88,8 +88,8 @@ if (IBVERBS_LIBRARIES)
set(MSGR_RDMA "msgr_rdma.cpp") set(MSGR_RDMA "msgr_rdma.cpp")
endif (IBVERBS_LIBRARIES) endif (IBVERBS_LIBRARIES)
add_library(vitastor_common STATIC add_library(vitastor_common STATIC
epoll_manager.cpp etcd_state_client.cpp messenger.cpp addr_util.cpp epoll_manager.cpp etcd_state_client.cpp
msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp ${MSGR_RDMA} http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp ${MSGR_RDMA}
) )
target_compile_options(vitastor_common PUBLIC -fPIC) target_compile_options(vitastor_common PUBLIC -fPIC)
@@ -112,7 +112,6 @@ if (${WITH_FIO})
add_library(fio_vitastor_sec SHARED add_library(fio_vitastor_sec SHARED
fio_sec_osd.cpp fio_sec_osd.cpp
rw_blocking.cpp rw_blocking.cpp
addr_util.cpp
) )
target_link_libraries(fio_vitastor_sec target_link_libraries(fio_vitastor_sec
tcmalloc_minimal tcmalloc_minimal
@@ -146,7 +145,7 @@ endif (${WITH_FIO})
# vitastor-nbd # vitastor-nbd
add_executable(vitastor-nbd add_executable(vitastor-nbd
nbd_proxy.cpp nbd_proxy.cpp mmap_manager.cpp
) )
target_link_libraries(vitastor-nbd target_link_libraries(vitastor-nbd
vitastor_client vitastor_client
@@ -154,7 +153,7 @@ target_link_libraries(vitastor-nbd
# vitastor-cli # vitastor-cli
add_executable(vitastor-cli add_executable(vitastor-cli
cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp cli_df.cpp cli.cpp cli_alloc_osd.cpp cli_simple_offsets.cpp
cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp cli_ls.cpp cli_create.cpp cli_modify.cpp cli_flatten.cpp cli_merge.cpp cli_rm.cpp cli_snap_rm.cpp
) )
target_link_libraries(vitastor-cli target_link_libraries(vitastor-cli
@@ -172,7 +171,6 @@ if (${WITH_QEMU})
add_library(qemu_vitastor SHARED add_library(qemu_vitastor SHARED
qemu_driver.c qemu_driver.c
) )
target_compile_options(qemu_vitastor PUBLIC -DVITASTOR_SOURCE_TREE)
target_include_directories(qemu_vitastor PUBLIC target_include_directories(qemu_vitastor PUBLIC
../qemu/b/qemu ../qemu/b/qemu
../qemu/include ../qemu/include
@@ -190,11 +188,11 @@ endif (${WITH_QEMU})
### Test stubs ### Test stubs
# stub_osd, stub_bench, osd_test # stub_osd, stub_bench, osd_test
add_executable(stub_osd stub_osd.cpp rw_blocking.cpp addr_util.cpp) add_executable(stub_osd stub_osd.cpp rw_blocking.cpp mmap_manager.cpp)
target_link_libraries(stub_osd tcmalloc_minimal) target_link_libraries(stub_osd tcmalloc_minimal)
add_executable(stub_bench stub_bench.cpp rw_blocking.cpp addr_util.cpp) add_executable(stub_bench stub_bench.cpp rw_blocking.cpp)
target_link_libraries(stub_bench tcmalloc_minimal) target_link_libraries(stub_bench tcmalloc_minimal)
add_executable(osd_test osd_test.cpp rw_blocking.cpp addr_util.cpp) add_executable(osd_test osd_test.cpp rw_blocking.cpp)
target_link_libraries(osd_test tcmalloc_minimal) target_link_libraries(osd_test tcmalloc_minimal)
# osd_rmw_test # osd_rmw_test

View File

@@ -1,188 +0,0 @@
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/types.h>
#include <ifaddrs.h>
#include <string.h>
#include <stdio.h>
#include <stdexcept>
#include "addr_util.h"
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr)
{
if (parse_port)
{
int p = str.rfind(':');
if (p != std::string::npos && !(str.length() > 0 && str[p-1] == ']')) // "[ipv6]" which contains ':'
{
char null_byte = 0;
int n = sscanf(str.c_str()+p+1, "%d%c", &default_port, &null_byte);
if (n != 1 || default_port >= 0x10000)
return false;
str = str.substr(0, p);
}
}
if (inet_pton(AF_INET, str.c_str(), &((struct sockaddr_in*)addr)->sin_addr) == 1)
{
addr->sa_family = AF_INET;
((struct sockaddr_in*)addr)->sin_port = htons(default_port);
return true;
}
if (str.length() >= 2 && str[0] == '[' && str[str.length()-1] == ']')
str = str.substr(1, str.length()-2);
if (inet_pton(AF_INET6, str.c_str(), &((struct sockaddr_in6*)addr)->sin6_addr) == 1)
{
addr->sa_family = AF_INET6;
((struct sockaddr_in6*)addr)->sin6_port = htons(default_port);
return true;
}
return false;
}
std::string addr_to_string(const sockaddr &addr)
{
char peer_str[256];
bool ok = false;
int port;
if (addr.sa_family == AF_INET)
{
ok = !!inet_ntop(AF_INET, &((sockaddr_in*)&addr)->sin_addr, peer_str, 256);
port = ntohs(((sockaddr_in*)&addr)->sin_port);
}
else if (addr.sa_family == AF_INET6)
{
ok = !!inet_ntop(AF_INET6, &((sockaddr_in6*)&addr)->sin6_addr, peer_str, 256);
port = ntohs(((sockaddr_in6*)&addr)->sin6_port);
}
else
throw std::runtime_error("Unknown address family "+std::to_string(addr.sa_family));
if (!ok)
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
return std::string(peer_str)+":"+std::to_string(port);
}
static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
{
if (bits == 0)
{
// C99 6.5.7 (3): u32 << 32 is undefined behaviour
return true;
}
return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
}
static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
{
const uint32_t *a = address.s6_addr32;
const uint32_t *n = network.s6_addr32;
int bits_whole, bits_incomplete;
bits_whole = bits >> 5; // number of whole u32
bits_incomplete = bits & 0x1F; // number of bits in incomplete u32
if (bits_whole && memcmp(a, n, bits_whole << 2))
return false;
if (bits_incomplete)
{
uint32_t mask = htonl((0xFFFFFFFFu) << (32 - bits_incomplete));
if ((a[bits_whole] ^ n[bits_whole]) & mask)
return false;
}
return true;
}
struct addr_mask_t
{
sa_family_t family;
in_addr ipv4;
in6_addr ipv6;
uint8_t bits;
};
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
{
std::vector<addr_mask_t> masks;
for (auto mask: mask_cfg)
{
unsigned bits = 0;
int p = mask.find('/');
if (p != std::string::npos)
{
char null_byte = 0;
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
mask = mask.substr(0, p);
}
in_addr ipv4;
in6_addr ipv6;
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
{
if (bits > 32)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
}
else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
}
else
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
}
std::vector<std::string> addresses;
ifaddrs *list, *ifa;
if (getifaddrs(&list) == -1)
{
throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
}
for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
{
if (!ifa->ifa_addr)
{
continue;
}
int family = ifa->ifa_addr->sa_family;
if ((family == AF_INET || family == AF_INET6 && include_v6) &&
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
{
void *addr_ptr;
if (family == AF_INET)
{
addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
}
else
{
addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
}
if (masks.size() > 0)
{
int i;
for (i = 0; i < masks.size(); i++)
{
if (masks[i].family == family && (family == AF_INET
? cidr_match(*(in_addr*)addr_ptr, masks[i].ipv4, masks[i].bits)
: cidr6_match(*(in6_addr*)addr_ptr, masks[i].ipv6, masks[i].bits)))
{
break;
}
}
if (i >= masks.size())
{
continue;
}
}
char addr[INET6_ADDRSTRLEN];
if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
{
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
}
addresses.push_back(std::string(addr));
}
}
freeifaddrs(list);
return addresses;
}

View File

@@ -1,9 +0,0 @@
#pragma once
#include <sys/socket.h>
#include <string>
#include <vector>
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr *addr);
std::string addr_to_string(const sockaddr &addr);
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);

View File

@@ -185,7 +185,7 @@ void journal_flusher_t::release_trim()
void journal_flusher_t::dump_diagnostics() void journal_flusher_t::dump_diagnostics()
{ {
const char *unflushable_type = ""; const char *unflushable_type = "";
obj_ver_id unflushable = {}; obj_ver_id unflushable = { 0 };
// Try to find out if there is a flushable object for information // Try to find out if there is a flushable object for information
for (object_id cur_oid: flush_queue) for (object_id cur_oid: flush_queue)
{ {
@@ -486,8 +486,8 @@ resume_1:
if (bs->clean_entry_bitmap_size) if (bs->clean_entry_bitmap_size)
{ {
new_clean_bitmap = (bs->inmemory_meta new_clean_bitmap = (bs->inmemory_meta
? (uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry) ? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size)); : bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size));
if (clean_init_bitmap) if (clean_init_bitmap)
{ {
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size); memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
@@ -533,7 +533,7 @@ resume_1:
return false; return false;
} }
// zero out old metadata entry // zero out old metadata entry
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size); memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
await_sqe(15); await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size }; data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
data->callback = simple_callback_w; data->callback = simple_callback_w;
@@ -544,25 +544,23 @@ resume_1:
} }
if (has_delete) if (has_delete)
{ {
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size); clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid) if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{ {
printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx\n", printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx) while deleting %lx:%lx\n",
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
new_entry->version, cur.oid.inode, cur.oid.stripe);
exit(1); exit(1);
} }
// zero out new metadata entry // zero out new metadata entry
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size); memset(meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
} }
else else
{ {
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size); clean_disk_entry *new_entry = (clean_disk_entry*)(meta_new.buf + meta_new.pos*bs->clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid) if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{ {
printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n", printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx) with %lx:%lx\n",
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version, clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, cur.oid.inode, cur.oid.stripe);
cur.oid.inode, cur.oid.stripe, cur.version);
exit(1); exit(1);
} }
new_entry->oid = cur.oid; new_entry->oid = cur.oid;
@@ -575,7 +573,7 @@ resume_1:
if (bs->clean_entry_bitmap_size) if (bs->clean_entry_bitmap_size)
{ {
void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap; void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((uint8_t*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size); memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
} }
} }
await_sqe(6); await_sqe(6);
@@ -745,15 +743,12 @@ bool journal_flusher_co::scan_dirty(int wait_base)
offset = dirty_it->second.offset; offset = dirty_it->second.offset;
end_offset = dirty_it->second.offset + dirty_it->second.len; end_offset = dirty_it->second.offset + dirty_it->second.len;
it = v.begin(); it = v.begin();
while (end_offset > offset) while (1)
{ {
for (; it != v.end(); it++) for (; it != v.end(); it++)
if (it->offset+it->len > offset) if (it->offset >= offset)
break; break;
// If all items end before offset or if the found item starts after end_offset, just insert the buffer if (it == v.end() || it->offset > offset && it->len > 0)
// If (offset < it->offset < end_offset) insert (offset..it->offset) part
// If (it->offset <= offset <= it->offset+it->len) then just skip to it->offset+it->len
if (it == v.end() || it->offset > offset)
{ {
submit_offset = dirty_it->second.location + offset - dirty_it->second.offset; submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset; submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
@@ -762,7 +757,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
if (bs->journal.inmemory) if (bs->journal.inmemory)
{ {
// Take it from memory // Take it from memory
memcpy(it->buf, (uint8_t*)bs->journal.buffer + submit_offset, submit_len); memcpy(it->buf, bs->journal.buffer + submit_offset, submit_len);
} }
else else
{ {
@@ -777,7 +772,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
} }
} }
offset = it->offset+it->len; offset = it->offset+it->len;
if (it == v.end()) if (it == v.end() || offset >= end_offset)
break; break;
} }
} }
@@ -826,7 +821,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
wr.pos = ((meta_loc >> bs->block_order) % (bs->meta_block_size / bs->clean_entry_size)); wr.pos = ((meta_loc >> bs->block_order) % (bs->meta_block_size / bs->clean_entry_size));
if (bs->inmemory_meta) if (bs->inmemory_meta)
{ {
wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector; wr.buf = bs->metadata_buffer + wr.sector;
return true; return true;
} }
wr.it = flusher->meta_sectors.find(wr.sector); wr.it = flusher->meta_sectors.find(wr.sector);

View File

@@ -235,12 +235,6 @@ void blockstore_impl_t::loop()
{ {
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret)); throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
} }
for (auto s: journal.submitting_sectors)
{
// Mark journal sector writes as submitted
journal.sector_info[s].submit_id = 0;
}
journal.submitting_sectors.clear();
if ((initial_ring_space - ringloop->space_left()) > 0) if ((initial_ring_space - ringloop->space_left()) > 0)
{ {
live = true; live = true;
@@ -372,7 +366,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
}; };
} }
unstable_writes.clear(); unstable_writes.clear();
op->callback = [old_callback](blockstore_op_t *op) op->callback = [this, old_callback](blockstore_op_t *op)
{ {
obj_ver_id *vers = (obj_ver_id*)op->buf; obj_ver_id *vers = (obj_ver_id*)op->buf;
delete[] vers; delete[] vers;

View File

@@ -54,14 +54,6 @@
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE) #define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE) #define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
#define BS_SUBMIT_CHECK_SQES(n) \
if (ringloop->space_left() < (n))\
{\
/* Pause until there are more requests available */\
PRIV(op)->wait_for = WAIT_SQE;\
return 0;\
}
#define BS_SUBMIT_GET_SQE(sqe, data) \ #define BS_SUBMIT_GET_SQE(sqe, data) \
BS_SUBMIT_GET_ONLY_SQE(sqe); \ BS_SUBMIT_GET_ONLY_SQE(sqe); \
struct ring_data_t *data = ((ring_data_t*)sqe->user_data) struct ring_data_t *data = ((ring_data_t*)sqe->user_data)
@@ -178,7 +170,7 @@ struct blockstore_op_private_t
std::vector<fulfill_read_t> read_vec; std::vector<fulfill_read_t> read_vec;
// Sync, write // Sync, write
int min_flushed_journal_sector, max_flushed_journal_sector; uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
// Write // Write
struct iovec iov_zerofill[3]; struct iovec iov_zerofill[3];
@@ -259,7 +251,6 @@ class blockstore_impl_t
int data_fd; int data_fd;
uint64_t meta_size, meta_area, meta_len; uint64_t meta_size, meta_area, meta_len;
uint64_t data_size, data_len; uint64_t data_size, data_len;
uint64_t data_device_sect, meta_device_sect, journal_device_sect;
void *metadata_buffer = NULL; void *metadata_buffer = NULL;
@@ -280,7 +271,7 @@ class blockstore_impl_t
friend class blockstore_init_meta; friend class blockstore_init_meta;
friend class blockstore_init_journal; friend class blockstore_init_journal;
friend struct blockstore_journal_check_t; friend class blockstore_journal_check_t;
friend class journal_flusher_t; friend class journal_flusher_t;
friend class journal_flusher_co; friend class journal_flusher_co;
@@ -291,10 +282,6 @@ class blockstore_impl_t
void open_journal(); void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset); uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
// Asynchronous init // Asynchronous init
int initialized; int initialized;
int metadata_buf_size; int metadata_buf_size;
@@ -322,18 +309,21 @@ class blockstore_impl_t
// Sync // Sync
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync); int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
void handle_sync_event(ring_data_t *data, blockstore_op_t *op);
void ack_sync(blockstore_op_t *op); void ack_sync(blockstore_op_t *op);
// Stabilize // Stabilize
int dequeue_stable(blockstore_op_t *op); int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op); int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false); void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
void stabilize_object(object_id oid, uint64_t max_ver); void stabilize_object(object_id oid, uint64_t max_ver);
// Rollback // Rollback
int dequeue_rollback(blockstore_op_t *op); int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op); int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov); void mark_rolled_back(const obj_ver_id & ov);
void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc); void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
// List // List

View File

@@ -148,7 +148,7 @@ resume_1:
{ {
GET_SQE(); GET_SQE();
data->iov = { data->iov = {
(uint8_t*)metadata_buffer + (bs->inmemory_meta metadata_buffer + (bs->inmemory_meta
? metadata_read ? metadata_read
: (prev == 1 ? bs->metadata_buf_size : 0)), : (prev == 1 ? bs->metadata_buf_size : 0)),
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read, bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
@@ -169,13 +169,13 @@ resume_1:
if (prev_done) if (prev_done)
{ {
void *done_buf = bs->inmemory_meta void *done_buf = bs->inmemory_meta
? ((uint8_t*)metadata_buffer + done_pos) ? (metadata_buffer + done_pos)
: ((uint8_t*)metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0)); : (metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
unsigned count = bs->meta_block_size / bs->clean_entry_size; unsigned count = bs->meta_block_size / bs->clean_entry_size;
for (int sector = 0; sector < done_len; sector += bs->meta_block_size) for (int sector = 0; sector < done_len; sector += bs->meta_block_size)
{ {
// handle <count> entries // handle <count> entries
handle_entries((uint8_t*)done_buf + sector, count, bs->block_order); handle_entries(done_buf + sector, count, bs->block_order);
done_cnt += count; done_cnt += count;
} }
prev_done = 0; prev_done = 0;
@@ -215,7 +215,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
{ {
for (unsigned i = 0; i < count; i++) for (unsigned i = 0; i < count; i++)
{ {
clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->clean_entry_size); clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size) if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
{ {
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size); memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size);
@@ -440,7 +440,7 @@ resume_1:
if (!bs->journal.inmemory) if (!bs->journal.inmemory)
submitted_buf = memalign_or_die(MEM_ALIGNMENT, JOURNAL_BUFFER_SIZE); submitted_buf = memalign_or_die(MEM_ALIGNMENT, JOURNAL_BUFFER_SIZE);
else else
submitted_buf = (uint8_t*)bs->journal.buffer + journal_pos; submitted_buf = bs->journal.buffer + journal_pos;
data->iov = { data->iov = {
submitted_buf, submitted_buf,
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE, end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
@@ -570,7 +570,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
resume: resume:
while (pos < bs->journal.block_size) while (pos < bs->journal.block_size)
{ {
journal_entry *je = (journal_entry*)((uint8_t*)buf + proc_pos - done_pos + pos); journal_entry *je = (journal_entry*)(buf + proc_pos - done_pos + pos);
if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 || if (je->magic != JOURNAL_MAGIC || je_crc32(je) != je->crc32 ||
je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last) je->type < JE_MIN || je->type > JE_MAX || started && je->crc32_prev != crc32_last)
{ {
@@ -619,7 +619,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
if (location >= done_pos && location+je->small_write.len <= done_pos+len) if (location >= done_pos && location+je->small_write.len <= done_pos+len)
{ {
// data is within this buffer // data is within this buffer
data_crc32 = crc32c(0, (uint8_t*)buf + location - done_pos, je->small_write.len); data_crc32 = crc32c(0, buf + location - done_pos, je->small_write.len);
} }
else else
{ {
@@ -634,7 +634,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
? location+je->small_write.len : done[i].pos+done[i].len); ? location+je->small_write.len : done[i].pos+done[i].len);
uint64_t part_begin = (location < done[i].pos ? done[i].pos : location); uint64_t part_begin = (location < done[i].pos ? done[i].pos : location);
covered += part_end - part_begin; covered += part_end - part_begin;
data_crc32 = crc32c(data_crc32, (uint8_t*)done[i].buf + part_begin - done[i].pos, part_end - part_begin); data_crc32 = crc32c(data_crc32, done[i].buf + part_begin - done[i].pos, part_end - part_begin);
} }
} }
if (covered < je->small_write.len) if (covered < je->small_write.len)
@@ -650,9 +650,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// interesting thing is that we must clear the corrupt entry if we're not readonly, // interesting thing is that we must clear the corrupt entry if we're not readonly,
// because we don't write next entries in the same journal block // because we don't write next entries in the same journal block
printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data); printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
memset((uint8_t*)buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos); memset(buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
bs->journal.next_free = prev_free; bs->journal.next_free = prev_free;
init_write_buf = (uint8_t*)buf + proc_pos - done_pos; init_write_buf = buf + proc_pos - done_pos;
init_write_sector = proc_pos; init_write_sector = proc_pos;
return 0; return 0;
} }
@@ -665,7 +665,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.version = je->small_write.version, .version = je->small_write.version,
}; };
void *bmp = NULL; void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_small_write); void *bmp_from = (void*)je + sizeof(journal_entry_small_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*)) if (bs->clean_entry_bitmap_size <= sizeof(void*))
{ {
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size); memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);
@@ -745,7 +745,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.version = je->big_write.version, .version = je->big_write.version,
}; };
void *bmp = NULL; void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_big_write); void *bmp_from = (void*)je + sizeof(journal_entry_big_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*)) if (bs->clean_entry_bitmap_size <= sizeof(void*))
{ {
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size); memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);

View File

@@ -6,7 +6,7 @@
class blockstore_init_meta class blockstore_init_meta
{ {
blockstore_impl_t *bs; blockstore_impl_t *bs;
int wait_state = 0; int wait_state = 0, wait_count = 0;
bool zero_on_init = false; bool zero_on_init = false;
void *metadata_buffer = NULL; void *metadata_buffer = NULL;
uint64_t metadata_read = 0; uint64_t metadata_read = 0;

View File

@@ -96,8 +96,7 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
next_pos = next_pos + data_after; next_pos = next_pos + data_after;
if (next_pos > bs->journal.len) if (next_pos > bs->journal.len)
{ {
if (right_dir) next_pos = bs->journal.block_size + data_after;
next_pos = bs->journal.block_size + data_after;
right_dir = false; right_dir = false;
} }
} }
@@ -137,13 +136,13 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
journal.in_sector_pos = 0; journal.in_sector_pos = 0;
journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size; journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
memset(journal.inmemory memset(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset ? journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size); : journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
} }
journal_entry *je = (struct journal_entry*)( journal_entry *je = (struct journal_entry*)(
(journal.inmemory (journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset ? journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos : journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
); );
journal.in_sector_pos += size; journal.in_sector_pos += size;
je->magic = JOURNAL_MAGIC; je->magic = JOURNAL_MAGIC;
@@ -154,73 +153,22 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
return je; return je;
} }
void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_op_t *op) void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb)
{ {
// Don't submit the same sector twice in the same batch
if (!journal.sector_info[cur_sector].submit_id)
{
io_uring_sqe *sqe = get_sqe();
// Caller must ensure availability of an SQE
assert(sqe != NULL);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
journal.sector_info[cur_sector].written = true;
journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
journal.submitting_sectors.push_back(cur_sector);
journal.sector_info[cur_sector].flush_count++;
data->iov = (struct iovec){
(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
journal.block_size
};
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
my_uring_prep_writev(
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
);
}
journal.sector_info[cur_sector].dirty = false; journal.sector_info[cur_sector].dirty = false;
// But always remember that this operation has to wait until this exact journal write is finished journal.sector_info[cur_sector].written = true;
journal.flushing_ops.insert((pending_journaling_t){ journal.sector_info[cur_sector].flush_count++;
.flush_id = journal.sector_info[cur_sector].submit_id, ring_data_t *data = ((ring_data_t*)sqe->user_data);
.sector = cur_sector, data->iov = (struct iovec){
.op = op, (journal.inmemory
}); ? journal.buffer + journal.sector_info[cur_sector].offset
auto priv = PRIV(op); : journal.sector_buf + journal.block_size*cur_sector),
priv->pending_ops++; journal.block_size
if (!priv->min_flushed_journal_sector) };
priv->min_flushed_journal_sector = 1+cur_sector; data->callback = cb;
priv->max_flushed_journal_sector = 1+cur_sector; my_uring_prep_writev(
} sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
);
void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_id)
{
live = true;
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw std::runtime_error(
"journal write failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
}
auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
{
journal.sector_info[fl_it->sector].flush_count--;
}
while (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
{
auto priv = PRIV(fl_it->op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
{
release_journal_sectors(fl_it->op);
priv->op_state++;
ringloop->wakeup();
}
journal.flushing_ops.erase(fl_it++);
}
} }
journal_t::~journal_t() journal_t::~journal_t()

View File

@@ -4,7 +4,6 @@
#pragma once #pragma once
#include "crc32c.h" #include "crc32c.h"
#include <set>
#define MIN_JOURNAL_SIZE 4*1024*1024 #define MIN_JOURNAL_SIZE 4*1024*1024
#define JOURNAL_MAGIC 0x4A33 #define JOURNAL_MAGIC 0x4A33
@@ -146,21 +145,8 @@ struct journal_sector_info_t
uint64_t flush_count; uint64_t flush_count;
bool written; bool written;
bool dirty; bool dirty;
uint64_t submit_id;
}; };
struct pending_journaling_t
{
uint64_t flush_id;
int sector;
blockstore_op_t *op;
};
inline bool operator < (const pending_journaling_t & a, const pending_journaling_t & b)
{
return a.flush_id < b.flush_id || a.flush_id == b.flush_id && a.op < b.op;
}
struct journal_t struct journal_t
{ {
int fd; int fd;
@@ -186,9 +172,6 @@ struct journal_t
bool no_same_sector_overwrites = false; bool no_same_sector_overwrites = false;
int cur_sector = 0; int cur_sector = 0;
int in_sector_pos = 0; int in_sector_pos = 0;
std::vector<int> submitting_sectors;
std::set<pending_journaling_t> flushing_ops;
uint64_t submit_id = 0;
// Used sector map // Used sector map
// May use ~ 80 MB per 1 GB of used journal space in the worst case // May use ~ 80 MB per 1 GB of used journal space in the worst case
@@ -217,3 +200,5 @@ struct blockstore_journal_check_t
}; };
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size); journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
void prepare_journal_sector_write(journal_t & journal, int sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb);

View File

@@ -295,9 +295,9 @@ void blockstore_impl_t::calc_lengths()
} }
} }
static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string name) void check_size(int fd, uint64_t *size, std::string name)
{ {
int sect; int sectsize;
struct stat st; struct stat st;
if (fstat(fd, &st) < 0) if (fstat(fd, &st) < 0)
{ {
@@ -306,21 +306,14 @@ static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string n
if (S_ISREG(st.st_mode)) if (S_ISREG(st.st_mode))
{ {
*size = st.st_size; *size = st.st_size;
if (sectsize)
{
*sectsize = st.st_blksize;
}
} }
else if (S_ISBLK(st.st_mode)) else if (S_ISBLK(st.st_mode))
{ {
if (ioctl(fd, BLKGETSIZE64, size) < 0 || if (ioctl(fd, BLKSSZGET, &sectsize) < 0 ||
ioctl(fd, BLKSSZGET, &sect) < 0) ioctl(fd, BLKGETSIZE64, size) < 0 ||
sectsize != 512)
{ {
throw std::runtime_error("failed to get "+name+" size or block size: "+strerror(errno)); throw std::runtime_error(name+" sector is not equal to 512 bytes");
}
if (sectsize)
{
*sectsize = sect;
} }
} }
else else
@@ -336,14 +329,7 @@ void blockstore_impl_t::open_data()
{ {
throw std::runtime_error("Failed to open data device"); throw std::runtime_error("Failed to open data device");
} }
check_size(data_fd, &data_size, &data_device_sect, "data device"); check_size(data_fd, &data_size, "data device");
if (disk_alignment % data_device_sect)
{
throw std::runtime_error(
"disk_alignment ("+std::to_string(disk_alignment)+
") is not a multiple of data device sector size ("+std::to_string(data_device_sect)+")"
);
}
if (data_offset >= data_size) if (data_offset >= data_size)
{ {
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size)); throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
@@ -364,7 +350,7 @@ void blockstore_impl_t::open_meta()
{ {
throw std::runtime_error("Failed to open metadata device"); throw std::runtime_error("Failed to open metadata device");
} }
check_size(meta_fd, &meta_size, &meta_device_sect, "metadata device"); check_size(meta_fd, &meta_size, "metadata device");
if (meta_offset >= meta_size) if (meta_offset >= meta_size)
{ {
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size)); throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
@@ -377,20 +363,12 @@ void blockstore_impl_t::open_meta()
else else
{ {
meta_fd = data_fd; meta_fd = data_fd;
meta_device_sect = data_device_sect;
meta_size = 0; meta_size = 0;
if (meta_offset >= data_size) if (meta_offset >= data_size)
{ {
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_size)); throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_size));
} }
} }
if (meta_block_size % meta_device_sect)
{
throw std::runtime_error(
"meta_block_size ("+std::to_string(meta_block_size)+
") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
);
}
} }
void blockstore_impl_t::open_journal() void blockstore_impl_t::open_journal()
@@ -402,7 +380,7 @@ void blockstore_impl_t::open_journal()
{ {
throw std::runtime_error("Failed to open journal device"); throw std::runtime_error("Failed to open journal device");
} }
check_size(journal.fd, &journal.device_size, &journal_device_sect, "journal device"); check_size(journal.fd, &journal.device_size, "journal device");
if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0) if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
{ {
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno)); throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
@@ -411,7 +389,6 @@ void blockstore_impl_t::open_journal()
else else
{ {
journal.fd = meta_fd; journal.fd = meta_fd;
journal_device_sect = meta_device_sect;
journal.device_size = 0; journal.device_size = 0;
if (journal.offset >= data_size) if (journal.offset >= data_size)
{ {
@@ -429,11 +406,4 @@ void blockstore_impl_t::open_journal()
if (!journal.sector_buf) if (!journal.sector_buf)
throw std::bad_alloc(); throw std::bad_alloc();
} }
if (journal_block_size % journal_device_sect)
{
throw std::runtime_error(
"journal_block_size ("+std::to_string(journal_block_size)+
") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
);
}
} }

View File

@@ -24,7 +24,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
} }
if (journal.inmemory && IS_JOURNAL(item_state)) if (journal.inmemory && IS_JOURNAL(item_state))
{ {
memcpy(buf, (uint8_t*)journal.buffer + offset, len); memcpy(buf, journal.buffer + offset, len);
return 1; return 1;
} }
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
@@ -75,7 +75,7 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfille
}; };
it = PRIV(read_op)->read_vec.insert(it, el); it = PRIV(read_op)->read_vec.insert(it, el);
if (!fulfill_read_push(read_op, if (!fulfill_read_push(read_op,
(uint8_t*)read_op->buf + el.offset - read_op->offset, read_op->buf + el.offset - read_op->offset,
item_location + el.offset - item_start, item_location + el.offset - item_start,
el.len, item_state, item_version)) el.len, item_state, item_version))
{ {
@@ -102,7 +102,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
{ {
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size; uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size)); uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset); clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
} }
else else
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset); clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset);

View File

@@ -74,17 +74,24 @@ skip_ov:
{ {
return 0; return 0;
} }
// There is sufficient space. Check SQEs // There is sufficient space. Get SQEs
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write); struct io_uring_sqe *sqe[space_check.sectors_to_write];
for (i = 0; i < space_check.sectors_to_write; i++)
{
BS_SUBMIT_GET_SQE_DECL(sqe[i]);
}
// Prepare and submit journal entries // Prepare and submit journal entries
int s = 0; auto cb = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
int s = 0, cur_sector = -1;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
if (!journal.entry_fits(sizeof(journal_entry_rollback)) && if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
journal.sector_info[journal.cur_sector].dirty) journal.sector_info[journal.cur_sector].dirty)
{ {
prepare_journal_sector_write(journal.cur_sector, op); if (cur_sector == -1)
s++; PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
cur_sector = journal.cur_sector;
} }
journal_entry_rollback *je = (journal_entry_rollback*) journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback)); prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
@@ -93,9 +100,12 @@ skip_ov:
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
} }
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
s++;
assert(s == space_check.sectors_to_write); assert(s == space_check.sectors_to_write);
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1; PRIV(op)->op_state = 1;
return 1; return 1;
} }
@@ -104,23 +114,30 @@ int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
{ {
if (PRIV(op)->op_state == 2) if (PRIV(op)->op_state == 2)
goto resume_2; goto resume_2;
else if (PRIV(op)->op_state == 4) else if (PRIV(op)->op_state == 3)
goto resume_4; goto resume_3;
else if (PRIV(op)->op_state == 5)
goto resume_5;
else else
return 1; return 1;
resume_2: resume_2:
// Release used journal sectors
release_journal_sectors(op);
resume_3:
if (!disable_journal_fsync) if (!disable_journal_fsync)
{ {
BS_SUBMIT_GET_SQE(sqe, data); io_uring_sqe *sqe;
BS_SUBMIT_GET_SQE_DECL(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); }; data->callback = [this, op](ring_data_t *data) { handle_rollback_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3; PRIV(op)->op_state = 4;
return 1; return 1;
} }
resume_4: resume_5:
obj_ver_id* v; obj_ver_id* v;
int i; int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@@ -179,6 +196,24 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
} }
} }
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
if (data->res != data->iov.iov_len)
{
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
}
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
PRIV(op)->op_state++;
ringloop->wakeup();
}
}
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc) void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
{ {
if (dirty_end == dirty_start) if (dirty_end == dirty_start)

View File

@@ -97,18 +97,25 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{ {
return 0; return 0;
} }
// There is sufficient space. Check SQEs // There is sufficient space. Get SQEs
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write); struct io_uring_sqe *sqe[space_check.sectors_to_write];
for (i = 0; i < space_check.sectors_to_write; i++)
{
BS_SUBMIT_GET_SQE_DECL(sqe[i]);
}
// Prepare and submit journal entries // Prepare and submit journal entries
int s = 0; auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
int s = 0, cur_sector = -1;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{ {
// FIXME: Only stabilize versions that aren't stable yet // FIXME: Only stabilize versions that aren't stable yet
if (!journal.entry_fits(sizeof(journal_entry_stable)) && if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
journal.sector_info[journal.cur_sector].dirty) journal.sector_info[journal.cur_sector].dirty)
{ {
prepare_journal_sector_write(journal.cur_sector, op); if (cur_sector == -1)
s++; PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
cur_sector = journal.cur_sector;
} }
journal_entry_stable *je = (journal_entry_stable*) journal_entry_stable *je = (journal_entry_stable*)
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable)); prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
@@ -117,9 +124,12 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
} }
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], cb);
s++;
assert(s == space_check.sectors_to_write); assert(s == space_check.sectors_to_write);
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1; PRIV(op)->op_state = 1;
return 1; return 1;
} }
@@ -128,23 +138,30 @@ int blockstore_impl_t::continue_stable(blockstore_op_t *op)
{ {
if (PRIV(op)->op_state == 2) if (PRIV(op)->op_state == 2)
goto resume_2; goto resume_2;
else if (PRIV(op)->op_state == 4) else if (PRIV(op)->op_state == 3)
goto resume_4; goto resume_3;
else if (PRIV(op)->op_state == 5)
goto resume_5;
else else
return 1; return 1;
resume_2: resume_2:
// Release used journal sectors
release_journal_sectors(op);
resume_3:
if (!disable_journal_fsync) if (!disable_journal_fsync)
{ {
BS_SUBMIT_GET_SQE(sqe, data); io_uring_sqe *sqe;
BS_SUBMIT_GET_SQE_DECL(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); }; data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3; PRIV(op)->op_state = 4;
return 1; return 1;
} }
resume_4: resume_5:
// Mark dirty_db entries as stable, acknowledge op completion // Mark dirty_db entries as stable, acknowledge op completion
obj_ver_id* v; obj_ver_id* v;
int i; int i;
@@ -240,3 +257,21 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
unstable_writes.erase(unstab_it); unstable_writes.erase(unstab_it);
} }
} }
void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
if (data->res != data->iov.iov_len)
{
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
}
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
PRIV(op)->op_state++;
ringloop->wakeup();
}
}

View File

@@ -44,8 +44,10 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
if (journal.sector_info[journal.cur_sector].dirty) if (journal.sector_info[journal.cur_sector].dirty)
{ {
// Write out the last journal sector if it happens to be dirty // Write out the last journal sector if it happens to be dirty
BS_SUBMIT_CHECK_SQES(1); BS_SUBMIT_GET_ONLY_SQE(sqe);
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe, [this, op](ring_data_t *data) { handle_sync_event(data, op); });
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT; PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1; return 1;
} }
@@ -62,7 +64,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); }; data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT; PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
@@ -83,18 +85,24 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
{ {
return 0; return 0;
} }
// Check SQEs. Don't bother about merging, submit each journal sector as a separate request // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write); struct io_uring_sqe *sqe[space_check.sectors_to_write];
for (int i = 0; i < space_check.sectors_to_write; i++)
{
BS_SUBMIT_GET_SQE_DECL(sqe[i]);
}
// Prepare and submit journal entries // Prepare and submit journal entries
auto it = PRIV(op)->sync_big_writes.begin(); auto it = PRIV(op)->sync_big_writes.begin();
int s = 0; int s = 0, cur_sector = -1;
while (it != PRIV(op)->sync_big_writes.end()) while (it != PRIV(op)->sync_big_writes.end())
{ {
if (!journal.entry_fits(sizeof(journal_entry_big_write) + clean_entry_bitmap_size) && if (!journal.entry_fits(sizeof(journal_entry_big_write) + clean_entry_bitmap_size) &&
journal.sector_info[journal.cur_sector].dirty) journal.sector_info[journal.cur_sector].dirty)
{ {
prepare_journal_sector_write(journal.cur_sector, op); if (cur_sector == -1)
s++; PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
cur_sector = journal.cur_sector;
} }
auto & dirty_entry = dirty_db.at(*it); auto & dirty_entry = dirty_db.at(*it);
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry( journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
@@ -121,9 +129,12 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
it++; it++;
} }
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe[s++], [this, op](ring_data_t *data) { handle_sync_event(data, op); });
s++;
assert(s == space_check.sectors_to_write); assert(s == space_check.sectors_to_write);
if (cur_sector == -1)
PRIV(op)->min_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s;
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT; PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1; return 1;
} }
@@ -134,7 +145,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 }; data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); }; data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT; PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
@@ -153,6 +164,42 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
return 1; return 1;
} }
void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
if (data->res != data->iov.iov_len)
{
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
}
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
// Release used journal sectors
release_journal_sectors(op);
// Handle states
if (PRIV(op)->op_state == SYNC_DATA_SYNC_SENT)
{
PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
}
else if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_SENT)
{
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
}
else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
{
PRIV(op)->op_state = SYNC_DONE;
}
else
{
throw std::runtime_error("BUG: unexpected sync op state");
}
ringloop->wakeup();
}
}
void blockstore_impl_t::ack_sync(blockstore_op_t *op) void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{ {
// Handle states // Handle states

View File

@@ -102,7 +102,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
// Issue an additional sync so that the previous big write can reach the journal // Issue an additional sync so that the previous big write can reach the journal
blockstore_op_t *sync_op = new blockstore_op_t; blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC; sync_op->opcode = BS_OP_SYNC;
sync_op->callback = [](blockstore_op_t *sync_op) sync_op->callback = [this, op](blockstore_op_t *sync_op)
{ {
delete sync_op; delete sync_op;
}; };
@@ -268,8 +268,8 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
cancel_all_writes(op, dirty_it, -ENOSPC); cancel_all_writes(op, dirty_it, -ENOSPC);
return 2; return 2;
} }
BS_SUBMIT_GET_SQE(sqe, data);
write_iodepth++; write_iodepth++;
BS_SUBMIT_GET_SQE(sqe, data);
dirty_it->second.location = loc << block_order; dirty_it->second.location = loc << block_order;
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED; dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
@@ -324,21 +324,29 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{ {
return 0; return 0;
} }
// There is sufficient space. Check SQE(s)
BS_SUBMIT_CHECK_SQES(
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
(immediate_commit != IMMEDIATE_NONE ||
!journal.entry_fits(sizeof(journal_entry_small_write) + clean_entry_bitmap_size) ? 1 : 0) +
(op->len > 0 ? 1 : 0)
);
write_iodepth++; write_iodepth++;
// There is sufficient space. Get SQE(s)
struct io_uring_sqe *sqe1 = NULL;
if (immediate_commit != IMMEDIATE_NONE ||
!journal.entry_fits(sizeof(journal_entry_small_write) + clean_entry_bitmap_size))
{
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
BS_SUBMIT_GET_SQE_DECL(sqe1);
}
struct io_uring_sqe *sqe2 = NULL;
if (op->len > 0)
{
BS_SUBMIT_GET_SQE_DECL(sqe2);
}
// Got SQEs. Prepare previous journal sector write if required // Got SQEs. Prepare previous journal sector write if required
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); }; auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
if (immediate_commit == IMMEDIATE_NONE) if (immediate_commit == IMMEDIATE_NONE)
{ {
if (!journal.entry_fits(sizeof(journal_entry_small_write) + clean_entry_bitmap_size)) if (sqe1)
{ {
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
} }
else else
{ {
@@ -372,7 +380,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
if (immediate_commit != IMMEDIATE_NONE) if (immediate_commit != IMMEDIATE_NONE)
{ {
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
} }
if (op->len > 0) if (op->len > 0)
{ {
@@ -380,9 +390,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (journal.inmemory) if (journal.inmemory)
{ {
// Copy data // Copy data
memcpy((uint8_t*)journal.buffer + journal.next_free, op->buf, op->len); memcpy(journal.buffer + journal.next_free, op->buf, op->len);
} }
BS_SUBMIT_GET_SQE(sqe2, data2); ring_data_t *data2 = ((ring_data_t*)sqe2->user_data);
data2->iov = (struct iovec){ op->buf, op->len }; data2->iov = (struct iovec){ op->buf, op->len };
data2->callback = cb; data2->callback = cb;
my_uring_prep_writev( my_uring_prep_writev(
@@ -431,12 +441,13 @@ int blockstore_impl_t::continue_write(blockstore_op_t *op)
resume_2: resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry // Only for the immediate_commit mode: prepare and submit big_write journal entry
{ {
BS_SUBMIT_CHECK_SQES(1);
auto dirty_it = dirty_db.find((obj_ver_id){ auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid, .oid = op->oid,
.version = op->version, .version = op->version,
}); });
assert(dirty_it != dirty_db.end()); assert(dirty_it != dirty_db.end());
io_uring_sqe *sqe = NULL;
BS_SUBMIT_GET_SQE_DECL(sqe);
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry( journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE, journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write) + clean_entry_bitmap_size sizeof(journal_entry_big_write) + clean_entry_bitmap_size
@@ -458,7 +469,10 @@ resume_2:
memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size); memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe,
[this, op](ring_data_t *data) { handle_write_event(data, op); });
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3; PRIV(op)->op_state = 3;
return 1; return 1;
} }
@@ -573,7 +587,6 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
); );
} }
PRIV(op)->pending_ops--; PRIV(op)->pending_ops--;
assert(PRIV(op)->pending_ops >= 0);
if (PRIV(op)->pending_ops == 0) if (PRIV(op)->pending_ops == 0)
{ {
release_journal_sectors(op); release_journal_sectors(op);
@@ -591,6 +604,7 @@ void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
uint64_t s = PRIV(op)->min_flushed_journal_sector; uint64_t s = PRIV(op)->min_flushed_journal_sector;
while (1) while (1)
{ {
journal.sector_info[s-1].flush_count--;
if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0) if (s != (1+journal.cur_sector) && journal.sector_info[s-1].flush_count == 0)
{ {
// We know for sure that we won't write into this sector anymore // We know for sure that we won't write into this sector anymore
@@ -629,24 +643,24 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
{ {
return 0; return 0;
} }
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
BS_SUBMIT_CHECK_SQES(
(immediate_commit != IMMEDIATE_NONE ||
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
journal.sector_info[journal.cur_sector].dirty) ? 1 : 0
);
if (write_iodepth >= max_write_iodepth)
{
return 0;
}
write_iodepth++; write_iodepth++;
io_uring_sqe *sqe = NULL;
if (immediate_commit != IMMEDIATE_NONE ||
(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
journal.sector_info[journal.cur_sector].dirty)
{
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
BS_SUBMIT_GET_SQE_DECL(sqe);
}
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
// Prepare journal sector write // Prepare journal sector write
if (immediate_commit == IMMEDIATE_NONE) if (immediate_commit == IMMEDIATE_NONE)
{ {
if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) && if (sqe)
journal.sector_info[journal.cur_sector].dirty)
{ {
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
} }
else else
{ {
@@ -673,7 +687,9 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
dirty_it->second.state = BS_ST_DELETE | BS_ST_SUBMITTED; dirty_it->second.state = BS_ST_DELETE | BS_ST_SUBMITTED;
if (immediate_commit != IMMEDIATE_NONE) if (immediate_commit != IMMEDIATE_NONE)
{ {
prepare_journal_sector_write(journal.cur_sector, op); prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops++;
} }
if (!PRIV(op)->pending_ops) if (!PRIV(op)->pending_ops)
{ {

View File

@@ -57,7 +57,6 @@ json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
const char *opt = args[i]+2; const char *opt = args[i]+2;
cfg[opt] = i == narg-1 || !strcmp(opt, "json") || !strcmp(opt, "wait-list") || cfg[opt] = i == narg-1 || !strcmp(opt, "json") || !strcmp(opt, "wait-list") ||
!strcmp(opt, "long") || !strcmp(opt, "del") || !strcmp(opt, "no-color") || !strcmp(opt, "long") || !strcmp(opt, "del") || !strcmp(opt, "no-color") ||
!strcmp(opt, "readonly") || !strcmp(opt, "readwrite") ||
!strcmp(opt, "force") || !strcmp(opt, "reverse") || !strcmp(opt, "force") || !strcmp(opt, "reverse") ||
!strcmp(opt, "writers-stopped") && strcmp("1", args[i+1]) != 0 !strcmp(opt, "writers-stopped") && strcmp("1", args[i+1]) != 0
? "1" : args[++i]; ? "1" : args[++i];
@@ -70,7 +69,7 @@ json11::Json::object cli_tool_t::parse_args(int narg, const char *args[])
if (!cmd.size()) if (!cmd.size())
{ {
std::string exe(exe_name); std::string exe(exe_name);
if (exe.size() >= 11 && exe.substr(exe.size()-11) == "vitastor-rm") if (exe.substr(exe.size()-11) == "vitastor-rm")
{ {
cmd.push_back("rm-data"); cmd.push_back("rm-data");
} }
@@ -86,11 +85,8 @@ void cli_tool_t::help()
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n" "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n" "\n"
"USAGE:\n" "USAGE:\n"
"%s df\n" "%s ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<name> ...]\n"
" Show pool space statistics\n" " List images (only specified if <name> passed).\n"
"\n"
"%s ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<glob> ...]\n"
" List images (only matching <glob> patterns if passed).\n"
" -p|--pool POOL Filter images by pool ID or name\n" " -p|--pool POOL Filter images by pool ID or name\n"
" -l|--long Also report allocated size and I/O statistics\n" " -l|--long Also report allocated size and I/O statistics\n"
" --del Also include delete operation statistics\n" " --del Also include delete operation statistics\n"
@@ -107,8 +103,8 @@ void cli_tool_t::help()
"%s snap-create [-p|--pool <id|name>] <image>@<snapshot>\n" "%s snap-create [-p|--pool <id|name>] <image>@<snapshot>\n"
" Create a snapshot of image <name>. May be used live if only a single writer is active.\n" " Create a snapshot of image <name>. May be used live if only a single writer is active.\n"
"\n" "\n"
"%s modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]\n" "%s modify <name> [-s|--size <size>] [--readonly | --readwrite] [-f|--force]\n"
" Rename, resize image or change its readonly status. Images with children can't be made read-write.\n" " Resize image or change its readonly status. Images with children can't be made read-write.\n"
" If the new size is smaller than the old size, extra data will be purged.\n" " If the new size is smaller than the old size, extra data will be purged.\n"
" You should resize file system in the image, if present, before shrinking it.\n" " You should resize file system in the image, if present, before shrinking it.\n"
" -f|--force Proceed with shrinking or setting readwrite flag even if the image has children.\n" " -f|--force Proceed with shrinking or setting readwrite flag even if the image has children.\n"
@@ -155,8 +151,7 @@ void cli_tool_t::help()
" --no-color Disable colored output\n" " --no-color Disable colored output\n"
" --json JSON output\n" " --json JSON output\n"
, ,
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name, exe_name
exe_name, exe_name, exe_name, exe_name, exe_name, exe_name
); );
exit(0); exit(0);
} }
@@ -177,7 +172,7 @@ void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
new_cfg.parent_id = new_parent; new_cfg.parent_id = new_parent;
json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg); json11::Json::object cur_cfg_json = cli->st_cli.serialize_inode_cfg(&new_cfg);
waiting++; waiting++;
cli->st_cli.etcd_txn_slow(json11::Json::object { cli->st_cli.etcd_txn(json11::Json::object {
{ "compare", json11::Json::array { { "compare", json11::Json::array {
json11::Json::object { json11::Json::object {
{ "target", "MOD" }, { "target", "MOD" },
@@ -194,7 +189,7 @@ void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
} } } }
}, },
} }, } },
}, [this, new_parent, cur, cur_name](std::string err, json11::Json res) }, ETCD_SLOW_TIMEOUT, [this, new_parent, cur, cur_name](std::string err, json11::Json res)
{ {
if (err != "") if (err != "")
{ {
@@ -229,22 +224,6 @@ void cli_tool_t::change_parent(inode_t cur, inode_t new_parent)
}); });
} }
void cli_tool_t::etcd_txn(json11::Json txn)
{
waiting++;
cli->st_cli.etcd_txn_slow(txn, [this](std::string err, json11::Json res)
{
waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
etcd_result = res;
ringloop->wakeup();
});
}
inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name) inode_config_t* cli_tool_t::get_inode_cfg(const std::string & name)
{ {
for (auto & ic: cli->st_cli.inode_config) for (auto & ic: cli->st_cli.inode_config)
@@ -266,11 +245,6 @@ void cli_tool_t::run(json11::Json cfg)
fprintf(stderr, "command is missing\n"); fprintf(stderr, "command is missing\n");
exit(1); exit(1);
} }
else if (cmd[0] == "df")
{
// Show pool space stats
action_cb = start_df(cfg);
}
else if (cmd[0] == "ls") else if (cmd[0] == "ls")
{ {
// List images // List images
@@ -321,10 +295,6 @@ void cli_tool_t::run(json11::Json cfg)
fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str()); fprintf(stderr, "unknown command: %s\n", cmd[0].string_value().c_str());
exit(1); exit(1);
} }
if (action_cb == NULL)
{
return;
}
color = !cfg["no-color"].bool_value(); color = !cfg["no-color"].bool_value();
json_output = cfg["json"].bool_value(); json_output = cfg["json"].bool_value();
iodepth = cfg["iodepth"].uint64_value(); iodepth = cfg["iodepth"].uint64_value();

View File

@@ -34,7 +34,6 @@ public:
cluster_client_t *cli = NULL; cluster_client_t *cli = NULL;
int waiting = 0; int waiting = 0;
json11::Json etcd_result;
ring_consumer_t consumer; ring_consumer_t consumer;
std::function<bool(void)> action_cb; std::function<bool(void)> action_cb;
@@ -51,7 +50,6 @@ public:
friend struct snap_flattener_t; friend struct snap_flattener_t;
friend struct snap_remover_t; friend struct snap_remover_t;
std::function<bool(void)> start_df(json11::Json);
std::function<bool(void)> start_ls(json11::Json); std::function<bool(void)> start_ls(json11::Json);
std::function<bool(void)> start_create(json11::Json); std::function<bool(void)> start_create(json11::Json);
std::function<bool(void)> start_modify(json11::Json); std::function<bool(void)> start_modify(json11::Json);
@@ -61,18 +59,7 @@ public:
std::function<bool(void)> start_snap_rm(json11::Json); std::function<bool(void)> start_snap_rm(json11::Json);
std::function<bool(void)> start_alloc_osd(json11::Json cfg, uint64_t *out = NULL); std::function<bool(void)> start_alloc_osd(json11::Json cfg, uint64_t *out = NULL);
std::function<bool(void)> simple_offsets(json11::Json cfg); std::function<bool(void)> simple_offsets(json11::Json cfg);
void etcd_txn(json11::Json txn);
}; };
uint64_t parse_size(std::string size_str);
std::string print_table(json11::Json items, json11::Json header, bool use_esc);
std::string format_size(uint64_t size); std::string format_size(uint64_t size);
uint64_t parse_size(std::string size_str);
std::string format_lat(uint64_t lat);
std::string format_q(double depth);
bool stupid_glob(const std::string str, const std::string glob);

View File

@@ -13,6 +13,7 @@ struct alloc_osd_t
{ {
cli_tool_t *parent; cli_tool_t *parent;
json11::Json result;
uint64_t new_id = 1; uint64_t new_id = 1;
int state = 0; int state = 0;
@@ -28,7 +29,7 @@ struct alloc_osd_t
goto resume_1; goto resume_1;
do do
{ {
parent->etcd_txn(json11::Json::object { etcd_txn(json11::Json::object {
{ "compare", json11::Json::array { { "compare", json11::Json::array {
json11::Json::object { json11::Json::object {
{ "target", "VERSION" }, { "target", "VERSION" },
@@ -62,10 +63,10 @@ struct alloc_osd_t
state = 1; state = 1;
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
if (!parent->etcd_result["succeeded"].bool_value()) if (!result["succeeded"].bool_value())
{ {
std::vector<osd_num_t> used; std::vector<osd_num_t> used;
for (auto kv: parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items()) for (auto kv: result["responses"][0]["response_range"]["kvs"].array_items())
{ {
std::string key = base64_decode(kv["key"].string_value()); std::string key = base64_decode(kv["key"].string_value());
osd_num_t cur_osd; osd_num_t cur_osd;
@@ -97,9 +98,25 @@ struct alloc_osd_t
new_id = used[e-1]+1; new_id = used[e-1]+1;
} }
} }
} while (!parent->etcd_result["succeeded"].bool_value()); } while (!result["succeeded"].bool_value());
state = 100; state = 100;
} }
void etcd_txn(json11::Json txn)
{
parent->waiting++;
parent->cli->st_cli.etcd_txn(txn, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
this->result = res;
parent->ringloop->wakeup();
});
}
}; };
std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t *out) std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t *out)
@@ -107,7 +124,7 @@ std::function<bool(void)> cli_tool_t::start_alloc_osd(json11::Json cfg, uint64_t
json11::Json::array cmd = cfg["command"].array_items(); json11::Json::array cmd = cfg["command"].array_items();
auto alloc_osd = new alloc_osd_t(); auto alloc_osd = new alloc_osd_t();
alloc_osd->parent = this; alloc_osd->parent = this;
return [alloc_osd, out]() return [alloc_osd, &out]()
{ {
alloc_osd->loop(); alloc_osd->loop();
if (alloc_osd->is_done()) if (alloc_osd->is_done())

View File

@@ -31,6 +31,7 @@ struct image_creator_t
inode_t new_parent_id = 0; inode_t new_parent_id = 0;
inode_t new_id = 0, old_id = 0; inode_t new_id = 0, old_id = 0;
uint64_t max_id_mod_rev = 0, cfg_mod_rev = 0, idx_mod_rev = 0; uint64_t max_id_mod_rev = 0, cfg_mod_rev = 0, idx_mod_rev = 0;
json11::Json result;
int state = 0; int state = 0;
@@ -48,8 +49,7 @@ struct image_creator_t
auto & pools = parent->cli->st_cli.pool_config; auto & pools = parent->cli->st_cli.pool_config;
if (pools.find(new_pool_id) == pools.end()) if (pools.find(new_pool_id) == pools.end())
{ {
fprintf(stderr, "Pool %u does not exist\n", new_pool_id); new_pool_id = 0;
exit(1);
} }
} }
else if (new_pool_name != "") else if (new_pool_name != "")
@@ -62,17 +62,24 @@ struct image_creator_t
break; break;
} }
} }
if (!new_pool_id)
{
fprintf(stderr, "Pool %s does not exist\n", new_pool_name.c_str());
exit(1);
}
} }
else if (parent->cli->st_cli.pool_config.size() == 1) else if (parent->cli->st_cli.pool_config.size() == 1)
{ {
auto it = parent->cli->st_cli.pool_config.begin(); auto it = parent->cli->st_cli.pool_config.begin();
new_pool_id = it->first; new_pool_id = it->first;
} }
if (!new_pool_id)
{
if (new_pool_name == "")
{
fprintf(stderr, "Pool name or ID is missing\n");
}
else
{
fprintf(stderr, "Pool %s does not exist\n", new_pool_name.c_str());
}
exit(1);
}
state = 1; state = 1;
resume_1: resume_1:
if (new_snap == "") if (new_snap == "")
@@ -87,6 +94,11 @@ struct image_creator_t
goto resume_2; goto resume_2;
else if (state == 3) else if (state == 3)
goto resume_3; goto resume_3;
if (!size)
{
fprintf(stderr, "Image size is missing\n");
exit(1);
}
for (auto & ic: parent->cli->st_cli.inode_config) for (auto & ic: parent->cli->st_cli.inode_config)
{ {
if (ic.second.name == image_name) if (ic.second.name == image_name)
@@ -94,56 +106,29 @@ struct image_creator_t
fprintf(stderr, "Image %s already exists\n", image_name.c_str()); fprintf(stderr, "Image %s already exists\n", image_name.c_str());
exit(1); exit(1);
} }
if (ic.second.name == new_parent)
{
new_parent_id = ic.second.num;
if (!new_pool_id)
{
new_pool_id = INODE_POOL(ic.second.num);
}
if (!size)
{
size = ic.second.size;
}
}
}
if (new_parent != "" && !new_parent_id)
{
fprintf(stderr, "Parent image not found\n");
exit(1);
}
if (!new_pool_id)
{
fprintf(stderr, "Pool name or ID is missing\n");
exit(1);
}
if (!size)
{
fprintf(stderr, "Image size is missing\n");
exit(1);
} }
do do
{ {
parent->etcd_txn(json11::Json::object { etcd_txn(json11::Json::object {
{ "success", json11::Json::array { get_next_id() } } { "success", json11::Json::array { get_next_id() } }
}); });
state = 2; state = 2;
resume_2: resume_2:
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
extract_next_id(parent->etcd_result["responses"][0]); extract_next_id(result["responses"][0]);
attempt_create(); attempt_create();
state = 3; state = 3;
resume_3: resume_3:
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
if (!parent->etcd_result["succeeded"].bool_value() && if (!result["succeeded"].bool_value() &&
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0) result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
{ {
fprintf(stderr, "Image %s already exists\n", image_name.c_str()); fprintf(stderr, "Image %s already exists\n", image_name.c_str());
exit(1); exit(1);
} }
} while (!parent->etcd_result["succeeded"].bool_value()); } while (!result["succeeded"].bool_value());
if (parent->progress) if (parent->progress)
{ {
printf("Image %s created\n", image_name.c_str()); printf("Image %s created\n", image_name.c_str());
@@ -167,11 +152,6 @@ resume_3:
exit(1); exit(1);
} }
} }
if (new_parent != "")
{
fprintf(stderr, "--parent can't be used with snapshots\n");
exit(1);
}
do do
{ {
// In addition to next_id, get: size, old_id, old_pool_id, new_parent, cfg_mod_rev, idx_mod_rev // In addition to next_id, get: size, old_id, old_pool_id, new_parent, cfg_mod_rev, idx_mod_rev
@@ -185,23 +165,18 @@ resume_3:
fprintf(stderr, "Image %s does not exist\n", image_name.c_str()); fprintf(stderr, "Image %s does not exist\n", image_name.c_str());
exit(1); exit(1);
} }
if (!new_pool_id)
{
// Create snapshot in the same pool by default
new_pool_id = old_pool_id;
}
attempt_create(); attempt_create();
state = 4; state = 4;
resume_4: resume_4:
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
if (!parent->etcd_result["succeeded"].bool_value() && if (!result["succeeded"].bool_value() &&
parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items().size() > 0) result["responses"][0]["response_range"]["kvs"].array_items().size() > 0)
{ {
fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str()); fprintf(stderr, "Snapshot %s@%s already exists\n", image_name.c_str(), new_snap.c_str());
exit(1); exit(1);
} }
} while (!parent->etcd_result["succeeded"].bool_value()); } while (!result["succeeded"].bool_value());
if (parent->progress) if (parent->progress)
{ {
printf("Snapshot %s@%s created\n", image_name.c_str(), new_snap.c_str()); printf("Snapshot %s@%s created\n", image_name.c_str(), new_snap.c_str());
@@ -245,7 +220,7 @@ resume_4:
goto resume_2; goto resume_2;
else if (state == 3) else if (state == 3)
goto resume_3; goto resume_3;
parent->etcd_txn(json11::Json::object { { "success", json11::Json::array { etcd_txn(json11::Json::object { { "success", json11::Json::array {
get_next_id(), get_next_id(),
json11::Json::object { json11::Json::object {
{ "request_range", json11::Json::object { { "request_range", json11::Json::object {
@@ -259,11 +234,11 @@ resume_4:
resume_2: resume_2:
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
extract_next_id(parent->etcd_result["responses"][0]); extract_next_id(result["responses"][0]);
old_id = 0; old_id = 0;
old_pool_id = 0; old_pool_id = 0;
cfg_mod_rev = idx_mod_rev = 0; cfg_mod_rev = idx_mod_rev = 0;
if (parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items().size() == 0) if (result["responses"][1]["response_range"]["kvs"].array_items().size() == 0)
{ {
for (auto & ic: parent->cli->st_cli.inode_config) for (auto & ic: parent->cli->st_cli.inode_config)
{ {
@@ -282,7 +257,7 @@ resume_2:
{ {
// FIXME: Parse kvs in etcd_state_client automatically // FIXME: Parse kvs in etcd_state_client automatically
{ {
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][1]["response_range"]["kvs"][0]); auto kv = parent->cli->st_cli.parse_etcd_kv(result["responses"][1]["response_range"]["kvs"][0]);
old_id = INODE_NO_POOL(kv.value["id"].uint64_value()); old_id = INODE_NO_POOL(kv.value["id"].uint64_value());
old_pool_id = (pool_id_t)kv.value["pool_id"].uint64_value(); old_pool_id = (pool_id_t)kv.value["pool_id"].uint64_value();
idx_mod_rev = kv.mod_revision; idx_mod_rev = kv.mod_revision;
@@ -292,7 +267,7 @@ resume_2:
exit(1); exit(1);
} }
} }
parent->etcd_txn(json11::Json::object { etcd_txn(json11::Json::object {
{ "success", json11::Json::array { { "success", json11::Json::array {
json11::Json::object { json11::Json::object {
{ "request_range", json11::Json::object { { "request_range", json11::Json::object {
@@ -309,7 +284,7 @@ resume_3:
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
{ {
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]); auto kv = parent->cli->st_cli.parse_etcd_kv(result["responses"][0]["response_range"]["kvs"][0]);
size = kv.value["size"].uint64_value(); size = kv.value["size"].uint64_value();
new_parent_id = kv.value["parent_id"].uint64_value(); new_parent_id = kv.value["parent_id"].uint64_value();
uint64_t parent_pool_id = kv.value["parent_pool_id"].uint64_value(); uint64_t parent_pool_id = kv.value["parent_pool_id"].uint64_value();
@@ -438,20 +413,32 @@ resume_3:
} }, } },
}); });
}; };
parent->etcd_txn(json11::Json::object { etcd_txn(json11::Json::object {
{ "compare", checks }, { "compare", checks },
{ "success", success }, { "success", success },
{ "failure", failure }, { "failure", failure },
}); });
} }
void etcd_txn(json11::Json txn)
{
parent->waiting++;
parent->cli->st_cli.etcd_txn(txn, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
this->result = res;
parent->ringloop->wakeup();
});
}
}; };
uint64_t parse_size(std::string size_str) uint64_t parse_size(std::string size_str)
{ {
if (!size_str.length())
{
return 0;
}
uint64_t mul = 1; uint64_t mul = 1;
char type_char = tolower(size_str[size_str.length()-1]); char type_char = tolower(size_str[size_str.length()-1]);
if (type_char == 'k' || type_char == 'm' || type_char == 'g' || type_char == 't') if (type_char == 'k' || type_char == 'm' || type_char == 'g' || type_char == 't')

View File

@@ -1,219 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "cli.h"
#include "cluster_client.h"
#include "base64.h"
// List pools with space statistics
struct pool_lister_t
{
cli_tool_t *parent;
int state = 0;
json11::Json space_info;
std::map<pool_id_t, json11::Json::object> pool_stats;
bool is_done()
{
return state == 100;
}
void get_stats()
{
if (state == 1)
goto resume_1;
// Space statistics - pool/stats/<pool>
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pool/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pool/stats0"
) },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats0"
) },
} },
},
} },
});
state = 1;
resume_1:
if (parent->waiting > 0)
return;
space_info = parent->etcd_result;
std::map<pool_id_t, uint64_t> osd_free;
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID
pool_id_t pool_id;
char null_byte = 0;
sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// pool/stats/<N>
pool_stats[pool_id] = kv.value.object_items();
}
for (auto & kv_item: space_info["responses"][1]["response_range"]["kvs"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// osd ID
osd_num_t osd_num;
char null_byte = 0;
sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%lu%c", &osd_num, &null_byte);
if (!osd_num || osd_num >= POOL_ID_MAX || null_byte != 0)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// osd/stats/<N>::free
osd_free[osd_num] = kv.value["free"].uint64_value();
}
// Calculate max_avail for each pool
for (auto & pp: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pp.second;
uint64_t pool_avail = UINT64_MAX;
std::map<osd_num_t, uint64_t> pg_per_osd;
for (auto & pgp: pool_cfg.pg_config)
{
for (auto pg_osd: pgp.second.target_set)
{
if (pg_osd != 0)
{
pg_per_osd[pg_osd]++;
}
}
}
for (auto pg_per_pair: pg_per_osd)
{
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
if (pool_avail > pg_free)
{
pool_avail = pg_free;
}
}
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{
pool_avail = pool_avail * (pool_cfg.pg_size - pool_cfg.parity_chunks) / pool_stats[pool_cfg.id]["pg_real_size"].uint64_value();
}
pool_stats[pool_cfg.id] = json11::Json::object {
{ "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "jerasure" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * (1l<<40)) },
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * (1l<<40)) },
{ "max_available", pool_avail },
{ "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
{ "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
{ "pg_real_size", pool_stats[pool_cfg.id]["pg_real_size"].uint64_value() },
{ "failure_domain", pool_cfg.failure_domain },
};
}
}
json11::Json::array to_list()
{
json11::Json::array list;
for (auto & kv: pool_stats)
{
list.push_back(kv.second);
}
return list;
}
void loop()
{
get_stats();
if (parent->waiting > 0)
return;
if (parent->json_output)
{
// JSON output
printf("%s\n", json11::Json(to_list()).dump().c_str());
state = 100;
return;
}
// Table output: name, scheme_name, pg_count, total, used, max_avail, used%, efficiency
json11::Json::array cols;
cols.push_back(json11::Json::object{
{ "key", "name" },
{ "title", "NAME" },
});
cols.push_back(json11::Json::object{
{ "key", "scheme_name" },
{ "title", "SCHEME" },
});
cols.push_back(json11::Json::object{
{ "key", "pg_count" },
{ "title", "PGS" },
});
cols.push_back(json11::Json::object{
{ "key", "total_fmt" },
{ "title", "TOTAL" },
});
cols.push_back(json11::Json::object{
{ "key", "used_fmt" },
{ "title", "USED" },
});
cols.push_back(json11::Json::object{
{ "key", "max_avail_fmt" },
{ "title", "AVAILABLE" },
});
cols.push_back(json11::Json::object{
{ "key", "used_pct" },
{ "title", "USED%" },
});
cols.push_back(json11::Json::object{
{ "key", "eff_fmt" },
{ "title", "EFFICIENCY" },
});
json11::Json::array list;
for (auto & kv: pool_stats)
{
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / kv.second["raw_to_usable"].number_value());
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
kv.second["used_pct"] = format_q(100 - 100*kv.second["max_available"].uint64_value() *
kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())+"%";
kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
}
printf("%s", print_table(to_list(), cols, parent->color).c_str());
state = 100;
}
};
std::function<bool(void)> cli_tool_t::start_df(json11::Json cfg)
{
json11::Json::array cmd = cfg["command"].array_items();
auto lister = new pool_lister_t();
lister->parent = this;
return [lister]()
{
lister->loop();
if (lister->is_done())
{
delete lister;
return true;
}
return false;
};
}

View File

@@ -3,7 +3,6 @@
#include "cli.h" #include "cli.h"
#include "cluster_client.h" #include "cluster_client.h"
#include <sys/stat.h>
// Flatten a layer: merge all parents into a layer and break the connection completely // Flatten a layer: merge all parents into a layer and break the connection completely
struct snap_flattener_t struct snap_flattener_t

View File

@@ -6,6 +6,16 @@
#include "cluster_client.h" #include "cluster_client.h"
#include "base64.h" #include "base64.h"
#define MIN(a, b) ((a) < (b) ? (b) : (a))
std::string print_table(json11::Json items, json11::Json header, bool use_esc);
std::string format_size(uint64_t size);
std::string format_lat(uint64_t lat);
std::string format_q(double depth);
// List existing images // List existing images
// //
// Again, you can just look into etcd, but this console tool incapsulates it // Again, you can just look into etcd, but this console tool incapsulates it
@@ -84,7 +94,8 @@ struct image_lister_t
// Space statistics // Space statistics
// inode/stats/<pool>/<inode>::raw_used divided by pool/stats/<pool>::pg_real_size // inode/stats/<pool>/<inode>::raw_used divided by pool/stats/<pool>::pg_real_size
// multiplied by 1 or number of data drives // multiplied by 1 or number of data drives
parent->etcd_txn(json11::Json::object { parent->waiting++;
parent->cli->st_cli.etcd_txn(json11::Json::object {
{ "success", json11::Json::array { { "success", json11::Json::array {
json11::Json::object { json11::Json::object {
{ "request_range", json11::Json::object { { "request_range", json11::Json::object {
@@ -111,12 +122,21 @@ struct image_lister_t
} }, } },
}, },
} }, } },
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{
parent->waiting--;
if (err != "")
{
fprintf(stderr, "Error reading from etcd: %s\n", err.c_str());
exit(1);
}
space_info = res;
parent->ringloop->wakeup();
}); });
state = 1; state = 1;
resume_1: resume_1:
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
space_info = parent->etcd_result;
std::map<pool_id_t, uint64_t> pool_pg_real_size; std::map<pool_id_t, uint64_t> pool_pg_real_size;
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items()) for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
{ {
@@ -193,21 +213,10 @@ resume_1:
json11::Json::array list; json11::Json::array list;
for (auto & kv: stats) for (auto & kv: stats)
{ {
if (!only_names.size()) if (!only_names.size() || only_names.find(kv.second["name"].string_value()) != only_names.end())
{ {
list.push_back(kv.second); list.push_back(kv.second);
} }
else
{
for (auto glob: only_names)
{
if (stupid_glob(kv.second["name"].string_value(), glob))
{
list.push_back(kv.second);
break;
}
}
}
} }
if (sort_field == "name" || sort_field == "pool_name") if (sort_field == "name" || sort_field == "pool_name")
{ {
@@ -346,9 +355,6 @@ resume_1:
kv.second["read_bw"] = format_size(kv.second["read_bps"].uint64_value())+"/s"; kv.second["read_bw"] = format_size(kv.second["read_bps"].uint64_value())+"/s";
kv.second["write_bw"] = format_size(kv.second["write_bps"].uint64_value())+"/s"; kv.second["write_bw"] = format_size(kv.second["write_bps"].uint64_value())+"/s";
kv.second["delete_bw"] = format_size(kv.second["delete_bps"].uint64_value())+"/s"; kv.second["delete_bw"] = format_size(kv.second["delete_bps"].uint64_value())+"/s";
kv.second["read_iops"] = format_q(kv.second["read_iops"].number_value());
kv.second["write_iops"] = format_q(kv.second["write_iops"].number_value());
kv.second["delete_iops"] = format_q(kv.second["delete_iops"].number_value());
kv.second["read_lat_f"] = format_lat(kv.second["read_lat"].uint64_value()); kv.second["read_lat_f"] = format_lat(kv.second["read_lat"].uint64_value());
kv.second["write_lat_f"] = format_lat(kv.second["write_lat"].uint64_value()); kv.second["write_lat_f"] = format_lat(kv.second["write_lat"].uint64_value());
kv.second["delete_lat_f"] = format_lat(kv.second["delete_lat"].uint64_value()); kv.second["delete_lat_f"] = format_lat(kv.second["delete_lat"].uint64_value());
@@ -487,62 +493,6 @@ std::string format_q(double depth)
return std::string(buf); return std::string(buf);
} }
struct glob_stack_t
{
int glob_pos;
int str_pos;
};
// Yes I know I could do it by translating the pattern to std::regex O:-)
bool stupid_glob(const std::string str, const std::string glob)
{
std::vector<glob_stack_t> wildcards;
int pos = 0, gp = 0;
bool m;
back:
while (true)
{
if (gp >= glob.length())
{
if (pos >= str.length())
return true;
m = false;
}
else if (glob[gp] == '*')
{
wildcards.push_back((glob_stack_t){ .glob_pos = ++gp, .str_pos = pos });
continue;
}
else if (glob[gp] == '?')
m = pos < str.size();
else
{
if (glob[gp] == '\\' && gp < glob.length()-1)
gp++;
m = pos < str.size() && str[pos] == glob[gp];
}
if (!m)
{
while (wildcards.size() > 0)
{
// Backtrack
pos = (++wildcards[wildcards.size()-1].str_pos);
if (pos > str.size())
wildcards.pop_back();
else
{
gp = wildcards[wildcards.size()-1].glob_pos;
goto back;
}
}
return false;
}
pos++;
gp++;
}
return true;
}
std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg) std::function<bool(void)> cli_tool_t::start_ls(json11::Json cfg)
{ {
json11::Json::array cmd = cfg["command"].array_items(); json11::Json::array cmd = cfg["command"].array_items();

View File

@@ -412,7 +412,7 @@ struct snap_merger_t
uint64_t bitmap_size = target_block_size / gran; uint64_t bitmap_size = target_block_size / gran;
while (rwo->end < bitmap_size) while (rwo->end < bitmap_size)
{ {
auto bit = ((*((uint8_t*)rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7))); auto bit = ((*(uint8_t*)(rwo->op.bitmap_buf + (rwo->end >> 3))) & (1 << (rwo->end & 0x7)));
if (!bit) if (!bit)
{ {
if (rwo->end > rwo->start) if (rwo->end > rwo->start)
@@ -459,7 +459,7 @@ struct snap_merger_t
subop->len = end-start; subop->len = end-start;
subop->version = version; subop->version = version;
subop->flags = OSD_OP_IGNORE_READONLY; subop->flags = OSD_OP_IGNORE_READONLY;
subop->iov.push_back((uint8_t*)rwo->buf+start, end-start); subop->iov.push_back(rwo->buf+start, end-start);
subop->callback = [this, rwo](cluster_op_t *subop) subop->callback = [this, rwo](cluster_op_t *subop)
{ {
rwo->todo--; rwo->todo--;
@@ -495,7 +495,7 @@ struct snap_merger_t
subop->offset = offset; subop->offset = offset;
subop->len = 0; subop->len = 0;
subop->flags = OSD_OP_IGNORE_READONLY; subop->flags = OSD_OP_IGNORE_READONLY;
subop->callback = [](cluster_op_t *subop) subop->callback = [this](cluster_op_t *subop)
{ {
if (subop->retval != 0) if (subop->retval != 0)
{ {
@@ -519,10 +519,10 @@ struct snap_merger_t
deleted_unsynced++; deleted_unsynced++;
if (deleted_unsynced >= fsync_interval) if (deleted_unsynced >= fsync_interval)
{ {
uint64_t to = last_written_offset; uint64_t from = last_fsync_offset, to = last_written_offset;
cluster_op_t *subop = new cluster_op_t; cluster_op_t *subop = new cluster_op_t;
subop->opcode = OSD_OP_SYNC; subop->opcode = OSD_OP_SYNC;
subop->callback = [this, to](cluster_op_t *subop) subop->callback = [this, from, to](cluster_op_t *subop)
{ {
delete subop; delete subop;
// We can now delete source data between <from> and <to> // We can now delete source data between <from> and <to>

View File

@@ -5,13 +5,12 @@
#include "cluster_client.h" #include "cluster_client.h"
#include "base64.h" #include "base64.h"
// Rename, resize image (and purge extra data on shrink) or change its readonly status // Resize image (purging extra data on shrink) or change its readonly status
struct image_changer_t struct image_changer_t
{ {
cli_tool_t *parent; cli_tool_t *parent;
std::string image_name; std::string image_name;
std::string new_name;
uint64_t new_size = 0; uint64_t new_size = 0;
bool set_readonly = false, set_readwrite = false, force = false; bool set_readonly = false, set_readwrite = false, force = false;
// interval between fsyncs // interval between fsyncs
@@ -19,7 +18,7 @@ struct image_changer_t
uint64_t inode_num = 0; uint64_t inode_num = 0;
inode_config_t cfg; inode_config_t cfg;
json11::Json::array checks, success; std::string cur_cfg_key;
bool has_children = false; bool has_children = false;
int state = 0; int state = 0;
@@ -44,11 +43,6 @@ struct image_changer_t
cfg = ic.second; cfg = ic.second;
break; break;
} }
if (new_name != "" && ic.second.name == new_name)
{
fprintf(stderr, "Image %s already exists\n", new_name.c_str());
exit(1);
}
} }
if (!inode_num) if (!inode_num)
{ {
@@ -63,23 +57,14 @@ struct image_changer_t
break; break;
} }
} }
if ((!set_readwrite || !cfg.readonly) &&
(!set_readonly || cfg.readonly) &&
(!new_size || cfg.size == new_size) &&
(new_name == "" || new_name == image_name))
{
printf("No change\n");
state = 100;
return;
}
if (new_size != 0) if (new_size != 0)
{ {
if (cfg.size >= new_size) if (cfg.size >= new_size)
{ {
// Check confirmation when trimming an image with children // Check confirmation if trimming an image with children
if (has_children && !force) if (has_children && !force)
{ {
fprintf(stderr, "Image %s has children. Refusing to shrink it without --force\n", image_name.c_str()); fprintf(stderr, "Image %s has children. Refusing to shrink it without --force", image_name.c_str());
exit(1); exit(1);
} }
// Shrink the image first // Shrink the image first
@@ -106,84 +91,56 @@ resume_1:
if (set_readwrite) if (set_readwrite)
{ {
cfg.readonly = false; cfg.readonly = false;
// Check confirmation when making an image with children read-write // Check confirmation if trimming an image with children
if (has_children && !force) if (!force)
{ {
fprintf(stderr, "Image %s has children. Refusing to make it read-write without --force\n", image_name.c_str()); fprintf(stderr, "Image %s has children. Refusing to make it read-write without --force", image_name.c_str());
exit(1); exit(1);
} }
} }
if (new_name != "") cur_cfg_key = base64_encode(parent->cli->st_cli.etcd_prefix+
{ "/config/inode/"+std::to_string(INODE_POOL(inode_num))+
cfg.name = new_name; "/"+std::to_string(INODE_NO_POOL(inode_num)));
} parent->waiting++;
{ parent->cli->st_cli.etcd_txn(json11::Json::object {
std::string cur_cfg_key = base64_encode(parent->cli->st_cli.etcd_prefix+ { "compare", json11::Json::array {
"/config/inode/"+std::to_string(INODE_POOL(inode_num))+ json11::Json::object {
"/"+std::to_string(INODE_NO_POOL(inode_num))); { "target", "MOD" },
checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", cur_cfg_key },
{ "result", "LESS" },
{ "mod_revision", cfg.mod_revision+1 },
});
success.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", cur_cfg_key }, { "key", cur_cfg_key },
{ "value", base64_encode(json11::Json( { "result", "LESS" },
parent->cli->st_cli.serialize_inode_cfg(&cfg) { "mod_revision", cfg.mod_revision+1 },
).dump()) }, },
} } } },
}); { "success", json11::Json::array {
} json11::Json::object {
if (new_name != "") { "request_put", json11::Json::object {
{ "key", cur_cfg_key },
{ "value", base64_encode(json11::Json(
parent->cli->st_cli.serialize_inode_cfg(&cfg)
).dump()) },
} }
},
} },
}, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json res)
{ {
std::string old_idx_key = base64_encode( if (err != "")
parent->cli->st_cli.etcd_prefix+"/index/image/"+image_name {
); fprintf(stderr, "Error changing %s: %s\n", image_name.c_str(), err.c_str());
std::string new_idx_key = base64_encode( exit(1);
parent->cli->st_cli.etcd_prefix+"/index/image/"+new_name }
); if (!res["succeeded"].bool_value())
checks.push_back(json11::Json::object { {
{ "target", "MOD" }, fprintf(stderr, "Image %s was modified by someone else, please repeat your request\n", image_name.c_str());
{ "key", old_idx_key }, exit(1);
{ "result", "LESS" }, }
{ "mod_revision", cfg.mod_revision+1 }, parent->waiting--;
}); parent->ringloop->wakeup();
checks.push_back(json11::Json::object {
{ "target", "VERSION" },
{ "version", 0 },
{ "key", new_idx_key },
});
success.push_back(json11::Json::object {
{ "request_delete_range", json11::Json::object {
{ "key", old_idx_key },
} }
});
success.push_back(json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", new_idx_key },
{ "value", base64_encode(json11::Json(json11::Json::object{
{ "id", INODE_NO_POOL(inode_num) },
{ "pool_id", (uint64_t)INODE_POOL(inode_num) },
}).dump()) },
} }
});
}
parent->etcd_txn(json11::Json::object {
{ "compare", checks },
{ "success", success },
}); });
state = 2; state = 2;
resume_2: resume_2:
if (parent->waiting > 0) if (parent->waiting > 0)
return; return;
if (!parent->etcd_result["succeeded"].bool_value()) printf("Image %s changed\n", image_name.c_str());
{
fprintf(stderr, "Image %s was modified by someone else, please repeat your request\n", image_name.c_str());
exit(1);
}
printf("Image %s modified\n", image_name.c_str());
state = 100; state = 100;
} }
}; };
@@ -199,8 +156,7 @@ std::function<bool(void)> cli_tool_t::start_modify(json11::Json cfg)
fprintf(stderr, "Image name is missing\n"); fprintf(stderr, "Image name is missing\n");
exit(1); exit(1);
} }
changer->new_name = cfg["rename"].string_value(); changer->new_size = cfg["size"].uint64_value();
changer->new_size = parse_size(cfg["resize"].string_value());
if (changer->new_size != 0 && (changer->new_size % 4096)) if (changer->new_size != 0 && (changer->new_size % 4096))
{ {
fprintf(stderr, "Image size should be a multiple of 4096\n"); fprintf(stderr, "Image size should be a multiple of 4096\n");

View File

@@ -8,7 +8,6 @@
#include "cli.h" #include "cli.h"
#include "cluster_client.h" #include "cluster_client.h"
#include "base64.h" #include "base64.h"
#include <sys/stat.h>
// Calculate offsets for a block device and print OSD command line parameters // Calculate offsets for a block device and print OSD command line parameters
std::function<bool(void)> cli_tool_t::simple_offsets(json11::Json cfg) std::function<bool(void)> cli_tool_t::simple_offsets(json11::Json cfg)

View File

@@ -256,9 +256,9 @@ resume_9:
}); });
} }
parent->waiting++; parent->waiting++;
parent->cli->st_cli.etcd_txn_slow(json11::Json::object { parent->cli->st_cli.etcd_txn(json11::Json::object {
{ "success", reads }, { "success", reads },
}, [this](std::string err, json11::Json data) }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
{ {
parent->waiting--; parent->waiting--;
if (err != "") if (err != "")
@@ -414,10 +414,10 @@ resume_9:
} }
} }
parent->waiting++; parent->waiting++;
parent->cli->st_cli.etcd_txn_slow(json11::Json::object { parent->cli->st_cli.etcd_txn(json11::Json::object {
{ "compare", cmp }, { "compare", cmp },
{ "success", txn }, { "success", txn },
}, [this, target_name, child_name](std::string err, json11::Json res) }, ETCD_SLOW_TIMEOUT, [this, target_name, child_name](std::string err, json11::Json res)
{ {
parent->waiting--; parent->waiting--;
if (err != "") if (err != "")
@@ -454,7 +454,7 @@ resume_9:
"/"+std::to_string(INODE_NO_POOL(cur)) "/"+std::to_string(INODE_NO_POOL(cur))
); );
parent->waiting++; parent->waiting++;
parent->cli->st_cli.etcd_txn_slow(json11::Json::object { parent->cli->st_cli.etcd_txn(json11::Json::object {
{ "compare", json11::Json::array { { "compare", json11::Json::array {
json11::Json::object { json11::Json::object {
{ "target", "MOD" }, { "target", "MOD" },
@@ -468,14 +468,12 @@ resume_9:
{ "request_delete_range", json11::Json::object { { "request_delete_range", json11::Json::object {
{ "key", cur_cfg_key }, { "key", cur_cfg_key },
} }, } },
},
json11::Json::object {
{ "request_delete_range", json11::Json::object { { "request_delete_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) }, { "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/index/image/"+cur_name) },
} }, } },
}, },
} }, } },
}, [this, cur_name](std::string err, json11::Json res) }, ETCD_SLOW_TIMEOUT, [this, cur_name](std::string err, json11::Json res)
{ {
parent->waiting--; parent->waiting--;
if (err != "") if (err != "")

View File

@@ -64,7 +64,6 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); }; st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_hook(changes); }; st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_hook(changes); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); }; st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
st_cli.parse_config(config); st_cli.parse_config(config);
st_cli.load_global_config(); st_cli.load_global_config();
@@ -534,8 +533,8 @@ void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_
unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos); unsigned iov_len = (op->iov.buf[iov_idx].iov_len - iov_pos);
if (iov_len <= cur_len) if (iov_len <= cur_len)
{ {
memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe, memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
(uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, iov_len); op->iov.buf[iov_idx].iov_base + iov_pos, iov_len);
pos += iov_len; pos += iov_len;
len -= iov_len; len -= iov_len;
cur_len -= iov_len; cur_len -= iov_len;
@@ -544,8 +543,8 @@ void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_
} }
else else
{ {
memcpy((uint8_t*)dirty_it->second.buf + pos - dirty_it->first.stripe, memcpy(dirty_it->second.buf + pos - dirty_it->first.stripe,
(uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_len); op->iov.buf[iov_idx].iov_base + iov_pos, cur_len);
pos += cur_len; pos += cur_len;
len -= cur_len; len -= cur_len;
iov_pos += cur_len; iov_pos += cur_len;
@@ -762,7 +761,7 @@ static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t
{ {
if (!skip) if (!skip)
{ {
iov.push_back((uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, cur_left); iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, cur_left);
} }
left -= cur_left; left -= cur_left;
iov_pos = 0; iov_pos = 0;
@@ -772,7 +771,7 @@ static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t
{ {
if (!skip) if (!skip)
{ {
iov.push_back((uint8_t*)op->iov.buf[iov_idx].iov_base + iov_pos, left); iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
} }
iov_pos += left; iov_pos += left;
left = 0; left = 0;
@@ -817,7 +816,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
// First allocation // First allocation
memset(op->bitmap_buf, 0, object_bitmap_size); memset(op->bitmap_buf, 0, object_bitmap_size);
} }
op->part_bitmaps = (uint8_t*)op->bitmap_buf + object_bitmap_size; op->part_bitmaps = op->bitmap_buf + object_bitmap_size;
op->bitmap_buf_size = bitmap_mem; op->bitmap_buf_size = bitmap_mem;
} }
} }
@@ -839,7 +838,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
while (cur < end) while (cur < end)
{ {
unsigned bmp_loc = (cur - op->offset)/bs_bitmap_granularity; unsigned bmp_loc = (cur - op->offset)/bs_bitmap_granularity;
bool skip = (((*((uint8_t*)op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1); bool skip = (((*(uint8_t*)(op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
if (skip_prev != skip) if (skip_prev != skip)
{ {
if (cur > prev) if (cur > prev)
@@ -944,7 +943,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.meta_revision = meta_rev, .meta_revision = meta_rev,
.version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0, .version = op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_DELETE ? op->version : 0,
} }, } },
.bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? (uint8_t*)op->part_bitmaps + pg_bitmap_size*i : NULL), .bitmap = (op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? op->part_bitmaps + pg_bitmap_size*i : NULL),
.bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0), .bitmap_len = (unsigned)(op->opcode == OSD_OP_READ || op->opcode == OSD_OP_READ_BITMAP ? pg_bitmap_size : 0),
.callback = [this, part](osd_op_t *op_part) .callback = [this, part](osd_op_t *op_part)
{ {
@@ -1155,7 +1154,7 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8)) if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
{ {
// Copy bytes // Copy bytes
mem_or((uint8_t*)op->bitmap_buf + object_offset/8, (uint8_t*)part->op.bitmap + part_offset/8, part_len/8); mem_or(op->bitmap_buf + object_offset/8, part->op.bitmap + part_offset/8, part_len/8);
object_offset += (part_len & ~0x7); object_offset += (part_len & ~0x7);
part_offset += (part_len & ~0x7); part_offset += (part_len & ~0x7);
part_len = (part_len & 0x7); part_len = (part_len & 0x7);
@@ -1163,8 +1162,8 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
while (part_len > 0) while (part_len > 0)
{ {
// Copy bits // Copy bits
(*((uint8_t*)op->bitmap_buf + (object_offset >> 3))) |= ( (*(uint8_t*)(op->bitmap_buf + (object_offset >> 3))) |= (
(((*((uint8_t*)part->op.bitmap + (part_offset >> 3))) >> (part_offset & 0x7)) & 0x1) << (object_offset & 0x7) (((*(uint8_t*)(part->op.bitmap + (part_offset >> 3))) >> (part_offset & 0x7)) & 0x1) << (object_offset & 0x7)
); );
part_offset++; part_offset++;
object_offset++; object_offset++;

View File

@@ -75,7 +75,7 @@ int main(int argc, char *argv[])
uint64_t s; uint64_t s;
for (s = 0; s < self.journal_block; s += 8) for (s = 0; s < self.journal_block; s += 8)
{ {
if (*((uint64_t*)((uint8_t*)data+s)) != 0) if (*((uint64_t*)(data+s)) != 0)
break; break;
} }
if (s == self.journal_block) if (s == self.journal_block)
@@ -139,7 +139,7 @@ int journal_dump_t::dump_block(void *buf)
bool wrapped = false; bool wrapped = false;
while (pos < journal_block) while (pos < journal_block)
{ {
journal_entry *je = (journal_entry*)((uint8_t*)buf + pos); journal_entry *je = (journal_entry*)(buf + pos);
if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX || if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
!all && started && je->crc32_prev != crc32_last) !all && started && je->crc32_prev != crc32_last)
{ {

View File

@@ -5,7 +5,6 @@
#include "pg_states.h" #include "pg_states.h"
#include "etcd_state_client.h" #include "etcd_state_client.h"
#ifndef __MOCK__ #ifndef __MOCK__
#include "addr_util.h"
#include "http_client.h" #include "http_client.h"
#include "base64.h" #include "base64.h"
#endif #endif
@@ -18,22 +17,12 @@ etcd_state_client_t::~etcd_state_client_t()
} }
watches.clear(); watches.clear();
etcd_watches_initialised = -1; etcd_watches_initialised = -1;
if (ws_keepalive_timer >= 0)
{
tfd->clear_timer(ws_keepalive_timer);
ws_keepalive_timer = -1;
}
#ifndef __MOCK__ #ifndef __MOCK__
if (etcd_watch_ws) if (etcd_watch_ws)
{ {
http_close(etcd_watch_ws); etcd_watch_ws->close();
etcd_watch_ws = NULL; etcd_watch_ws = NULL;
} }
if (keepalive_client)
{
http_close(keepalive_client);
keepalive_client = NULL;
}
#endif #endif
} }
@@ -54,26 +43,19 @@ etcd_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
return kv; return kv;
} }
void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback) void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback)
{ {
etcd_call("/kv/txn", txn, timeout, retries, interval, callback); etcd_call("/kv/txn", txn, timeout, callback);
} }
void etcd_state_client_t::etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback) void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
{ {
etcd_call("/kv/txn", txn, etcd_slow_timeout, max_etcd_attempts, 0, callback); if (!etcd_addresses.size())
}
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout,
int retries, int interval, std::function<void(std::string, json11::Json)> callback)
{
if (!etcd_addresses.size() && !etcd_local.size())
{ {
fprintf(stderr, "etcd_address is missing in Vitastor configuration\n"); fprintf(stderr, "etcd_address is missing in Vitastor configuration\n");
exit(1); exit(1);
} }
pick_next_etcd(); std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
std::string etcd_address = selected_etcd_address;
std::string etcd_api_path; std::string etcd_api_path;
int pos = etcd_address.find('/'); int pos = etcd_address.find('/');
if (pos >= 0) if (pos >= 0)
@@ -86,49 +68,9 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
"Host: "+etcd_address+"\r\n" "Host: "+etcd_address+"\r\n"
"Content-Type: application/json\r\n" "Content-Type: application/json\r\n"
"Content-Length: "+std::to_string(req.size())+"\r\n" "Content-Length: "+std::to_string(req.size())+"\r\n"
"Connection: keep-alive\r\n" "Connection: close\r\n"
"Keep-Alive: timeout="+std::to_string(etcd_keepalive_timeout)+"\r\n"
"\r\n"+req; "\r\n"+req;
auto cb = [this, api, payload, timeout, retries, interval, callback, http_request_json(tfd, etcd_address, req, timeout, callback);
cur_addr = selected_etcd_address](const http_response_t *response)
{
std::string err;
json11::Json data;
response->parse_json_response(err, data);
if (err != "")
{
if (cur_addr == selected_etcd_address)
selected_etcd_address = "";
if (retries > 0)
{
if (this->log_level > 0)
{
printf(
"Warning: etcd request failed: %s, retrying %d more times\n",
err.c_str(), retries
);
}
if (interval > 0)
{
tfd->set_timer(interval, false, [this, api, payload, timeout, retries, interval, callback](int)
{
etcd_call(api, payload, timeout, retries-1, interval, callback);
});
}
else
etcd_call(api, payload, timeout, retries-1, interval, callback);
}
else
callback(err, data);
}
else
callback(err, data);
};
if (!keepalive_client)
{
keepalive_client = http_init(tfd);
}
http_request(keepalive_client, etcd_address, req, { .timeout = timeout, .keepalive = true }, cb);
} }
void etcd_state_client_t::add_etcd_url(std::string addr) void etcd_state_client_t::add_etcd_url(std::string addr)
@@ -142,30 +84,9 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
fprintf(stderr, "HTTPS is unsupported for etcd. Either use plain HTTP or setup a local proxy for etcd interaction\n"); fprintf(stderr, "HTTPS is unsupported for etcd. Either use plain HTTP or setup a local proxy for etcd interaction\n");
exit(1); exit(1);
} }
if (!local_ips.size()) if (addr.find('/') == std::string::npos)
local_ips = getifaddr_list();
std::string check_addr;
int pos = addr.find('/');
int pos2 = addr.find(':');
if (pos2 >= 0)
check_addr = addr.substr(0, pos2);
else if (pos >= 0)
check_addr = addr.substr(0, pos);
else
check_addr = addr;
if (pos == std::string::npos)
addr += "/v3"; addr += "/v3";
int i; this->etcd_addresses.push_back(addr);
for (i = 0; i < local_ips.size(); i++)
{
if (local_ips[i] == check_addr)
{
this->etcd_local.push_back(addr);
break;
}
}
if (i >= local_ips.size())
this->etcd_addresses.push_back(addr);
} }
} }
@@ -202,74 +123,16 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
this->etcd_prefix = "/"+this->etcd_prefix; this->etcd_prefix = "/"+this->etcd_prefix;
} }
this->log_level = config["log_level"].int64_value(); this->log_level = config["log_level"].int64_value();
this->etcd_keepalive_timeout = config["etcd_keepalive_timeout"].uint64_value();
if (this->etcd_keepalive_timeout <= 0)
{
this->etcd_keepalive_timeout = config["etcd_report_interval"].uint64_value() * 2;
if (this->etcd_keepalive_timeout < 30)
this->etcd_keepalive_timeout = 30;
}
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
if (this->etcd_ws_keepalive_interval <= 0)
{
this->etcd_ws_keepalive_interval = 30;
}
this->max_etcd_attempts = config["max_etcd_attempts"].uint64_value();
if (this->max_etcd_attempts <= 0)
{
this->max_etcd_attempts = 5;
}
this->etcd_slow_timeout = config["etcd_slow_timeout"].uint64_value();
if (this->etcd_slow_timeout <= 0)
{
this->etcd_slow_timeout = 5000;
}
this->etcd_quick_timeout = config["etcd_quick_timeout"].uint64_value();
if (this->etcd_quick_timeout <= 0)
{
this->etcd_quick_timeout = 1000;
}
}
void etcd_state_client_t::pick_next_etcd()
{
if (selected_etcd_address != "")
return;
if (addresses_to_try.size() == 0)
{
// Prefer local etcd, if any
for (int i = 0; i < etcd_local.size(); i++)
addresses_to_try.push_back(etcd_local[i]);
std::vector<int> ns;
for (int i = 0; i < etcd_addresses.size(); i++)
ns.push_back(i);
if (!rand_initialized)
{
timespec tv;
clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
rand_initialized = true;
}
while (ns.size())
{
int i = lrand48() % ns.size();
addresses_to_try.push_back(etcd_addresses[ns[i]]);
ns.erase(ns.begin()+i, ns.begin()+i+1);
}
}
selected_etcd_address = addresses_to_try[0];
addresses_to_try.erase(addresses_to_try.begin(), addresses_to_try.begin()+1);
} }
void etcd_state_client_t::start_etcd_watcher() void etcd_state_client_t::start_etcd_watcher()
{ {
if (!etcd_addresses.size() && !etcd_local.size()) if (!etcd_addresses.size())
{ {
fprintf(stderr, "etcd_address is missing in Vitastor configuration\n"); fprintf(stderr, "etcd_address is missing in Vitastor configuration\n");
exit(1); exit(1);
} }
pick_next_etcd(); std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
std::string etcd_address = selected_etcd_address;
std::string etcd_api_path; std::string etcd_api_path;
int pos = etcd_address.find('/'); int pos = etcd_address.find('/');
if (pos >= 0) if (pos >= 0)
@@ -278,20 +141,10 @@ void etcd_state_client_t::start_etcd_watcher()
etcd_address = etcd_address.substr(0, pos); etcd_address = etcd_address.substr(0, pos);
} }
etcd_watches_initialised = 0; etcd_watches_initialised = 0;
ws_alive = 1; etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", ETCD_SLOW_TIMEOUT, [this](const http_response_t *msg)
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
if (this->log_level > 1)
printf("Trying to connect to etcd websocket at %s\n", etcd_address.c_str());
etcd_watch_ws = open_websocket(tfd, etcd_address, etcd_api_path+"/watch", etcd_slow_timeout,
[this, cur_addr = selected_etcd_address](const http_response_t *msg)
{ {
if (msg->body.length()) if (msg->body.length())
{ {
ws_alive = 1;
std::string json_err; std::string json_err;
json11::Json data = json11::Json::parse(msg->body, json_err); json11::Json data = json11::Json::parse(msg->body, json_err);
if (json_err != "") if (json_err != "")
@@ -302,46 +155,11 @@ void etcd_state_client_t::start_etcd_watcher()
{ {
if (data["result"]["created"].bool_value()) if (data["result"]["created"].bool_value())
{ {
if (etcd_watches_initialised == 3 && this->log_level > 0)
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
etcd_watches_initialised++; etcd_watches_initialised++;
} }
if (data["result"]["canceled"].bool_value())
{
// etcd watch canceled, maybe because the revision was compacted
if (data["result"]["compact_revision"].uint64_value())
{
// we may miss events if we proceed
// so we should restart from the beginning if we can
if (on_reload_hook != NULL)
{
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
data["result"]["compact_revision"].uint64_value());
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
etcd_watch_revision = 0;
on_reload_hook();
}
else
{
fprintf(stderr, "Revisions before %lu were compacted by etcd, exiting\n",
data["result"]["compact_revision"].uint64_value());
exit(1);
}
}
else
{
fprintf(stderr, "Watch canceled by etcd, reason: %s, exiting\n", data["result"]["cancel_reason"].string_value().c_str());
exit(1);
}
}
if (etcd_watches_initialised == 4) if (etcd_watches_initialised == 4)
{ {
etcd_watch_revision = data["result"]["header"]["revision"].uint64_value()+1; etcd_watch_revision = data["result"]["header"]["revision"].uint64_value();
addresses_to_try.clear();
} }
// First gather all changes into a hash to remove multiple overwrites // First gather all changes into a hash to remove multiple overwrites
std::map<std::string, etcd_kv_t> changes; std::map<std::string, etcd_kv_t> changes;
@@ -370,22 +188,11 @@ void etcd_state_client_t::start_etcd_watcher()
} }
if (msg->eof) if (msg->eof)
{ {
if (cur_addr == selected_etcd_address) etcd_watch_ws = NULL;
{
fprintf(stderr, "Disconnected from etcd %s\n", selected_etcd_address.c_str());
selected_etcd_address = "";
}
else
fprintf(stderr, "Disconnected from etcd\n");
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
if (etcd_watches_initialised == 0) if (etcd_watches_initialised == 0)
{ {
// Connection not established, retry in <etcd_quick_timeout> // Connection not established, retry in <ETCD_SLOW_TIMEOUT>
tfd->set_timer(etcd_quick_timeout, false, [this](int) tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int)
{ {
start_etcd_watcher(); start_etcd_watcher();
}); });
@@ -397,84 +204,54 @@ void etcd_state_client_t::start_etcd_watcher()
} }
} }
}); });
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object { etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object { { "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/") }, { "key", base64_encode(etcd_prefix+"/config/") },
{ "range_end", base64_encode(etcd_prefix+"/config0") }, { "range_end", base64_encode(etcd_prefix+"/config0") },
{ "start_revision", etcd_watch_revision }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_CONFIG_WATCH_ID }, { "watch_id", ETCD_CONFIG_WATCH_ID },
{ "progress_notify", true }, { "progress_notify", true },
} } } }
}).dump()); }).dump());
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object { etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object { { "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/osd/state/") }, { "key", base64_encode(etcd_prefix+"/osd/state/") },
{ "range_end", base64_encode(etcd_prefix+"/osd/state0") }, { "range_end", base64_encode(etcd_prefix+"/osd/state0") },
{ "start_revision", etcd_watch_revision }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_OSD_STATE_WATCH_ID }, { "watch_id", ETCD_OSD_STATE_WATCH_ID },
{ "progress_notify", true }, { "progress_notify", true },
} } } }
}).dump()); }).dump());
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object { etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object { { "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/state/") }, { "key", base64_encode(etcd_prefix+"/pg/state/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/state0") }, { "range_end", base64_encode(etcd_prefix+"/pg/state0") },
{ "start_revision", etcd_watch_revision }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_STATE_WATCH_ID }, { "watch_id", ETCD_PG_STATE_WATCH_ID },
{ "progress_notify", true }, { "progress_notify", true },
} } } }
}).dump()); }).dump());
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object { etcd_watch_ws->post_message(WS_TEXT, json11::Json(json11::Json::object {
{ "create_request", json11::Json::object { { "create_request", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/pg/history/") }, { "key", base64_encode(etcd_prefix+"/pg/history/") },
{ "range_end", base64_encode(etcd_prefix+"/pg/history0") }, { "range_end", base64_encode(etcd_prefix+"/pg/history0") },
{ "start_revision", etcd_watch_revision }, { "start_revision", etcd_watch_revision+1 },
{ "watch_id", ETCD_PG_HISTORY_WATCH_ID }, { "watch_id", ETCD_PG_HISTORY_WATCH_ID },
{ "progress_notify", true }, { "progress_notify", true },
} } } }
}).dump()); }).dump());
if (ws_keepalive_timer < 0)
{
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
{
if (!etcd_watch_ws)
{
// Do nothing
}
else if (!ws_alive)
{
if (this->log_level > 0)
{
fprintf(stderr, "Websocket ping failed, disconnecting from etcd %s\n", selected_etcd_address.c_str());
}
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
}
start_etcd_watcher();
}
else
{
ws_alive = 0;
http_post_message(etcd_watch_ws, WS_TEXT, json11::Json(json11::Json::object {
{ "progress_request", json11::Json::object { } }
}).dump());
}
});
}
} }
void etcd_state_client_t::load_global_config() void etcd_state_client_t::load_global_config()
{ {
etcd_call("/kv/range", json11::Json::object { etcd_call("/kv/range", json11::Json::object {
{ "key", base64_encode(etcd_prefix+"/config/global") } { "key", base64_encode(etcd_prefix+"/config/global") }
}, etcd_slow_timeout, max_etcd_attempts, 0, [this](std::string err, json11::Json data) }, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
{ {
if (err != "") if (err != "")
{ {
fprintf(stderr, "Error reading OSD configuration from etcd: %s\n", err.c_str()); fprintf(stderr, "Error reading OSD configuration from etcd: %s\n", err.c_str());
tfd->set_timer(etcd_slow_timeout, false, [this](int timer_id) tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
{ {
load_global_config(); load_global_config();
}); });
@@ -542,13 +319,12 @@ void etcd_state_client_t::load_pgs()
{ {
req["compare"] = checks; req["compare"] = checks;
} }
etcd_txn_slow(req, [this](std::string err, json11::Json data) etcd_txn(req, ETCD_SLOW_TIMEOUT, [this](std::string err, json11::Json data)
{ {
if (err != "") if (err != "")
{ {
// Retry indefinitely
fprintf(stderr, "Error loading PGs from etcd: %s\n", err.c_str()); fprintf(stderr, "Error loading PGs from etcd: %s\n", err.c_str());
tfd->set_timer(etcd_slow_timeout, false, [this](int timer_id) tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
{ {
load_pgs(); load_pgs();
}); });
@@ -561,7 +337,7 @@ void etcd_state_client_t::load_pgs()
} }
if (!etcd_watch_revision) if (!etcd_watch_revision)
{ {
etcd_watch_revision = data["header"]["revision"].uint64_value()+1; etcd_watch_revision = data["header"]["revision"].uint64_value();
} }
for (auto & res: data["responses"].array_items()) for (auto & res: data["responses"].array_items())
{ {
@@ -1008,8 +784,3 @@ json11::Json::object etcd_state_client_t::serialize_inode_cfg(inode_config_t *cf
} }
return new_cfg; return new_cfg;
} }
int etcd_state_client_t::address_count()
{
return etcd_addresses.size() + etcd_local.size();
}

View File

@@ -12,6 +12,10 @@
#define ETCD_PG_HISTORY_WATCH_ID 3 #define ETCD_PG_HISTORY_WATCH_ID 3
#define ETCD_OSD_STATE_WATCH_ID 4 #define ETCD_OSD_STATE_WATCH_ID 4
#define MAX_ETCD_ATTEMPTS 5
#define ETCD_SLOW_TIMEOUT 5000
#define ETCD_QUICK_TIMEOUT 1000
#define DEFAULT_BLOCK_SIZE 128*1024 #define DEFAULT_BLOCK_SIZE 128*1024
struct etcd_kv_t struct etcd_kv_t
@@ -66,31 +70,17 @@ struct inode_watch_t
inode_config_t cfg; inode_config_t cfg;
}; };
struct http_co_t; struct websocket_t;
struct etcd_state_client_t struct etcd_state_client_t
{ {
protected: protected:
std::vector<std::string> local_ips;
std::vector<std::string> etcd_addresses;
std::vector<std::string> etcd_local;
std::string selected_etcd_address;
std::vector<std::string> addresses_to_try;
std::vector<inode_watch_t*> watches; std::vector<inode_watch_t*> watches;
http_co_t *etcd_watch_ws = NULL, *keepalive_client = NULL; websocket_t *etcd_watch_ws = NULL;
int ws_keepalive_timer = -1;
int ws_alive = 0;
bool rand_initialized = false;
uint64_t bs_block_size = DEFAULT_BLOCK_SIZE; uint64_t bs_block_size = DEFAULT_BLOCK_SIZE;
void add_etcd_url(std::string); void add_etcd_url(std::string);
void pick_next_etcd();
public: public:
int etcd_keepalive_timeout = 30; std::vector<std::string> etcd_addresses;
int etcd_ws_keepalive_interval = 30;
int max_etcd_attempts = 5;
int etcd_quick_timeout = 1000;
int etcd_slow_timeout = 5000;
std::string etcd_prefix; std::string etcd_prefix;
int log_level = 0; int log_level = 0;
timerfd_manager_t *tfd = NULL; timerfd_manager_t *tfd = NULL;
@@ -108,13 +98,11 @@ public:
std::function<void(bool)> on_load_pgs_hook; std::function<void(bool)> on_load_pgs_hook;
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook; std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
std::function<void(osd_num_t)> on_change_osd_state_hook; std::function<void(osd_num_t)> on_change_osd_state_hook;
std::function<void()> on_reload_hook;
json11::Json::object serialize_inode_cfg(inode_config_t *cfg); json11::Json::object serialize_inode_cfg(inode_config_t *cfg);
etcd_kv_t parse_etcd_kv(const json11::Json & kv_json); etcd_kv_t parse_etcd_kv(const json11::Json & kv_json);
void etcd_call(std::string api, json11::Json payload, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback); void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback); void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
void start_etcd_watcher(); void start_etcd_watcher();
void load_global_config(); void load_global_config();
void load_pgs(); void load_pgs();
@@ -122,6 +110,5 @@ public:
void parse_config(const json11::Json & config); void parse_config(const json11::Json & config);
inode_watch_t* watch_inode(std::string name); inode_watch_t* watch_inode(std::string name);
void close_watch(inode_watch_t* watch); void close_watch(inode_watch_t* watch);
int address_count();
~etcd_state_client_t(); ~etcd_state_client_t();
}; };

View File

@@ -247,12 +247,6 @@ static int sec_setup(struct thread_data *td)
vitastor_c_uring_wait_events(bsd->cli); vitastor_c_uring_wait_events(bsd->cli);
} }
td->files[0]->real_file_size = vitastor_c_inode_get_size(bsd->watch); td->files[0]->real_file_size = vitastor_c_inode_get_size(bsd->watch);
if (!vitastor_c_inode_get_num(bsd->watch) ||
!td->files[0]->real_file_size)
{
td_verror(td, EINVAL, "image does not exist");
return 1;
}
} }
bsd->trace = o->trace ? true : false; bsd->trace = o->trace ? true : false;

View File

@@ -26,9 +26,10 @@
#include "blockstore.h" #include "blockstore.h"
#include "epoll_manager.h" #include "epoll_manager.h"
#include "json11/json11.hpp"
#include "fio_headers.h" #include "fio_headers.h"
#include "json11/json11.hpp"
struct bs_data struct bs_data
{ {
blockstore_t *bs; blockstore_t *bs;
@@ -149,6 +150,7 @@ static int bs_init(struct thread_data *td)
static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io) static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
{ {
bs_data *bsd = (bs_data*)td->io_ops_data; bs_data *bsd = (bs_data*)td->io_ops_data;
int n = bsd->op_n;
if (io->ddir == DDIR_SYNC && bsd->last_sync) if (io->ddir == DDIR_SYNC && bsd->last_sync)
{ {
return FIO_Q_COMPLETED; return FIO_Q_COMPLETED;
@@ -176,7 +178,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
op->version = UINT64_MAX; // last unstable op->version = UINT64_MAX; // last unstable
op->offset = io->offset % bsd->bs->get_block_size(); op->offset = io->offset % bsd->bs->get_block_size();
op->len = io->xfer_buflen; op->len = io->xfer_buflen;
op->callback = [io](blockstore_op_t *op) op->callback = [io, n](blockstore_op_t *op)
{ {
io->error = op->retval < 0 ? -op->retval : 0; io->error = op->retval < 0 ? -op->retval : 0;
bs_data *bsd = (bs_data*)io->engine_data; bs_data *bsd = (bs_data*)io->engine_data;
@@ -198,7 +200,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
op->version = 0; // assign automatically op->version = 0; // assign automatically
op->offset = io->offset % bsd->bs->get_block_size(); op->offset = io->offset % bsd->bs->get_block_size();
op->len = io->xfer_buflen; op->len = io->xfer_buflen;
op->callback = [io](blockstore_op_t *op) op->callback = [io, n](blockstore_op_t *op)
{ {
io->error = op->retval < 0 ? -op->retval : 0; io->error = op->retval < 0 ? -op->retval : 0;
bs_data *bsd = (bs_data*)io->engine_data; bs_data *bsd = (bs_data*)io->engine_data;
@@ -213,7 +215,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
break; break;
case DDIR_SYNC: case DDIR_SYNC:
op->opcode = BS_OP_SYNC_STAB_ALL; op->opcode = BS_OP_SYNC_STAB_ALL;
op->callback = [io](blockstore_op_t *op) op->callback = [io, n](blockstore_op_t *op)
{ {
bs_data *bsd = (bs_data*)io->engine_data; bs_data *bsd = (bs_data*)io->engine_data;
io->error = op->retval < 0 ? -op->retval : 0; io->error = op->retval < 0 ? -op->retval : 0;
@@ -228,7 +230,6 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
break; break;
default: default:
io->error = EINVAL; io->error = EINVAL;
delete op;
return FIO_Q_COMPLETED; return FIO_Q_COMPLETED;
} }

View File

@@ -28,23 +28,16 @@
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
#include "addr_util.h"
#include "rw_blocking.h" #include "rw_blocking.h"
#include "osd_ops.h" #include "osd_ops.h"
#include "fio_headers.h" #include "fio_headers.h"
struct op_buf_t
{
osd_any_op_t buf;
io_u* fio_op;
};
struct sec_data struct sec_data
{ {
int connect_fd; int connect_fd;
/* block_size = 1 << block_order (128KB by default) */ /* block_size = 1 << block_order (128KB by default) */
uint64_t block_order = 17, block_size = 1 << 17; uint64_t block_order = 17, block_size = 1 << 17;
std::unordered_map<uint64_t, op_buf_t*> queue; std::unordered_map<uint64_t, io_u*> queue;
bool last_sync = false; bool last_sync = false;
/* The list of completed io_u structs. */ /* The list of completed io_u structs. */
std::vector<io_u*> completed; std::vector<io_u*> completed;
@@ -59,7 +52,6 @@ struct sec_options
int single_primary = 0; int single_primary = 0;
int trace = 0; int trace = 0;
int block_order = 17; int block_order = 17;
int zerocopy_send = 0;
}; };
static struct fio_option options[] = { static struct fio_option options[] = {
@@ -110,16 +102,6 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE, .category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME, .group = FIO_OPT_G_FILENAME,
}, },
{
.name = "zerocopy_send",
.lname = "Use zero-copy send",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct sec_options, zerocopy_send),
.help = "Use zero-copy send (MSG_ZEROCOPY)",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{ {
.name = NULL, .name = NULL,
}, },
@@ -170,14 +152,17 @@ static int sec_init(struct thread_data *td)
bsd->block_order = o->block_order == 0 ? 17 : o->block_order; bsd->block_order = o->block_order == 0 ? 17 : o->block_order;
bsd->block_size = 1 << o->block_order; bsd->block_size = 1 << o->block_order;
sockaddr addr; struct sockaddr_in addr;
if (!string_to_addr(std::string(o->host ? o->host : "127.0.0.1"), false, o->port > 0 ? o->port : 11203, &addr)) int r;
if ((r = inet_pton(AF_INET, o->host ? o->host : "127.0.0.1", &addr.sin_addr)) != 1)
{ {
fprintf(stderr, "server address: %s is not valid\n", o->host ? o->host : "127.0.0.1"); fprintf(stderr, "server address: %s%s\n", o->host ? o->host : "127.0.0.1", r == 0 ? " is not valid" : ": no ipv4 support");
return 1; return 1;
} }
addr.sin_family = AF_INET;
addr.sin_port = htons(o->port ? o->port : 11203);
bsd->connect_fd = socket(addr.sa_family, SOCK_STREAM, 0); bsd->connect_fd = socket(AF_INET, SOCK_STREAM, 0);
if (bsd->connect_fd < 0) if (bsd->connect_fd < 0)
{ {
perror("socket"); perror("socket");
@@ -190,14 +175,6 @@ static int sec_init(struct thread_data *td)
} }
int one = 1; int one = 1;
setsockopt(bsd->connect_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); setsockopt(bsd->connect_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
if (o->zerocopy_send)
{
if (setsockopt(bsd->connect_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
{
perror("setsockopt zerocopy");
return 1;
}
}
// FIXME: read config (block size) from OSD // FIXME: read config (block size) from OSD
@@ -218,9 +195,7 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
} }
io->engine_data = bsd; io->engine_data = bsd;
op_buf_t *op_buf = new op_buf_t; osd_any_op_t op = { 0 };
op_buf->fio_op = io;
osd_any_op_t &op = op_buf->buf;
op.hdr.magic = SECONDARY_OSD_OP_MAGIC; op.hdr.magic = SECONDARY_OSD_OP_MAGIC;
op.hdr.id = n; op.hdr.id = n;
@@ -284,7 +259,6 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
break; break;
default: default:
io->error = EINVAL; io->error = EINVAL;
delete op_buf;
return FIO_Q_COMPLETED; return FIO_Q_COMPLETED;
} }
@@ -297,18 +271,19 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
io->error = 0; io->error = 0;
bsd->inflight++; bsd->inflight++;
bsd->op_n++; bsd->op_n++;
bsd->queue[n] = op_buf; bsd->queue[n] = io;
iovec iov[2] = { { .iov_base = op.buf, .iov_len = OSD_PACKET_SIZE } }; iovec iov[2] = { { .iov_base = op.buf, .iov_len = OSD_PACKET_SIZE } };
int iovcnt = 1, wtotal = OSD_PACKET_SIZE; int iovcnt = 1, wtotal = OSD_PACKET_SIZE;
if (io->ddir == DDIR_WRITE) if (io->ddir == DDIR_WRITE)
{ {
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen }; iov[1] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
wtotal += io->xfer_buflen; wtotal += io->xfer_buflen;
iovcnt++;
} }
if (sendv_blocking(bsd->connect_fd, iov, iovcnt, opt->zerocopy_send ? MSG_ZEROCOPY : 0) != wtotal) if (writev_blocking(bsd->connect_fd, iov, iovcnt) != wtotal)
{ {
perror("sendmsg"); perror("writev");
exit(1); exit(1);
} }
@@ -337,8 +312,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
fprintf(stderr, "bad reply: op id %lx missing in local queue\n", reply.hdr.id); fprintf(stderr, "bad reply: op id %lx missing in local queue\n", reply.hdr.id);
exit(1); exit(1);
} }
io_u* io = it->second->fio_op; io_u* io = it->second;
delete it->second;
bsd->queue.erase(it); bsd->queue.erase(it);
if (io->ddir == DDIR_READ) if (io->ddir == DDIR_READ)
{ {
@@ -347,23 +321,7 @@ static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int
fprintf(stderr, "Short read: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen); fprintf(stderr, "Short read: retval = %ld instead of %llu\n", reply.hdr.retval, io->xfer_buflen);
exit(1); exit(1);
} }
// Support bitmap read_blocking(bsd->connect_fd, io->xfer_buf, io->xfer_buflen);
uint64_t bitmap = 0;
int iovcnt = 0;
iovec iov[2];
if (reply.sec_rw.attr_len > 0)
{
if (reply.sec_rw.attr_len <= 8)
iov[iovcnt++] = { .iov_base = &bitmap, .iov_len = reply.sec_rw.attr_len };
else
iov[iovcnt++] = { .iov_base = (void*)(bitmap = (uint64_t)malloc(reply.sec_rw.attr_len)), .iov_len = reply.sec_rw.attr_len };
}
iov[iovcnt++] = { .iov_base = io->xfer_buf, .iov_len = io->xfer_buflen };
readv_blocking(bsd->connect_fd, iov, iovcnt);
if (reply.sec_rw.attr_len > 8)
{
free((void*)bitmap);
}
} }
else if (io->ddir == DDIR_WRITE) else if (io->ddir == DDIR_WRITE)
{ {

View File

@@ -4,7 +4,9 @@
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <sys/epoll.h> #include <sys/epoll.h>
#include <net/if.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include <ifaddrs.h>
#include <ctype.h> #include <ctype.h>
#include <unistd.h> #include <unistd.h>
@@ -13,22 +15,21 @@
#include <stdexcept> #include <stdexcept>
#include "addr_util.h"
#include "json11/json11.hpp" #include "json11/json11.hpp"
#include "http_client.h" #include "http_client.h"
#include "timerfd_manager.h" #include "timerfd_manager.h"
#define READ_BUFFER_SIZE 9000 #define READ_BUFFER_SIZE 9000
static int extract_port(std::string & host);
static std::string trim(const std::string & in); static std::string trim(const std::string & in);
static std::string ws_format_frame(int type, uint64_t size); static std::string ws_format_frame(int type, uint64_t size);
static bool ws_parse_frame(std::string & buf, int & type, std::string & res); static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
static void parse_http_headers(std::string & res, http_response_t *parsed);
// FIXME: Use keepalive
struct http_co_t struct http_co_t
{ {
timerfd_manager_t *tfd; timerfd_manager_t *tfd;
std::function<void(const http_response_t*)> response_callback;
int request_timeout = 0; int request_timeout = 0;
std::string host; std::string host;
@@ -36,12 +37,11 @@ struct http_co_t
std::string ws_outbox; std::string ws_outbox;
std::string response; std::string response;
bool want_streaming; bool want_streaming;
bool keepalive;
std::vector<std::function<void()>> keepalive_queue; http_response_t parsed;
uint64_t target_response_size = 0;
int state = 0; int state = 0;
std::string connected_host;
int peer_fd = -1; int peer_fd = -1;
int timeout_id = -1; int timeout_id = -1;
int epoll_events = 0; int epoll_events = 0;
@@ -49,8 +49,10 @@ struct http_co_t
std::vector<char> rbuf; std::vector<char> rbuf;
iovec read_iov, send_iov; iovec read_iov, send_iov;
msghdr read_msg = { 0 }, send_msg = { 0 }; msghdr read_msg = { 0 }, send_msg = { 0 };
http_response_t parsed;
uint64_t target_response_size = 0; std::function<void(const http_response_t*)> callback;
websocket_t ws;
int onstack = 0; int onstack = 0;
bool ended = false; bool ended = false;
@@ -59,40 +61,66 @@ struct http_co_t
inline void stackin() { onstack++; } inline void stackin() { onstack++; }
inline void stackout() { onstack--; if (!onstack && ended) end(); } inline void stackout() { onstack--; if (!onstack && ended) end(); }
inline void end() { ended = true; if (!onstack) { delete this; } } inline void end() { ended = true; if (!onstack) { delete this; } }
void run_cb_and_clear();
void start_connection(); void start_connection();
void close_connection();
void handle_events(); void handle_events();
void handle_connect_result(); void handle_connect_result();
void submit_read(); void submit_read();
void submit_send(); void submit_send();
bool handle_read(); bool handle_read();
void post_message(int type, const std::string & msg); void post_message(int type, const std::string & msg);
void send_request(const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback);
}; };
#define HTTP_CO_CLOSED 0
#define HTTP_CO_CONNECTING 1 #define HTTP_CO_CONNECTING 1
#define HTTP_CO_SENDING_REQUEST 2 #define HTTP_CO_SENDING_REQUEST 2
#define HTTP_CO_REQUEST_SENT 3 #define HTTP_CO_REQUEST_SENT 3
#define HTTP_CO_HEADERS_RECEIVED 4 #define HTTP_CO_HEADERS_RECEIVED 4
#define HTTP_CO_WEBSOCKET 5 #define HTTP_CO_WEBSOCKET 5
#define HTTP_CO_CHUNKED 6 #define HTTP_CO_CHUNKED 6
#define HTTP_CO_KEEPALIVE 7
#define DEFAULT_TIMEOUT 5000 #define DEFAULT_TIMEOUT 5000
http_co_t *http_init(timerfd_manager_t *tfd) void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback)
{ {
http_co_t *handler = new http_co_t(); http_co_t *handler = new http_co_t();
handler->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
handler->want_streaming = options.want_streaming;
handler->tfd = tfd; handler->tfd = tfd;
handler->state = HTTP_CO_CLOSED; handler->host = host;
return handler; handler->request = request;
handler->callback = callback;
handler->ws.co = handler;
handler->start_connection();
} }
http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path, void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
int timeout, std::function<void(const http_response_t *msg)> response_callback) int timeout, std::function<void(std::string, json11::Json r)> callback)
{
http_request(tfd, host, request, { .timeout = timeout }, [callback](const http_response_t* res)
{
if (res->error_code != 0)
{
callback("Error code: "+std::to_string(res->error_code)+" ("+std::string(strerror(res->error_code))+")", json11::Json());
return;
}
if (res->status_code != 200)
{
callback("HTTP "+std::to_string(res->status_code)+" "+res->status_line+" body: "+trim(res->body), json11::Json());
return;
}
std::string json_err;
json11::Json data = json11::Json::parse(res->body, json_err);
if (json_err != "")
{
callback("Bad JSON: "+json_err+" (response: "+trim(res->body)+")", json11::Json());
return;
}
callback(std::string(), data);
});
}
websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> callback)
{ {
std::string request = "GET "+path+" HTTP/1.1\r\n" std::string request = "GET "+path+" HTTP/1.1\r\n"
"Host: "+host+"\r\n" "Host: "+host+"\r\n"
@@ -102,154 +130,28 @@ http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, cons
"Sec-WebSocket-Version: 13\r\n" "Sec-WebSocket-Version: 13\r\n"
"\r\n"; "\r\n";
http_co_t *handler = new http_co_t(); http_co_t *handler = new http_co_t();
handler->tfd = tfd;
handler->state = HTTP_CO_CLOSED;
handler->host = host;
handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout); handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout);
handler->want_streaming = false; handler->want_streaming = false;
handler->keepalive = false; handler->tfd = tfd;
handler->host = host;
handler->request = request; handler->request = request;
handler->response_callback = response_callback; handler->callback = callback;
handler->ws.co = handler;
handler->start_connection(); handler->start_connection();
return handler; return &handler->ws;
} }
void http_request(http_co_t *handler, const std::string & host, const std::string & request, void websocket_t::post_message(int type, const std::string & msg)
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback)
{ {
handler->send_request(host, request, options, response_callback); co->post_message(type, msg);
} }
void http_co_t::run_cb_and_clear() void websocket_t::close()
{ {
parsed.eof = true; co->end();
std::function<void(const http_response_t*)> cb;
cb.swap(response_callback);
// Call callback after clearing it because otherwise we may hit reenterability problems
if (cb != NULL)
cb(&parsed);
}
void http_co_t::send_request(const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback)
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
{
stackout();
throw std::runtime_error("Attempt to send HTTP request into a websocket or chunked stream");
}
else if (state != HTTP_CO_KEEPALIVE && state != HTTP_CO_CLOSED)
{
keepalive_queue.push_back([this, host, request, options, response_callback]()
{
this->send_request(host, request, options, response_callback);
});
stackout();
return;
}
if (state == HTTP_CO_KEEPALIVE && connected_host != host)
{
close_connection();
}
this->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
this->want_streaming = options.want_streaming;
this->keepalive = options.keepalive;
this->host = host;
this->request = request;
this->response = "";
this->sent = 0;
this->response_callback = response_callback;
this->parsed = {};
if (request_timeout > 0)
{
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{
stackin();
close_connection();
parsed = { .error = "HTTP request timed out" };
run_cb_and_clear();
stackout();
});
}
if (state == HTTP_CO_KEEPALIVE)
{
state = HTTP_CO_SENDING_REQUEST;
submit_send();
}
else
{
start_connection();
}
stackout();
}
void http_post_message(http_co_t *handler, int type, const std::string & msg)
{
handler->post_message(type, msg);
}
void http_co_t::post_message(int type, const std::string & msg)
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
{
request += ws_format_frame(type, msg.size());
request += msg;
submit_send();
}
else if (state == HTTP_CO_KEEPALIVE || state == HTTP_CO_CHUNKED)
{
throw std::runtime_error("Attempt to send websocket message on a regular HTTP connection");
}
else
{
ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg;
}
stackout();
}
void http_close(http_co_t *handler)
{
handler->end();
}
void http_response_t::parse_json_response(std::string & error, json11::Json & r) const
{
if (this->error != "")
{
error = this->error;
r = json11::Json();
}
else if (status_code != 200)
{
error = "HTTP "+std::to_string(status_code)+" "+status_line+" body: "+trim(body);
r = json11::Json();
}
else
{
std::string json_err;
json11::Json data = json11::Json::parse(body, json_err);
if (json_err != "")
{
error = "Bad JSON: "+json_err+" (response: "+trim(body)+")";
r = json11::Json();
}
else
{
error = "";
r = data;
}
}
} }
http_co_t::~http_co_t() http_co_t::~http_co_t()
{
close_connection();
}
void http_co_t::close_connection()
{ {
if (timeout_id >= 0) if (timeout_id >= 0)
{ {
@@ -262,41 +164,67 @@ void http_co_t::close_connection()
close(peer_fd); close(peer_fd);
peer_fd = -1; peer_fd = -1;
} }
state = HTTP_CO_CLOSED; if (parsed.headers["transfer-encoding"] == "chunked")
connected_host = ""; {
response = ""; int prev = 0, pos = 0;
epoll_events = 0; while ((pos = response.find("\r\n", prev)) >= prev)
{
uint64_t len = strtoull(response.c_str()+prev, NULL, 16);
parsed.body += response.substr(pos+2, len);
prev = pos+2+len+2;
}
}
else
{
std::swap(parsed.body, response);
}
parsed.eof = true;
callback(&parsed);
} }
void http_co_t::start_connection() void http_co_t::start_connection()
{ {
stackin(); stackin();
struct sockaddr addr; int port = extract_port(host);
if (!string_to_addr(host.c_str(), 1, 80, &addr)) struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
{ {
parsed = { .error = "Invalid address: "+host }; parsed.error_code = ENXIO;
run_cb_and_clear();
stackout(); stackout();
end();
return; return;
} }
peer_fd = socket(addr.sa_family, SOCK_STREAM, 0); addr.sin_family = AF_INET;
addr.sin_port = htons(port ? port : 80);
peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0) if (peer_fd < 0)
{ {
parsed = { .error = std::string("socket: ")+strerror(errno) }; parsed.error_code = errno;
run_cb_and_clear();
stackout(); stackout();
end();
return; return;
} }
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
if (request_timeout > 0)
{
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{
if (response.length() == 0)
{
parsed.error_code = ETIME;
}
end();
});
}
epoll_events = 0; epoll_events = 0;
// Finally call connect // Finally call connect
int r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr)); r = ::connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS) if (r < 0 && errno != EINPROGRESS)
{ {
close_connection(); parsed.error_code = errno;
parsed = { .error = std::string("connect: ")+strerror(errno) };
run_cb_and_clear();
stackout(); stackout();
end();
return; return;
} }
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events) tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
@@ -304,7 +232,6 @@ void http_co_t::start_connection()
this->epoll_events |= epoll_events; this->epoll_events |= epoll_events;
handle_events(); handle_events();
}); });
connected_host = host;
state = HTTP_CO_CONNECTING; state = HTTP_CO_CONNECTING;
stackout(); stackout();
} }
@@ -327,8 +254,7 @@ void http_co_t::handle_events()
} }
else if (epoll_events & (EPOLLRDHUP|EPOLLERR)) else if (epoll_events & (EPOLLRDHUP|EPOLLERR))
{ {
close_connection(); end();
run_cb_and_clear();
break; break;
} }
} }
@@ -347,10 +273,9 @@ void http_co_t::handle_connect_result()
} }
if (result != 0) if (result != 0)
{ {
close_connection(); parsed.error_code = result;
parsed = { .error = std::string("connect: ")+strerror(result) };
run_cb_and_clear();
stackout(); stackout();
end();
return; return;
} }
int one = 1; int one = 1;
@@ -365,51 +290,6 @@ void http_co_t::handle_connect_result()
stackout(); stackout();
} }
void http_co_t::submit_send()
{
stackin();
int res;
again:
if (sent < request.size())
{
send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
send_msg.msg_iov = &send_iov;
send_msg.msg_iovlen = 1;
res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL);
if (res < 0)
{
res = -errno;
}
if (res == -EAGAIN || res == -EINTR)
{
res = 0;
}
else if (res < 0)
{
close_connection();
parsed = { .error = std::string("sendmsg: ")+strerror(errno) };
run_cb_and_clear();
stackout();
return;
}
sent += res;
if (state == HTTP_CO_SENDING_REQUEST)
{
if (sent >= request.size())
state = HTTP_CO_REQUEST_SENT;
else
goto again;
}
else if (state == HTTP_CO_WEBSOCKET)
{
request = request.substr(sent);
sent = 0;
goto again;
}
}
stackout();
}
void http_co_t::submit_read() void http_co_t::submit_read()
{ {
stackin(); stackin();
@@ -426,18 +306,16 @@ void http_co_t::submit_read()
{ {
res = -errno; res = -errno;
} }
if (res == -EAGAIN || res == -EINTR) if (res == -EAGAIN)
{ {
epoll_events = epoll_events & ~EPOLLIN; epoll_events = epoll_events & ~EPOLLIN;
} }
else if (res <= 0) else if (res <= 0)
{ {
// < 0 means error, 0 means EOF // < 0 means error, 0 means EOF
epoll_events = epoll_events & ~EPOLLIN; if (!res)
close_connection(); epoll_events = epoll_events & ~EPOLLIN;
if (res < 0) end();
parsed = { .error = std::string("recvmsg: ")+strerror(-res) };
run_cb_and_clear();
} }
else else
{ {
@@ -447,6 +325,51 @@ void http_co_t::submit_read()
stackout(); stackout();
} }
void http_co_t::submit_send()
{
stackin();
int res;
again:
if (sent < request.size())
{
send_iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
send_msg.msg_iov = &send_iov;
send_msg.msg_iovlen = 1;
res = sendmsg(peer_fd, &send_msg, MSG_NOSIGNAL);
if (res < 0)
{
res = -errno;
}
if (res == -EAGAIN)
{
res = 0;
}
else if (res < 0)
{
stackout();
end();
return;
}
sent += res;
if (state == HTTP_CO_SENDING_REQUEST)
{
if (sent >= request.size())
{
state = HTTP_CO_REQUEST_SENT;
}
else
goto again;
}
else if (state == HTTP_CO_WEBSOCKET)
{
request = request.substr(sent);
sent = 0;
goto again;
}
}
stackout();
}
bool http_co_t::handle_read() bool http_co_t::handle_read()
{ {
stackin(); stackin();
@@ -457,7 +380,6 @@ bool http_co_t::handle_read()
{ {
if (timeout_id >= 0) if (timeout_id >= 0)
{ {
// Timeout is cleared when headers are received
tfd->clear_timer(timeout_id); tfd->clear_timer(timeout_id);
timeout_id = -1; timeout_id = -1;
} }
@@ -485,26 +407,20 @@ bool http_co_t::handle_read()
if (!target_response_size) if (!target_response_size)
{ {
// Sorry, unsupported response // Sorry, unsupported response
close_connection();
parsed = { .error = "Response has neither Connection: close, nor Transfer-Encoding: chunked nor Content-Length headers" };
run_cb_and_clear();
stackout(); stackout();
end();
return false; return false;
} }
} }
else
{
keepalive = false;
}
} }
} }
if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size) if (state == HTTP_CO_HEADERS_RECEIVED && target_response_size > 0 && response.size() >= target_response_size)
{ {
std::swap(parsed.body, response); stackout();
response_callback(&parsed); end();
parsed.eof = true; return false;
} }
else if (state == HTTP_CO_CHUNKED && response.size() > 0) if (state == HTTP_CO_CHUNKED && response.size() > 0)
{ {
int prev = 0, pos = 0; int prev = 0, pos = 0;
while ((pos = response.find("\r\n", prev)) >= prev) while ((pos = response.find("\r\n", prev)) >= prev)
@@ -527,49 +443,47 @@ bool http_co_t::handle_read()
{ {
response = response.substr(prev); response = response.substr(prev);
} }
if (want_streaming) if (parsed.eof)
{ {
// Streaming response stackout();
response_callback(&parsed); end();
return false;
}
if (want_streaming && parsed.body.size() > 0)
{
callback(&parsed);
parsed.body = ""; parsed.body = "";
} }
if (parsed.eof && !want_streaming)
{
// Normal response
response_callback(&parsed);
}
} }
else if (state == HTTP_CO_WEBSOCKET && response.size() > 0) if (state == HTTP_CO_WEBSOCKET && response.size() > 0)
{ {
while (ws_parse_frame(response, parsed.ws_msg_type, parsed.body)) while (ws_parse_frame(response, parsed.ws_msg_type, parsed.body))
{ {
response_callback(&parsed); callback(&parsed);
parsed.body = ""; parsed.body = "";
} }
} }
if (parsed.eof)
{
response_callback = NULL;
parsed = {};
if (!keepalive)
{
close_connection();
}
else
{
state = HTTP_CO_KEEPALIVE;
if (keepalive_queue.size() > 0)
{
auto next = keepalive_queue[0];
keepalive_queue.erase(keepalive_queue.begin(), keepalive_queue.begin()+1);
next();
}
}
}
stackout(); stackout();
return true; return true;
} }
void http_co_t::post_message(int type, const std::string & msg)
{
stackin();
if (state == HTTP_CO_WEBSOCKET)
{
request += ws_format_frame(type, msg.size());
request += msg;
submit_send();
}
else
{
ws_outbox += ws_format_frame(type, msg.size());
ws_outbox += msg;
}
stackout();
}
uint64_t stoull_full(const std::string & str, int base) uint64_t stoull_full(const std::string & str, int base)
{ {
if (isspace(str[0])) if (isspace(str[0]))
@@ -585,7 +499,7 @@ uint64_t stoull_full(const std::string & str, int base)
return r; return r;
} }
static void parse_http_headers(std::string & res, http_response_t *parsed) void parse_http_headers(std::string & res, http_response_t *parsed)
{ {
int pos = res.find("\r\n"); int pos = res.find("\r\n");
pos = pos < 0 ? res.length() : pos+2; pos = pos < 0 ? res.length() : pos+2;
@@ -634,13 +548,13 @@ static std::string ws_format_frame(int type, uint64_t size)
res[p++] = size | /*mask*/0x80; res[p++] = size | /*mask*/0x80;
else if (size < 65536) else if (size < 65536)
{ {
res[p++] = (char)(126 | /*mask*/0x80); res[p++] = 126 | /*mask*/0x80;
res[p++] = (size >> 8) & 0xFF; res[p++] = (size >> 8) & 0xFF;
res[p++] = (size >> 0) & 0xFF; res[p++] = (size >> 0) & 0xFF;
} }
else else
{ {
res[p++] = (char)(127 | /*mask*/0x80); res[p++] = 127 | /*mask*/0x80;
res[p++] = (size >> 56) & 0xFF; res[p++] = (size >> 56) & 0xFF;
res[p++] = (size >> 48) & 0xFF; res[p++] = (size >> 48) & 0xFF;
res[p++] = (size >> 40) & 0xFF; res[p++] = (size >> 40) & 0xFF;
@@ -707,6 +621,57 @@ static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
return true; return true;
} }
std::vector<std::string> getifaddr_list(bool include_v6)
{
std::vector<std::string> addresses;
ifaddrs *list, *ifa;
if (getifaddrs(&list) == -1)
{
throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
}
for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
{
if (!ifa->ifa_addr)
{
continue;
}
int family = ifa->ifa_addr->sa_family;
if ((family == AF_INET || family == AF_INET6 && include_v6) &&
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
{
void *addr_ptr;
if (family == AF_INET)
addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
else
addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
char addr[INET6_ADDRSTRLEN];
if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
{
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
}
addresses.push_back(std::string(addr));
}
}
freeifaddrs(list);
return addresses;
}
static int extract_port(std::string & host)
{
int port = 0;
int pos = 0;
if ((pos = host.find(':')) >= 0)
{
port = strtoull(host.c_str() + pos + 1, NULL, 10);
if (port >= 0x10000)
{
port = 0;
}
host = host.substr(0, pos);
}
return port;
}
std::string strtolower(const std::string & in) std::string strtolower(const std::string & in)
{ {
std::string s = in; std::string s = in;

View File

@@ -21,34 +21,41 @@ struct http_options_t
{ {
int timeout; int timeout;
bool want_streaming; bool want_streaming;
bool keepalive;
}; };
struct http_response_t struct http_response_t
{ {
std::string error;
bool eof = false; bool eof = false;
int error_code = 0;
int status_code = 0; int status_code = 0;
std::string status_line; std::string status_line;
std::map<std::string, std::string> headers; std::map<std::string, std::string> headers;
int ws_msg_type = -1; int ws_msg_type = -1;
std::string body; std::string body;
void parse_json_response(std::string & error, json11::Json & r) const;
}; };
// Opened websocket or keepalive HTTP connection
struct http_co_t; struct http_co_t;
http_co_t* http_init(timerfd_manager_t *tfd); struct websocket_t
http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path, {
int timeout, std::function<void(const http_response_t *msg)> on_message); http_co_t *co;
void http_request(http_co_t *handler, const std::string & host, const std::string & request, void post_message(int type, const std::string & msg);
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback); void close();
void http_post_message(http_co_t *handler, int type, const std::string & msg); };
void http_close(http_co_t *co);
void parse_http_headers(std::string & res, http_response_t *parsed);
std::vector<std::string> getifaddr_list(bool include_v6 = false);
// Utils
uint64_t stoull_full(const std::string & str, int base = 10); uint64_t stoull_full(const std::string & str, int base = 10);
std::string strtolower(const std::string & in); std::string strtolower(const std::string & in);
void http_request(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback);
void http_request_json(timerfd_manager_t *tfd, const std::string & host, const std::string & request,
int timeout, std::function<void(std::string, json11::Json r)> callback);
websocket_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> callback);

View File

@@ -4,12 +4,10 @@
#include <unistd.h> #include <unistd.h>
#include <fcntl.h> #include <fcntl.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/stat.h>
#include <sys/epoll.h> #include <sys/epoll.h>
#include <netinet/tcp.h> #include <netinet/tcp.h>
#include <stdexcept> #include <stdexcept>
#include "addr_util.h"
#include "messenger.h" #include "messenger.h"
void osd_messenger_t::init() void osd_messenger_t::init()
@@ -222,20 +220,23 @@ void osd_messenger_t::try_connect_peer(uint64_t peer_osd)
void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port) void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port)
{ {
assert(peer_osd != this->osd_num); assert(peer_osd != this->osd_num);
struct sockaddr addr; struct sockaddr_in addr;
if (!string_to_addr(peer_host, 0, peer_port, &addr)) int r;
if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
{ {
on_connect_peer(peer_osd, -EINVAL); on_connect_peer(peer_osd, -EINVAL);
return; return;
} }
int peer_fd = socket(addr.sa_family, SOCK_STREAM, 0); addr.sin_family = AF_INET;
addr.sin_port = htons(peer_port ? peer_port : 11203);
int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0) if (peer_fd < 0)
{ {
on_connect_peer(peer_osd, -errno); on_connect_peer(peer_osd, -errno);
return; return;
} }
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr)); r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS) if (r < 0 && errno != EINPROGRESS)
{ {
close(peer_fd); close(peer_fd);
@@ -484,20 +485,21 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
void osd_messenger_t::accept_connections(int listen_fd) void osd_messenger_t::accept_connections(int listen_fd)
{ {
// Accept new connections // Accept new connections
sockaddr addr; sockaddr_in addr;
socklen_t peer_addr_size = sizeof(addr); socklen_t peer_addr_size = sizeof(addr);
int peer_fd; int peer_fd;
while ((peer_fd = accept(listen_fd, &addr, &peer_addr_size)) >= 0) while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
{ {
assert(peer_fd != 0); assert(peer_fd != 0);
fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd, char peer_str[256];
addr_to_string(addr).c_str()); fprintf(stderr, "[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
int one = 1; int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
clients[peer_fd] = new osd_client_t(); clients[peer_fd] = new osd_client_t();
clients[peer_fd]->peer_addr = addr; clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port); clients[peer_fd]->peer_port = ntohs(addr.sin_port);
clients[peer_fd]->peer_fd = peer_fd; clients[peer_fd]->peer_fd = peer_fd;
clients[peer_fd]->peer_state = PEER_CONNECTED; clients[peer_fd]->peer_state = PEER_CONNECTED;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size); clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
@@ -545,7 +547,7 @@ json11::Json osd_messenger_t::read_config(const json11::Json & config)
int done = 0; int done = 0;
while (done < st.st_size) while (done < st.st_size)
{ {
int r = read(fd, (uint8_t*)buf.data()+done, st.st_size-done); int r = read(fd, (void*)buf.data()+done, st.st_size-done);
if (r < 0) if (r < 0)
{ {
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno)); fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));

View File

@@ -49,7 +49,7 @@ struct osd_client_t
{ {
int refs = 0; int refs = 0;
sockaddr peer_addr; sockaddr_in peer_addr;
int peer_port; int peer_port;
int peer_fd; int peer_fd;
int peer_state; int peer_state;

82
src/mmap_manager.cpp Normal file
View File

@@ -0,0 +1,82 @@
#include <stdexcept>
#include <cassert>
#include <sys/mman.h>
#include "mmap_manager.h"
mmap_manager_t::mmap_manager_t(uint64_t mmap_size)
{
this->mmap_size = mmap_size;
}
mmap_manager_t::~mmap_manager_t()
{
for (auto & kv: past_buffers)
{
munmap(kv.second.addr, kv.second.size);
}
if (active_buffer.addr != NULL)
{
munmap(active_buffer.addr, active_buffer.size);
}
}
void *mmap_manager_t::alloc(uint64_t size)
{
if (!active_buffer.addr || (active_buffer.pos + size) > active_buffer.size)
{
if (active_buffer.addr)
{
if (active_buffer.freed >= active_buffer.pos)
munmap(active_buffer.addr, active_buffer.size);
else
past_buffers[active_buffer.addr] = active_buffer;
active_buffer = { 0 };
}
uint64_t new_size = size < mmap_size ? mmap_size : size;
void *buf = mmap(NULL, new_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (!buf)
throw std::runtime_error(std::string("can't mmap "+std::to_string(new_size)+" bytes"));
active_buffer = {
.addr = buf,
.size = new_size,
.freed = 0,
.pos = 0,
};
}
void *res = active_buffer.addr + active_buffer.pos;
active_buffer.pos += size;
return res;
}
void mmap_manager_t::free(void *addr, uint64_t size)
{
auto it = past_buffers.upper_bound(addr);
if (it != past_buffers.begin())
{
if (it == past_buffers.end())
{
it--;
if (addr < it->second.addr || addr >= it->second.addr+it->second.size)
it = past_buffers.end();
}
else
it--;
}
else
it = past_buffers.end();
if (it != past_buffers.end())
{
assert(addr >= it->second.addr && addr+size <= it->second.addr+it->second.size);
it->second.freed += size;
if (it->second.freed >= it->second.pos)
{
munmap(it->second.addr, it->second.size);
past_buffers.erase(it);
}
}
else
{
assert(addr < active_buffer.addr+active_buffer.size);
active_buffer.freed += size;
}
}

26
src/mmap_manager.h Normal file
View File

@@ -0,0 +1,26 @@
#pragma once
#include <stdint.h>
#include <map>
struct mmap_buffer_t
{
void *addr = NULL;
uint64_t size = 0;
uint64_t freed = 0;
uint64_t pos = 0;
};
class mmap_manager_t
{
protected:
uint64_t mmap_size = 32*1024*1024;
std::map<void*, mmap_buffer_t> past_buffers;
mmap_buffer_t active_buffer;
public:
mmap_manager_t(uint64_t mmap_size = 32*1024*1024);
~mmap_manager_t();
void *alloc(uint64_t size);
void free(void *addr, uint64_t size);
};

View File

@@ -141,7 +141,7 @@ struct osd_op_buf_list_t
else else
{ {
iov.iov_len -= result; iov.iov_len -= result;
iov.iov_base = (uint8_t*)iov.iov_base + result; iov.iov_base += result;
break; break;
} }
} }

View File

@@ -58,19 +58,11 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
msgr_rdma_context_t *ctx = new msgr_rdma_context_t(); msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
ctx->mtu = mtu; ctx->mtu = mtu;
timespec tv; srand48(time(NULL));
clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
dev_list = ibv_get_device_list(NULL); dev_list = ibv_get_device_list(NULL);
if (!dev_list) if (!dev_list)
{ {
if (errno == -ENOSYS || errno == ENOSYS) fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n");
}
else
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
goto cleanup; goto cleanup;
} }
if (!ib_devname) if (!ib_devname)
@@ -391,7 +383,7 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg
? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size); ? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size);
sge[op_sge++] = { sge[op_sge++] = {
.addr = (uintptr_t)((uint8_t*)iov.iov_base+rc->send_buf_pos), .addr = (uintptr_t)(iov.iov_base+rc->send_buf_pos),
.length = len, .length = len,
.lkey = rc->ctx->mr->lkey, .lkey = rc->ctx->mr->lkey,
}; };
@@ -521,7 +513,7 @@ void osd_messenger_t::handle_rdma_events()
} }
if (cl->rdma_conn->send_buf_pos > 0) if (cl->rdma_conn->send_buf_pos > 0)
{ {
cl->send_list[0].iov_base = (uint8_t*)cl->send_list[0].iov_base + cl->rdma_conn->send_buf_pos; cl->send_list[0].iov_base += cl->rdma_conn->send_buf_pos;
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos; cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
cl->rdma_conn->send_buf_pos = 0; cl->rdma_conn->send_buf_pos = 0;
} }

View File

@@ -67,7 +67,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
} }
return false; return false;
} }
if (result <= 0 && result != -EAGAIN && result != -EINTR) if (result <= 0 && result != -EAGAIN)
{ {
// this is a client socket, so don't panic on error. just disconnect it // this is a client socket, so don't panic on error. just disconnect it
if (result != 0) if (result != 0)
@@ -77,7 +77,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
stop_client(cl->peer_fd); stop_client(cl->peer_fd);
return false; return false;
} }
if (result == -EAGAIN || result == -EINTR || result < cl->read_iov.iov_len) if (result == -EAGAIN || result < cl->read_iov.iov_len)
{ {
cl->read_ready--; cl->read_ready--;
if (cl->read_ready > 0) if (cl->read_ready > 0)
@@ -142,13 +142,13 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
memcpy(cur->iov_base, curbuf, remain); memcpy(cur->iov_base, curbuf, remain);
cl->read_remaining -= remain; cl->read_remaining -= remain;
cur->iov_len -= remain; cur->iov_len -= remain;
cur->iov_base = (uint8_t*)cur->iov_base + remain; cur->iov_base += remain;
remain = 0; remain = 0;
} }
else else
{ {
memcpy(cur->iov_base, curbuf, cur->iov_len); memcpy(cur->iov_base, curbuf, cur->iov_len);
curbuf = (uint8_t*)curbuf + cur->iov_len; curbuf += cur->iov_len;
cl->read_remaining -= cur->iov_len; cl->read_remaining -= cur->iov_len;
remain -= cur->iov_len; remain -= cur->iov_len;
cur->iov_len = 0; cur->iov_len = 0;
@@ -390,7 +390,7 @@ void osd_messenger_t::handle_reply_ready(osd_op_t *op)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 + (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000 (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
); );
set_immediate.push_back([op]() set_immediate.push_back([this, op]()
{ {
// Copy lambda to be unaffected by `delete op` // Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op); std::function<void(osd_op_t*)>(op->callback)(op);

Some files were not shown because too many files have changed in this diff Show More