forked from vitalif/vitastor
Compare commits
32 Commits
Author | SHA1 | Date | |
---|---|---|---|
f9fe72d40a | |||
10ee4f7c1d | |||
fd8244699b | |||
eaac1fc5d1 | |||
57be1923d3 | |||
c467acc388 | |||
bf591ba3ee | |||
699a0fbbc7 | |||
6b2dd50f27 | |||
caf2f3c56f | |||
9174f188b1 | |||
d3978c6d0e | |||
4a7365660d | |||
818ae5d61d | |||
6810e93c3f | |||
f6f35f4127 | |||
72aa2fd819 | |||
5010b0dd75 | |||
483c5ab380 | |||
6a6fd6544d | |||
971aa4ae4f | |||
9e6cbc6ebc | |||
ce777319c3 | |||
f8ff39b0ab | |||
d749159585 | |||
9703773a63 | |||
5d8d486f7c | |||
2b546cdd55 | |||
bd7b177707 | |||
33f9d03d22 | |||
82e6aff17b | |||
57e2c503f7 |
@@ -2,4 +2,6 @@ cmake_minimum_required(VERSION 2.8)
|
|||||||
|
|
||||||
project(vitastor)
|
project(vitastor)
|
||||||
|
|
||||||
|
set(VERSION "0.6.4")
|
||||||
|
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
134
README-ru.md
134
README-ru.md
@@ -22,7 +22,6 @@ Vitastor на данный момент находится в статусе п
|
|||||||
|
|
||||||
Однако следующее уже реализовано:
|
Однако следующее уже реализовано:
|
||||||
|
|
||||||
0.5.x (стабильная версия):
|
|
||||||
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
|
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
|
||||||
- Производительность ;-D
|
- Производительность ;-D
|
||||||
- Несколько схем отказоустойчивости: репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
|
- Несколько схем отказоустойчивости: репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
|
||||||
@@ -43,24 +42,24 @@ Vitastor на данный момент находится в статусе п
|
|||||||
- NBD-прокси для монтирования образов ядром ("блочное устройство в режиме пользователя")
|
- NBD-прокси для монтирования образов ядром ("блочное устройство в режиме пользователя")
|
||||||
- Утилита удаления образов/инодов (vitastor-rm)
|
- Утилита удаления образов/инодов (vitastor-rm)
|
||||||
- Пакеты для Debian и CentOS
|
- Пакеты для Debian и CentOS
|
||||||
|
|
||||||
0.6.x (master-ветка):
|
|
||||||
- Статистика операций ввода/вывода и занятого места в разрезе инодов
|
- Статистика операций ввода/вывода и занятого места в разрезе инодов
|
||||||
- Именование инодов через хранение их метаданных в etcd
|
- Именование инодов через хранение их метаданных в etcd
|
||||||
- Снапшоты и copy-on-write клоны
|
- Снапшоты и copy-on-write клоны
|
||||||
- Сглаживание производительности случайной записи в SSD+HDD конфигурациях
|
- Сглаживание производительности случайной записи в SSD+HDD конфигурациях
|
||||||
|
- Поддержка RDMA/RoCEv2 через libibverbs
|
||||||
|
- CSI-плагин для Kubernetes
|
||||||
|
|
||||||
## Планы развития
|
## Планы развития
|
||||||
|
|
||||||
- Более корректные скрипты разметки дисков и автоматического запуска OSD
|
- Более корректные скрипты разметки дисков и автоматического запуска OSD
|
||||||
- Другие инструменты администрирования
|
- Другие инструменты администрирования
|
||||||
- Плагины для OpenStack, Kubernetes, OpenNebula, Proxmox и других облачных систем
|
- Плагины для OpenStack, OpenNebula, Proxmox и других облачных систем
|
||||||
- iSCSI-прокси
|
- iSCSI-прокси
|
||||||
- Более быстрое переключение при отказах
|
- Более быстрое переключение при отказах
|
||||||
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
||||||
- Контрольные суммы
|
- Контрольные суммы
|
||||||
- Поддержка SSD-кэширования (tiered storage)
|
- Поддержка SSD-кэширования (tiered storage)
|
||||||
- Поддержка RDMA и NVDIMM
|
- Поддержка NVDIMM
|
||||||
- Web-интерфейс
|
- Web-интерфейс
|
||||||
- Возможно, сжатие
|
- Возможно, сжатие
|
||||||
- Возможно, поддержка кэширования данных через системный page cache
|
- Возможно, поддержка кэширования данных через системный page cache
|
||||||
@@ -314,14 +313,15 @@ Ceph:
|
|||||||
|
|
||||||
### NBD
|
### NBD
|
||||||
|
|
||||||
NBD - на данный момент единственный способ монтировать Vitastor ядром Linux, но он
|
|
||||||
приводит к дополнительным копированиям данных, поэтому немного ухудшает производительность,
|
|
||||||
правда, в основном - линейную, а случайная затрагивается слабо.
|
|
||||||
|
|
||||||
NBD расшифровывается как "сетевое блочное устройство", но на самом деле оно также
|
NBD расшифровывается как "сетевое блочное устройство", но на самом деле оно также
|
||||||
работает просто как аналог FUSE для блочных устройств, то есть, представляет собой
|
работает просто как аналог FUSE для блочных устройств, то есть, представляет собой
|
||||||
"блочное устройство в пространстве пользователя".
|
"блочное устройство в пространстве пользователя".
|
||||||
|
|
||||||
|
NBD - на данный момент единственный способ монтировать Vitastor ядром Linux.
|
||||||
|
NBD немного снижает производительность, так как приводит к дополнительным копированиям
|
||||||
|
данных между ядром и пространством пользователя. Тем не менее, способ достаточно оптимален,
|
||||||
|
а производительность случайного доступа вообще затрагивается слабо.
|
||||||
|
|
||||||
Vitastor с однопоточной NBD прокси на том же стенде:
|
Vitastor с однопоточной NBD прокси на том же стенде:
|
||||||
- T1Q1 запись: 6000 iops (задержка 0.166ms)
|
- T1Q1 запись: 6000 iops (задержка 0.166ms)
|
||||||
- T1Q1 чтение: 5518 iops (задержка 0.18ms)
|
- T1Q1 чтение: 5518 iops (задержка 0.18ms)
|
||||||
@@ -424,23 +424,105 @@ Vitastor с однопоточной NBD прокси на том же стен
|
|||||||
- Запустите все OSD: `systemctl start vitastor.target`
|
- Запустите все OSD: `systemctl start vitastor.target`
|
||||||
- Ваш кластер должен быть готов - один из мониторов должен уже сконфигурировать PG, а OSD должны запустить их.
|
- Ваш кластер должен быть готов - один из мониторов должен уже сконфигурировать PG, а OSD должны запустить их.
|
||||||
- Вы можете проверить состояние PG прямо в etcd: `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. Все PG должны быть 'active'.
|
- Вы можете проверить состояние PG прямо в etcd: `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. Все PG должны быть 'active'.
|
||||||
- Пример команды для запуска тестов: `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
|
|
||||||
- Пример команды для заливки образа ВМ в vitastor через qemu-img:
|
### Задать имя образу
|
||||||
```
|
|
||||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
|
```
|
||||||
```
|
etcdctl --endpoints=<etcd> put /vitastor/config/inode/<pool>/<inode> '{"name":"<name>","size":<size>[,"parent_id":<parent_inode_number>][,"readonly":true]}'
|
||||||
Если вы используете немодифицированный QEMU, данной команде потребуется переменная окружения `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so`.
|
```
|
||||||
- Пример команды запуска QEMU:
|
|
||||||
```
|
Например:
|
||||||
qemu-system-x86_64 -enable-kvm -m 1024
|
|
||||||
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
```
|
||||||
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/1 '{"name":"testimg","size":2147483648}'
|
||||||
-vnc 0.0.0.0:0
|
```
|
||||||
```
|
|
||||||
- Пример команды удаления образа (инода) из Vitastor:
|
Если вы зададите parent_id, то образ станет CoW-клоном, т.е. все новые запросы записи пойдут в новый инод, а запросы
|
||||||
```
|
чтения будут проверять сначала его, а потом родительские слои по цепочке вверх. Чтобы случайно не перезаписать данные
|
||||||
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
в родительском слое, вы можете переключить его в режим "только чтение", добавив флаг `"readonly":true` в его запись
|
||||||
```
|
метаданных. В таком случае родительский образ становится просто снапшотом.
|
||||||
|
|
||||||
|
Таким образом, для создания снапшота вам нужно просто переименовать предыдущий inode (например, из testimg в testimg@0),
|
||||||
|
сделать его readonly и создать новый слой с исходным именем образа (testimg), ссылающийся на только что переименованный
|
||||||
|
в качестве родительского.
|
||||||
|
|
||||||
|
### Запуск тестов с fio
|
||||||
|
|
||||||
|
Пример команды для запуска тестов:
|
||||||
|
|
||||||
|
```
|
||||||
|
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
|
||||||
|
```
|
||||||
|
|
||||||
|
Если вы не хотите обращаться к образу по имени, вместо `-image=testimg` можно указать номер пула, номер инода и размер:
|
||||||
|
`-pool=1 -inode=1 -size=400G`.
|
||||||
|
|
||||||
|
### Загрузить образ диска ВМ в/из Vitastor
|
||||||
|
|
||||||
|
Используйте qemu-img и строку `vitastor:etcd_host=<HOST>:image=<IMAGE>` в качестве имени файла диска. Например:
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg'
|
||||||
|
```
|
||||||
|
|
||||||
|
Обратите внимание, что если вы используете немодифицированный QEMU, потребуется установить переменную окружения
|
||||||
|
`LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so`.
|
||||||
|
|
||||||
|
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||||
|
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
||||||
|
|
||||||
|
### Запустить ВМ
|
||||||
|
|
||||||
|
Для запуска QEMU используйте опцию `-drive file=vitastor:etcd_host=<HOST>:image=<IMAGE>` (аналогично qemu-img)
|
||||||
|
и физический размер блока 4 KB.
|
||||||
|
|
||||||
|
Например:
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-system-x86_64 -enable-kvm -m 1024
|
||||||
|
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
||||||
|
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
||||||
|
-vnc 0.0.0.0:0
|
||||||
|
```
|
||||||
|
|
||||||
|
Обращение по номерам (`:pool=<POOL>:inode=<INODE>:size=<SIZE>` вместо `:image=<IMAGE>`) работает аналогично qemu-img.
|
||||||
|
|
||||||
|
### Удалить образ
|
||||||
|
|
||||||
|
Используйте утилиту vitastor-rm. Например:
|
||||||
|
|
||||||
|
```
|
||||||
|
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
||||||
|
```
|
||||||
|
|
||||||
|
### NBD
|
||||||
|
|
||||||
|
Чтобы создать локальное блочное устройство, используйте NBD. Например:
|
||||||
|
|
||||||
|
```
|
||||||
|
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||||
|
```
|
||||||
|
|
||||||
|
Команда напечатает название устройства вида /dev/nbd0, которое потом можно будет форматировать
|
||||||
|
и использовать как обычное блочное устройство.
|
||||||
|
|
||||||
|
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
||||||
|
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
||||||
|
|
||||||
|
### Kubernetes
|
||||||
|
|
||||||
|
У Vitastor есть CSI-плагин для Kubernetes, поддерживающий RWO-тома.
|
||||||
|
|
||||||
|
Для установки возьмите манифесты из директории [csi/deploy/](csi/deploy/), поместите
|
||||||
|
вашу конфигурацию подключения к Vitastor в [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),
|
||||||
|
настройте StorageClass в [csi/deploy/009-storage-class.yaml](009-storage-class.yaml)
|
||||||
|
и примените все `NNN-*.yaml` к вашей инсталляции Kubernetes.
|
||||||
|
|
||||||
|
```
|
||||||
|
for i in ./???-*.yaml; do kubectl apply -f $i; done
|
||||||
|
```
|
||||||
|
|
||||||
|
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](csi/deploy/example-pvc.yaml).
|
||||||
|
|
||||||
## Известные проблемы
|
## Известные проблемы
|
||||||
|
|
||||||
|
122
README.md
122
README.md
@@ -16,7 +16,6 @@ with configurable redundancy (replication or erasure codes/XOR).
|
|||||||
Vitastor is currently a pre-release, a lot of features are missing and you can still expect
|
Vitastor is currently a pre-release, a lot of features are missing and you can still expect
|
||||||
breaking changes in the future. However, the following is implemented:
|
breaking changes in the future. However, the following is implemented:
|
||||||
|
|
||||||
0.5.x (stable):
|
|
||||||
- Basic part: highly-available block storage with symmetric clustering and no SPOF
|
- Basic part: highly-available block storage with symmetric clustering and no SPOF
|
||||||
- Performance ;-D
|
- Performance ;-D
|
||||||
- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
|
- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes
|
||||||
@@ -37,24 +36,24 @@ breaking changes in the future. However, the following is implemented:
|
|||||||
- NBD proxy for kernel mounts
|
- NBD proxy for kernel mounts
|
||||||
- Inode removal tool (vitastor-rm)
|
- Inode removal tool (vitastor-rm)
|
||||||
- Packaging for Debian and CentOS
|
- Packaging for Debian and CentOS
|
||||||
|
|
||||||
0.6.x (master):
|
|
||||||
- Per-inode I/O and space usage statistics
|
- Per-inode I/O and space usage statistics
|
||||||
- Inode metadata storage in etcd
|
- Inode metadata storage in etcd
|
||||||
- Snapshots and copy-on-write image clones
|
- Snapshots and copy-on-write image clones
|
||||||
- Write throttling to smooth random write workloads in SSD+HDD configurations
|
- Write throttling to smooth random write workloads in SSD+HDD configurations
|
||||||
|
- RDMA/RoCEv2 support via libibverbs
|
||||||
|
- CSI plugin for Kubernetes
|
||||||
|
|
||||||
## Roadmap
|
## Roadmap
|
||||||
|
|
||||||
- Better OSD creation and auto-start tools
|
- Better OSD creation and auto-start tools
|
||||||
- Other administrative tools
|
- Other administrative tools
|
||||||
- Plugins for OpenStack, Kubernetes, OpenNebula, Proxmox and other cloud systems
|
- Plugins for OpenStack, OpenNebula, Proxmox and other cloud systems
|
||||||
- iSCSI proxy
|
- iSCSI proxy
|
||||||
- Faster failover
|
- Faster failover
|
||||||
- Scrubbing without checksums (verification of replicas)
|
- Scrubbing without checksums (verification of replicas)
|
||||||
- Checksums
|
- Checksums
|
||||||
- Tiered storage
|
- Tiered storage
|
||||||
- RDMA and NVDIMM support
|
- NVDIMM support
|
||||||
- Web GUI
|
- Web GUI
|
||||||
- Compression (possibly)
|
- Compression (possibly)
|
||||||
- Read caching using system page cache (possibly)
|
- Read caching using system page cache (possibly)
|
||||||
@@ -379,24 +378,101 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
|
|||||||
For jerasure pools the configuration should look like the following: `2:{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
|
For jerasure pools the configuration should look like the following: `2:{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
|
||||||
- At this point, one of the monitors will configure PGs and OSDs will start them.
|
- At this point, one of the monitors will configure PGs and OSDs will start them.
|
||||||
- You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
|
- You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
|
||||||
- Run tests with (for example): `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
|
|
||||||
- Upload VM disk image with qemu-img (for example):
|
### Name an image
|
||||||
```
|
|
||||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
|
```
|
||||||
```
|
etcdctl --endpoints=<etcd> put /vitastor/config/inode/<pool>/<inode> '{"name":"<name>","size":<size>[,"parent_id":<parent_inode_number>][,"readonly":true]}'
|
||||||
Note that the command requires to be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
|
```
|
||||||
if you use unmodified QEMU.
|
|
||||||
- Run QEMU with (for example):
|
For example:
|
||||||
```
|
|
||||||
qemu-system-x86_64 -enable-kvm -m 1024
|
```
|
||||||
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/1 '{"name":"testimg","size":2147483648}'
|
||||||
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
```
|
||||||
-vnc 0.0.0.0:0
|
|
||||||
```
|
If you specify parent_id the image becomes a CoW clone. I.e. all writes go to the new inode and reads first check it
|
||||||
- Remove inode with (for example):
|
and then upper layers. You can then make parent readonly by updating its entry with `"readonly":true` for safety and
|
||||||
```
|
basically treat it as a snapshot.
|
||||||
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
|
||||||
```
|
So to create a snapshot you basically rename the previous upper layer (for example from testimg to testimg@0), make it readonly
|
||||||
|
and create a new top layer with the original name (testimg) and the previous one as a parent.
|
||||||
|
|
||||||
|
### Run fio benchmarks
|
||||||
|
|
||||||
|
fio command example:
|
||||||
|
|
||||||
|
```
|
||||||
|
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
|
||||||
|
```
|
||||||
|
|
||||||
|
If you don't want to access your image by name, you can specify pool number, inode number and size
|
||||||
|
(`-pool=1 -inode=1 -size=400G`) instead of the image name (`-image=testimg`).
|
||||||
|
|
||||||
|
### Upload VM image
|
||||||
|
|
||||||
|
Use qemu-img and `vitastor:etcd_host=<HOST>:image=<IMAGE>` disk filename. For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg'
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the command needs to be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
|
||||||
|
if you use unmodified QEMU.
|
||||||
|
|
||||||
|
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||||
|
if you don't want to use inode metadata.
|
||||||
|
|
||||||
|
### Start a VM
|
||||||
|
|
||||||
|
Run QEMU with `-drive file=vitastor:etcd_host=<HOST>:image=<IMAGE>` and use 4 KB physical block size.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
qemu-system-x86_64 -enable-kvm -m 1024
|
||||||
|
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
||||||
|
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
||||||
|
-vnc 0.0.0.0:0
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`,
|
||||||
|
just like in qemu-img.
|
||||||
|
|
||||||
|
### Remove inode
|
||||||
|
|
||||||
|
Use vitastor-rm. For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
||||||
|
```
|
||||||
|
|
||||||
|
### NBD
|
||||||
|
|
||||||
|
To create a local block device for a Vitastor image, use NBD. For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||||
|
```
|
||||||
|
|
||||||
|
It will output the device name, like /dev/nbd0 which you can then format and mount as a normal block device.
|
||||||
|
|
||||||
|
Again, you can use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
|
||||||
|
|
||||||
|
### Kubernetes
|
||||||
|
|
||||||
|
Vitastor has a CSI plugin for Kubernetes which supports RWO volumes.
|
||||||
|
|
||||||
|
To deploy it, take manifests from [csi/deploy/](csi/deploy/) directory, put your
|
||||||
|
Vitastor configuration in [csi/deploy/001-csi-config-map.yaml](001-csi-config-map.yaml),
|
||||||
|
configure storage class in [csi/deploy/009-storage-class.yaml](009-storage-class.yaml)
|
||||||
|
and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
|
||||||
|
|
||||||
|
```
|
||||||
|
for i in ./???-*.yaml; do kubectl apply -f $i; done
|
||||||
|
```
|
||||||
|
|
||||||
|
After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](csi/deploy/example-pvc.yaml).
|
||||||
|
|
||||||
## Known Problems
|
## Known Problems
|
||||||
|
|
||||||
|
3
csi/.dockerignore
Normal file
3
csi/.dockerignore
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
vitastor-csi
|
||||||
|
go.sum
|
||||||
|
Dockerfile
|
32
csi/Dockerfile
Normal file
32
csi/Dockerfile
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Compile stage
|
||||||
|
FROM golang:buster AS build
|
||||||
|
|
||||||
|
ADD go.mod /app/
|
||||||
|
RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go mod download -x
|
||||||
|
ADD . /app
|
||||||
|
RUN perl -i -e '$/ = undef; while(<>) { s/\n\s*(\{\s*\n)/$1\n/g; s/\}(\s*\n\s*)else\b/$1} else/g; print; }' `find /app -name '*.go'`
|
||||||
|
RUN cd /app; CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o vitastor-csi
|
||||||
|
|
||||||
|
# Final stage
|
||||||
|
FROM debian:buster
|
||||||
|
|
||||||
|
LABEL maintainers="Vitaliy Filippov <vitalif@yourcmc.ru>"
|
||||||
|
LABEL description="Vitastor CSI Driver"
|
||||||
|
|
||||||
|
ENV NODE_ID=""
|
||||||
|
ENV CSI_ENDPOINT=""
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y wget && \
|
||||||
|
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
|
||||||
|
(echo deb http://vitastor.io/debian buster main > /etc/apt/sources.list.d/vitastor.list) && \
|
||||||
|
(echo deb http://deb.debian.org/debian buster-backports main > /etc/apt/sources.list.d/backports.list) && \
|
||||||
|
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y e2fsprogs xfsprogs vitastor kmod && \
|
||||||
|
apt-get clean && \
|
||||||
|
(echo options nbd nbds_max=128 > /etc/modprobe.d/nbd.conf)
|
||||||
|
|
||||||
|
COPY --from=build /app/vitastor-csi /bin/
|
||||||
|
|
||||||
|
ENTRYPOINT ["/bin/vitastor-csi"]
|
9
csi/Makefile
Normal file
9
csi/Makefile
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
VERSION ?= v0.6.4
|
||||||
|
|
||||||
|
all: build push
|
||||||
|
|
||||||
|
build:
|
||||||
|
@docker build --rm -t vitalif/vitastor-csi:$(VERSION) .
|
||||||
|
|
||||||
|
push:
|
||||||
|
@docker push vitalif/vitastor-csi:$(VERSION)
|
5
csi/deploy/000-csi-namespace.yaml
Normal file
5
csi/deploy/000-csi-namespace.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: vitastor-system
|
9
csi/deploy/001-csi-config-map.yaml
Normal file
9
csi/deploy/001-csi-config-map.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
data:
|
||||||
|
vitastor.conf: |-
|
||||||
|
{"etcd_address":"http://192.168.7.2:2379","etcd_prefix":"/vitastor"}
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-config
|
37
csi/deploy/002-csi-nodeplugin-rbac.yaml
Normal file
37
csi/deploy/002-csi-nodeplugin-rbac.yaml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-nodeplugin
|
||||||
|
---
|
||||||
|
kind: ClusterRole
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-nodeplugin
|
||||||
|
rules:
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["nodes"]
|
||||||
|
verbs: ["get"]
|
||||||
|
# allow to read Vault Token and connection options from the Tenants namespace
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["secrets"]
|
||||||
|
verbs: ["get"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["configmaps"]
|
||||||
|
verbs: ["get"]
|
||||||
|
---
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-nodeplugin
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: vitastor-csi-nodeplugin
|
||||||
|
namespace: vitastor-system
|
||||||
|
roleRef:
|
||||||
|
kind: ClusterRole
|
||||||
|
name: vitastor-csi-nodeplugin
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
72
csi/deploy/003-csi-nodeplugin-psp.yaml
Normal file
72
csi/deploy/003-csi-nodeplugin-psp.yaml
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
---
|
||||||
|
apiVersion: policy/v1beta1
|
||||||
|
kind: PodSecurityPolicy
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-nodeplugin-psp
|
||||||
|
spec:
|
||||||
|
allowPrivilegeEscalation: true
|
||||||
|
allowedCapabilities:
|
||||||
|
- 'SYS_ADMIN'
|
||||||
|
fsGroup:
|
||||||
|
rule: RunAsAny
|
||||||
|
privileged: true
|
||||||
|
hostNetwork: true
|
||||||
|
hostPID: true
|
||||||
|
runAsUser:
|
||||||
|
rule: RunAsAny
|
||||||
|
seLinux:
|
||||||
|
rule: RunAsAny
|
||||||
|
supplementalGroups:
|
||||||
|
rule: RunAsAny
|
||||||
|
volumes:
|
||||||
|
- 'configMap'
|
||||||
|
- 'emptyDir'
|
||||||
|
- 'projected'
|
||||||
|
- 'secret'
|
||||||
|
- 'downwardAPI'
|
||||||
|
- 'hostPath'
|
||||||
|
allowedHostPaths:
|
||||||
|
- pathPrefix: '/dev'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/run/mount'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/sys'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/lib/modules'
|
||||||
|
readOnly: true
|
||||||
|
- pathPrefix: '/var/lib/kubelet/pods'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/var/lib/kubelet/plugins/csi.vitastor.io'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/var/lib/kubelet/plugins_registry'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/var/lib/kubelet/plugins'
|
||||||
|
readOnly: false
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: Role
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-nodeplugin-psp
|
||||||
|
rules:
|
||||||
|
- apiGroups: ['policy']
|
||||||
|
resources: ['podsecuritypolicies']
|
||||||
|
verbs: ['use']
|
||||||
|
resourceNames: ['vitastor-csi-nodeplugin-psp']
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: RoleBinding
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-nodeplugin-psp
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: vitastor-csi-nodeplugin
|
||||||
|
namespace: vitastor-system
|
||||||
|
roleRef:
|
||||||
|
kind: Role
|
||||||
|
name: vitastor-csi-nodeplugin-psp
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
140
csi/deploy/004-csi-nodeplugin.yaml
Normal file
140
csi/deploy/004-csi-nodeplugin.yaml
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
---
|
||||||
|
kind: DaemonSet
|
||||||
|
apiVersion: apps/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: csi-vitastor
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: csi-vitastor
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
labels:
|
||||||
|
app: csi-vitastor
|
||||||
|
spec:
|
||||||
|
serviceAccountName: vitastor-csi-nodeplugin
|
||||||
|
hostNetwork: true
|
||||||
|
hostPID: true
|
||||||
|
priorityClassName: system-node-critical
|
||||||
|
# to use e.g. Rook orchestrated cluster, and mons' FQDN is
|
||||||
|
# resolved through k8s service, set dns policy to cluster first
|
||||||
|
dnsPolicy: ClusterFirstWithHostNet
|
||||||
|
containers:
|
||||||
|
- name: driver-registrar
|
||||||
|
# This is necessary only for systems with SELinux, where
|
||||||
|
# non-privileged sidecar containers cannot access unix domain socket
|
||||||
|
# created by privileged CSI driver container.
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
image: k8s.gcr.io/sig-storage/csi-node-driver-registrar:v2.2.0
|
||||||
|
args:
|
||||||
|
- "--v=5"
|
||||||
|
- "--csi-address=/csi/csi.sock"
|
||||||
|
- "--kubelet-registration-path=/var/lib/kubelet/plugins/csi.vitastor.io/csi.sock"
|
||||||
|
env:
|
||||||
|
- name: KUBE_NODE_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
volumeMounts:
|
||||||
|
- name: socket-dir
|
||||||
|
mountPath: /csi
|
||||||
|
- name: registration-dir
|
||||||
|
mountPath: /registration
|
||||||
|
- name: csi-vitastor
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
capabilities:
|
||||||
|
add: ["SYS_ADMIN"]
|
||||||
|
allowPrivilegeEscalation: true
|
||||||
|
image: vitalif/vitastor-csi:v0.6.4
|
||||||
|
args:
|
||||||
|
- "--node=$(NODE_ID)"
|
||||||
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
env:
|
||||||
|
- name: NODE_ID
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
- name: CSI_ENDPOINT
|
||||||
|
value: unix:///csi/csi.sock
|
||||||
|
imagePullPolicy: "IfNotPresent"
|
||||||
|
ports:
|
||||||
|
- containerPort: 9898
|
||||||
|
name: healthz
|
||||||
|
protocol: TCP
|
||||||
|
livenessProbe:
|
||||||
|
failureThreshold: 5
|
||||||
|
httpGet:
|
||||||
|
path: /healthz
|
||||||
|
port: healthz
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
timeoutSeconds: 3
|
||||||
|
periodSeconds: 2
|
||||||
|
volumeMounts:
|
||||||
|
- name: socket-dir
|
||||||
|
mountPath: /csi
|
||||||
|
- mountPath: /dev
|
||||||
|
name: host-dev
|
||||||
|
- mountPath: /sys
|
||||||
|
name: host-sys
|
||||||
|
- mountPath: /run/mount
|
||||||
|
name: host-mount
|
||||||
|
- mountPath: /lib/modules
|
||||||
|
name: lib-modules
|
||||||
|
readOnly: true
|
||||||
|
- name: vitastor-config
|
||||||
|
mountPath: /etc/vitastor
|
||||||
|
- name: plugin-dir
|
||||||
|
mountPath: /var/lib/kubelet/plugins
|
||||||
|
mountPropagation: "Bidirectional"
|
||||||
|
- name: mountpoint-dir
|
||||||
|
mountPath: /var/lib/kubelet/pods
|
||||||
|
mountPropagation: "Bidirectional"
|
||||||
|
- name: liveness-probe
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
image: quay.io/k8scsi/livenessprobe:v1.1.0
|
||||||
|
args:
|
||||||
|
- "--csi-address=$(CSI_ENDPOINT)"
|
||||||
|
- "--health-port=9898"
|
||||||
|
env:
|
||||||
|
- name: CSI_ENDPOINT
|
||||||
|
value: unix://csi/csi.sock
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /csi
|
||||||
|
name: socket-dir
|
||||||
|
volumes:
|
||||||
|
- name: socket-dir
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/kubelet/plugins/csi.vitastor.io
|
||||||
|
type: DirectoryOrCreate
|
||||||
|
- name: plugin-dir
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/kubelet/plugins
|
||||||
|
type: Directory
|
||||||
|
- name: mountpoint-dir
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/kubelet/pods
|
||||||
|
type: DirectoryOrCreate
|
||||||
|
- name: registration-dir
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/kubelet/plugins_registry/
|
||||||
|
type: Directory
|
||||||
|
- name: host-dev
|
||||||
|
hostPath:
|
||||||
|
path: /dev
|
||||||
|
- name: host-sys
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
|
- name: host-mount
|
||||||
|
hostPath:
|
||||||
|
path: /run/mount
|
||||||
|
- name: lib-modules
|
||||||
|
hostPath:
|
||||||
|
path: /lib/modules
|
||||||
|
- name: vitastor-config
|
||||||
|
configMap:
|
||||||
|
name: vitastor-config
|
102
csi/deploy/005-csi-provisioner-rbac.yaml
Normal file
102
csi/deploy/005-csi-provisioner-rbac.yaml
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-provisioner
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: ClusterRole
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-external-provisioner-runner
|
||||||
|
rules:
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["nodes"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["secrets"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["events"]
|
||||||
|
verbs: ["list", "watch", "create", "update", "patch"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["persistentvolumes"]
|
||||||
|
verbs: ["get", "list", "watch", "create", "update", "delete", "patch"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["persistentvolumeclaims"]
|
||||||
|
verbs: ["get", "list", "watch", "update"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["persistentvolumeclaims/status"]
|
||||||
|
verbs: ["update", "patch"]
|
||||||
|
- apiGroups: ["storage.k8s.io"]
|
||||||
|
resources: ["storageclasses"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
- apiGroups: ["snapshot.storage.k8s.io"]
|
||||||
|
resources: ["volumesnapshots"]
|
||||||
|
verbs: ["get", "list"]
|
||||||
|
- apiGroups: ["snapshot.storage.k8s.io"]
|
||||||
|
resources: ["volumesnapshotcontents"]
|
||||||
|
verbs: ["create", "get", "list", "watch", "update", "delete"]
|
||||||
|
- apiGroups: ["snapshot.storage.k8s.io"]
|
||||||
|
resources: ["volumesnapshotclasses"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
- apiGroups: ["storage.k8s.io"]
|
||||||
|
resources: ["volumeattachments"]
|
||||||
|
verbs: ["get", "list", "watch", "update", "patch"]
|
||||||
|
- apiGroups: ["storage.k8s.io"]
|
||||||
|
resources: ["volumeattachments/status"]
|
||||||
|
verbs: ["patch"]
|
||||||
|
- apiGroups: ["storage.k8s.io"]
|
||||||
|
resources: ["csinodes"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
- apiGroups: ["snapshot.storage.k8s.io"]
|
||||||
|
resources: ["volumesnapshotcontents/status"]
|
||||||
|
verbs: ["update"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["configmaps"]
|
||||||
|
verbs: ["get"]
|
||||||
|
---
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-provisioner-role
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: vitastor-csi-provisioner
|
||||||
|
namespace: vitastor-system
|
||||||
|
roleRef:
|
||||||
|
kind: ClusterRole
|
||||||
|
name: vitastor-external-provisioner-runner
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: Role
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-external-provisioner-cfg
|
||||||
|
rules:
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["configmaps"]
|
||||||
|
verbs: ["get", "list", "watch", "create", "update", "delete"]
|
||||||
|
- apiGroups: ["coordination.k8s.io"]
|
||||||
|
resources: ["leases"]
|
||||||
|
verbs: ["get", "watch", "list", "delete", "update", "create"]
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: RoleBinding
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
name: vitastor-csi-provisioner-role-cfg
|
||||||
|
namespace: vitastor-system
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: vitastor-csi-provisioner
|
||||||
|
namespace: vitastor-system
|
||||||
|
roleRef:
|
||||||
|
kind: Role
|
||||||
|
name: vitastor-external-provisioner-cfg
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
60
csi/deploy/006-csi-provisioner-psp.yaml
Normal file
60
csi/deploy/006-csi-provisioner-psp.yaml
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
---
|
||||||
|
apiVersion: policy/v1beta1
|
||||||
|
kind: PodSecurityPolicy
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-provisioner-psp
|
||||||
|
spec:
|
||||||
|
allowPrivilegeEscalation: true
|
||||||
|
allowedCapabilities:
|
||||||
|
- 'SYS_ADMIN'
|
||||||
|
fsGroup:
|
||||||
|
rule: RunAsAny
|
||||||
|
privileged: true
|
||||||
|
runAsUser:
|
||||||
|
rule: RunAsAny
|
||||||
|
seLinux:
|
||||||
|
rule: RunAsAny
|
||||||
|
supplementalGroups:
|
||||||
|
rule: RunAsAny
|
||||||
|
volumes:
|
||||||
|
- 'configMap'
|
||||||
|
- 'emptyDir'
|
||||||
|
- 'projected'
|
||||||
|
- 'secret'
|
||||||
|
- 'downwardAPI'
|
||||||
|
- 'hostPath'
|
||||||
|
allowedHostPaths:
|
||||||
|
- pathPrefix: '/dev'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/sys'
|
||||||
|
readOnly: false
|
||||||
|
- pathPrefix: '/lib/modules'
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: Role
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor-csi-provisioner-psp
|
||||||
|
rules:
|
||||||
|
- apiGroups: ['policy']
|
||||||
|
resources: ['podsecuritypolicies']
|
||||||
|
verbs: ['use']
|
||||||
|
resourceNames: ['vitastor-csi-provisioner-psp']
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: RoleBinding
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
name: vitastor-csi-provisioner-psp
|
||||||
|
namespace: vitastor-system
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: vitastor-csi-provisioner
|
||||||
|
namespace: vitastor-system
|
||||||
|
roleRef:
|
||||||
|
kind: Role
|
||||||
|
name: vitastor-csi-provisioner-psp
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
159
csi/deploy/007-csi-provisioner.yaml
Normal file
159
csi/deploy/007-csi-provisioner.yaml
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
---
|
||||||
|
kind: Service
|
||||||
|
apiVersion: v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: csi-vitastor-provisioner
|
||||||
|
labels:
|
||||||
|
app: csi-metrics
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app: csi-vitastor-provisioner
|
||||||
|
ports:
|
||||||
|
- name: http-metrics
|
||||||
|
port: 8080
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 8680
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: Deployment
|
||||||
|
apiVersion: apps/v1
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: csi-vitastor-provisioner
|
||||||
|
spec:
|
||||||
|
replicas: 3
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: csi-vitastor-provisioner
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
labels:
|
||||||
|
app: csi-vitastor-provisioner
|
||||||
|
spec:
|
||||||
|
affinity:
|
||||||
|
podAntiAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- labelSelector:
|
||||||
|
matchExpressions:
|
||||||
|
- key: app
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- csi-vitastor-provisioner
|
||||||
|
topologyKey: "kubernetes.io/hostname"
|
||||||
|
serviceAccountName: vitastor-csi-provisioner
|
||||||
|
priorityClassName: system-cluster-critical
|
||||||
|
containers:
|
||||||
|
- name: csi-provisioner
|
||||||
|
image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
|
||||||
|
args:
|
||||||
|
- "--csi-address=$(ADDRESS)"
|
||||||
|
- "--v=5"
|
||||||
|
- "--timeout=150s"
|
||||||
|
- "--retry-interval-start=500ms"
|
||||||
|
- "--leader-election=true"
|
||||||
|
# set it to true to use topology based provisioning
|
||||||
|
- "--feature-gates=Topology=false"
|
||||||
|
# if fstype is not specified in storageclass, ext4 is default
|
||||||
|
- "--default-fstype=ext4"
|
||||||
|
- "--extra-create-metadata=true"
|
||||||
|
env:
|
||||||
|
- name: ADDRESS
|
||||||
|
value: unix:///csi/csi-provisioner.sock
|
||||||
|
imagePullPolicy: "IfNotPresent"
|
||||||
|
volumeMounts:
|
||||||
|
- name: socket-dir
|
||||||
|
mountPath: /csi
|
||||||
|
- name: csi-snapshotter
|
||||||
|
image: k8s.gcr.io/sig-storage/csi-snapshotter:v4.0.0
|
||||||
|
args:
|
||||||
|
- "--csi-address=$(ADDRESS)"
|
||||||
|
- "--v=5"
|
||||||
|
- "--timeout=150s"
|
||||||
|
- "--leader-election=true"
|
||||||
|
env:
|
||||||
|
- name: ADDRESS
|
||||||
|
value: unix:///csi/csi-provisioner.sock
|
||||||
|
imagePullPolicy: "IfNotPresent"
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
volumeMounts:
|
||||||
|
- name: socket-dir
|
||||||
|
mountPath: /csi
|
||||||
|
- name: csi-attacher
|
||||||
|
image: k8s.gcr.io/sig-storage/csi-attacher:v3.1.0
|
||||||
|
args:
|
||||||
|
- "--v=5"
|
||||||
|
- "--csi-address=$(ADDRESS)"
|
||||||
|
- "--leader-election=true"
|
||||||
|
- "--retry-interval-start=500ms"
|
||||||
|
env:
|
||||||
|
- name: ADDRESS
|
||||||
|
value: /csi/csi-provisioner.sock
|
||||||
|
imagePullPolicy: "IfNotPresent"
|
||||||
|
volumeMounts:
|
||||||
|
- name: socket-dir
|
||||||
|
mountPath: /csi
|
||||||
|
- name: csi-resizer
|
||||||
|
image: k8s.gcr.io/sig-storage/csi-resizer:v1.1.0
|
||||||
|
args:
|
||||||
|
- "--csi-address=$(ADDRESS)"
|
||||||
|
- "--v=5"
|
||||||
|
- "--timeout=150s"
|
||||||
|
- "--leader-election"
|
||||||
|
- "--retry-interval-start=500ms"
|
||||||
|
- "--handle-volume-inuse-error=false"
|
||||||
|
env:
|
||||||
|
- name: ADDRESS
|
||||||
|
value: unix:///csi/csi-provisioner.sock
|
||||||
|
imagePullPolicy: "IfNotPresent"
|
||||||
|
volumeMounts:
|
||||||
|
- name: socket-dir
|
||||||
|
mountPath: /csi
|
||||||
|
- name: csi-vitastor
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
capabilities:
|
||||||
|
add: ["SYS_ADMIN"]
|
||||||
|
image: vitalif/vitastor-csi:v0.6.4
|
||||||
|
args:
|
||||||
|
- "--node=$(NODE_ID)"
|
||||||
|
- "--endpoint=$(CSI_ENDPOINT)"
|
||||||
|
env:
|
||||||
|
- name: NODE_ID
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
- name: CSI_ENDPOINT
|
||||||
|
value: unix:///csi/csi-provisioner.sock
|
||||||
|
imagePullPolicy: "IfNotPresent"
|
||||||
|
volumeMounts:
|
||||||
|
- name: socket-dir
|
||||||
|
mountPath: /csi
|
||||||
|
- mountPath: /dev
|
||||||
|
name: host-dev
|
||||||
|
- mountPath: /sys
|
||||||
|
name: host-sys
|
||||||
|
- mountPath: /lib/modules
|
||||||
|
name: lib-modules
|
||||||
|
readOnly: true
|
||||||
|
- name: vitastor-config
|
||||||
|
mountPath: /etc/vitastor
|
||||||
|
volumes:
|
||||||
|
- name: host-dev
|
||||||
|
hostPath:
|
||||||
|
path: /dev
|
||||||
|
- name: host-sys
|
||||||
|
hostPath:
|
||||||
|
path: /sys
|
||||||
|
- name: lib-modules
|
||||||
|
hostPath:
|
||||||
|
path: /lib/modules
|
||||||
|
- name: socket-dir
|
||||||
|
emptyDir: {
|
||||||
|
medium: "Memory"
|
||||||
|
}
|
||||||
|
- name: vitastor-config
|
||||||
|
configMap:
|
||||||
|
name: vitastor-config
|
11
csi/deploy/008-csi-driver.yaml
Normal file
11
csi/deploy/008-csi-driver.yaml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
# if Kubernetes version is less than 1.18 change
|
||||||
|
# apiVersion to storage.k8s.io/v1betav1
|
||||||
|
apiVersion: storage.k8s.io/v1
|
||||||
|
kind: CSIDriver
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: csi.vitastor.io
|
||||||
|
spec:
|
||||||
|
attachRequired: true
|
||||||
|
podInfoOnMount: false
|
19
csi/deploy/009-storage-class.yaml
Normal file
19
csi/deploy/009-storage-class.yaml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
---
|
||||||
|
apiVersion: storage.k8s.io/v1
|
||||||
|
kind: StorageClass
|
||||||
|
metadata:
|
||||||
|
namespace: vitastor-system
|
||||||
|
name: vitastor
|
||||||
|
annotations:
|
||||||
|
storageclass.kubernetes.io/is-default-class: "true"
|
||||||
|
provisioner: csi.vitastor.io
|
||||||
|
volumeBindingMode: Immediate
|
||||||
|
parameters:
|
||||||
|
etcdVolumePrefix: ""
|
||||||
|
poolId: "1"
|
||||||
|
# you can choose other configuration file if you have it in the config map
|
||||||
|
#configPath: "/etc/vitastor/vitastor.conf"
|
||||||
|
# you can also specify etcdUrl here, maybe to connect to another Vitastor cluster
|
||||||
|
# multiple etcdUrls may be specified, delimited by comma
|
||||||
|
#etcdUrl: "http://192.168.7.2:2379"
|
||||||
|
#etcdPrefix: "/vitastor"
|
12
csi/deploy/example-pvc.yaml
Normal file
12
csi/deploy/example-pvc.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: test-vitastor-pvc
|
||||||
|
spec:
|
||||||
|
storageClassName: vitastor
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 10Gi
|
35
csi/go.mod
Normal file
35
csi/go.mod
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
module vitastor.io/csi
|
||||||
|
|
||||||
|
go 1.15
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/container-storage-interface/spec v1.4.0
|
||||||
|
github.com/coreos/bbolt v0.0.0-00010101000000-000000000000 // indirect
|
||||||
|
github.com/coreos/etcd v3.3.25+incompatible // indirect
|
||||||
|
github.com/coreos/go-semver v0.3.0 // indirect
|
||||||
|
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf // indirect
|
||||||
|
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f // indirect
|
||||||
|
github.com/dustin/go-humanize v1.0.0 // indirect
|
||||||
|
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
|
||||||
|
github.com/gorilla/websocket v1.4.2 // indirect
|
||||||
|
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 // indirect
|
||||||
|
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
|
||||||
|
github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect
|
||||||
|
github.com/jonboulle/clockwork v0.2.2 // indirect
|
||||||
|
github.com/kubernetes-csi/csi-lib-utils v0.9.1
|
||||||
|
github.com/soheilhy/cmux v0.1.5 // indirect
|
||||||
|
github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 // indirect
|
||||||
|
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 // indirect
|
||||||
|
go.etcd.io/bbolt v0.0.0-00010101000000-000000000000 // indirect
|
||||||
|
go.etcd.io/etcd v3.3.25+incompatible
|
||||||
|
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
|
||||||
|
google.golang.org/grpc v1.33.1
|
||||||
|
k8s.io/klog v1.0.0
|
||||||
|
k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
|
||||||
|
)
|
||||||
|
|
||||||
|
replace github.com/coreos/bbolt => go.etcd.io/bbolt v1.3.5
|
||||||
|
|
||||||
|
replace go.etcd.io/bbolt => github.com/coreos/bbolt v1.3.5
|
||||||
|
|
||||||
|
replace google.golang.org/grpc => google.golang.org/grpc v1.25.1
|
22
csi/src/config.go
Normal file
22
csi/src/config.go
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
package vitastor
|
||||||
|
|
||||||
|
const (
|
||||||
|
vitastorCSIDriverName = "csi.vitastor.io"
|
||||||
|
vitastorCSIDriverVersion = "0.6.4"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Config struct fills the parameters of request or user input
|
||||||
|
type Config struct
|
||||||
|
{
|
||||||
|
Endpoint string
|
||||||
|
NodeID string
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewConfig returns config struct to initialize new driver
|
||||||
|
func NewConfig() *Config
|
||||||
|
{
|
||||||
|
return &Config{}
|
||||||
|
}
|
530
csi/src/controllerserver.go
Normal file
530
csi/src/controllerserver.go
Normal file
@@ -0,0 +1,530 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
package vitastor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"bytes"
|
||||||
|
"strconv"
|
||||||
|
"time"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"io/ioutil"
|
||||||
|
|
||||||
|
"github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
|
||||||
|
"k8s.io/klog"
|
||||||
|
|
||||||
|
"google.golang.org/grpc/codes"
|
||||||
|
"google.golang.org/grpc/status"
|
||||||
|
|
||||||
|
"go.etcd.io/etcd/clientv3"
|
||||||
|
|
||||||
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
KB int64 = 1024
|
||||||
|
MB int64 = 1024 * KB
|
||||||
|
GB int64 = 1024 * MB
|
||||||
|
TB int64 = 1024 * GB
|
||||||
|
ETCD_TIMEOUT time.Duration = 15*time.Second
|
||||||
|
)
|
||||||
|
|
||||||
|
type InodeIndex struct
|
||||||
|
{
|
||||||
|
Id uint64 `json:"id"`
|
||||||
|
PoolId uint64 `json:"pool_id"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type InodeConfig struct
|
||||||
|
{
|
||||||
|
Name string `json:"name"`
|
||||||
|
Size uint64 `json:"size,omitempty"`
|
||||||
|
ParentPool uint64 `json:"parent_pool,omitempty"`
|
||||||
|
ParentId uint64 `json:"parent_id,omitempty"`
|
||||||
|
Readonly bool `json:"readonly,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type ControllerServer struct
|
||||||
|
{
|
||||||
|
*Driver
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewControllerServer create new instance controller
|
||||||
|
func NewControllerServer(driver *Driver) *ControllerServer
|
||||||
|
{
|
||||||
|
return &ControllerServer{
|
||||||
|
Driver: driver,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetConnectionParams(params map[string]string) (map[string]string, []string, string)
|
||||||
|
{
|
||||||
|
ctxVars := make(map[string]string)
|
||||||
|
configPath := params["configPath"]
|
||||||
|
if (configPath == "")
|
||||||
|
{
|
||||||
|
configPath = "/etc/vitastor/vitastor.conf"
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ctxVars["configPath"] = configPath
|
||||||
|
}
|
||||||
|
config := make(map[string]interface{})
|
||||||
|
if configFD, err := os.Open(configPath); err == nil
|
||||||
|
{
|
||||||
|
defer configFD.Close()
|
||||||
|
data, _ := ioutil.ReadAll(configFD)
|
||||||
|
json.Unmarshal(data, &config)
|
||||||
|
}
|
||||||
|
// Try to load prefix & etcd URL from the config
|
||||||
|
var etcdUrl []string
|
||||||
|
if (params["etcdUrl"] != "")
|
||||||
|
{
|
||||||
|
ctxVars["etcdUrl"] = params["etcdUrl"]
|
||||||
|
etcdUrl = strings.Split(params["etcdUrl"], ",")
|
||||||
|
}
|
||||||
|
if (len(etcdUrl) == 0)
|
||||||
|
{
|
||||||
|
switch config["etcd_address"].(type)
|
||||||
|
{
|
||||||
|
case string:
|
||||||
|
etcdUrl = strings.Split(config["etcd_address"].(string), ",")
|
||||||
|
case []string:
|
||||||
|
etcdUrl = config["etcd_address"].([]string)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
etcdPrefix := params["etcdPrefix"]
|
||||||
|
if (etcdPrefix == "")
|
||||||
|
{
|
||||||
|
etcdPrefix, _ = config["etcd_prefix"].(string)
|
||||||
|
if (etcdPrefix == "")
|
||||||
|
{
|
||||||
|
etcdPrefix = "/vitastor"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ctxVars["etcdPrefix"] = etcdPrefix
|
||||||
|
}
|
||||||
|
return ctxVars, etcdUrl, etcdPrefix
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the volume
|
||||||
|
func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received controller create volume request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
if (req == nil)
|
||||||
|
{
|
||||||
|
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
|
||||||
|
}
|
||||||
|
if (req.GetName() == "")
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "name is a required field")
|
||||||
|
}
|
||||||
|
volumeCapabilities := req.GetVolumeCapabilities()
|
||||||
|
if (volumeCapabilities == nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "volume capabilities is a required field")
|
||||||
|
}
|
||||||
|
|
||||||
|
etcdVolumePrefix := req.Parameters["etcdVolumePrefix"]
|
||||||
|
poolId, _ := strconv.ParseUint(req.Parameters["poolId"], 10, 64)
|
||||||
|
if (poolId == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "poolId is missing in storage class configuration")
|
||||||
|
}
|
||||||
|
|
||||||
|
volName := etcdVolumePrefix + req.GetName()
|
||||||
|
volSize := 1 * GB
|
||||||
|
if capRange := req.GetCapacityRange(); capRange != nil
|
||||||
|
{
|
||||||
|
volSize = ((capRange.GetRequiredBytes() + MB - 1) / MB) * MB
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: The following should PROBABLY be implemented externally in a management tool
|
||||||
|
|
||||||
|
ctxVars, etcdUrl, etcdPrefix := GetConnectionParams(req.Parameters)
|
||||||
|
if (len(etcdUrl) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Connect to etcd
|
||||||
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
DialTimeout: ETCD_TIMEOUT,
|
||||||
|
Endpoints: etcdUrl,
|
||||||
|
})
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||||
|
}
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
|
var imageId uint64 = 0
|
||||||
|
for
|
||||||
|
{
|
||||||
|
// Check if the image exists
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) > 0)
|
||||||
|
{
|
||||||
|
kv := resp.Kvs[0]
|
||||||
|
var v InodeIndex
|
||||||
|
err := json.Unmarshal(kv.Value, &v)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
poolId = v.PoolId
|
||||||
|
imageId = v.Id
|
||||||
|
inodeCfgKey := fmt.Sprintf("/config/inode/%d/%d", poolId, imageId)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+inodeCfgKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "missing "+inodeCfgKey+" key in etcd")
|
||||||
|
}
|
||||||
|
var inodeCfg InodeConfig
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (inodeCfg.Size < uint64(volSize))
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Find a free ID
|
||||||
|
// Create image metadata in a transaction verifying that the image doesn't exist yet AND ID is still free
|
||||||
|
maxIdKey := fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, maxIdKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
var modRev int64
|
||||||
|
var nextId uint64
|
||||||
|
if (len(resp.Kvs) > 0)
|
||||||
|
{
|
||||||
|
var err error
|
||||||
|
nextId, err = strconv.ParseUint(string(resp.Kvs[0].Value), 10, 64)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, maxIdKey+" contains invalid ID")
|
||||||
|
}
|
||||||
|
modRev = resp.Kvs[0].ModRevision
|
||||||
|
nextId++
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
nextId = 1
|
||||||
|
}
|
||||||
|
inodeIdxJson, _ := json.Marshal(InodeIndex{
|
||||||
|
Id: nextId,
|
||||||
|
PoolId: poolId,
|
||||||
|
})
|
||||||
|
inodeCfgJson, _ := json.Marshal(InodeConfig{
|
||||||
|
Name: volName,
|
||||||
|
Size: uint64(volSize),
|
||||||
|
})
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
txnResp, err := cli.Txn(ctx).If(
|
||||||
|
clientv3.Compare(clientv3.ModRevision(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId)), "=", modRev),
|
||||||
|
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)), "=", 0),
|
||||||
|
clientv3.Compare(clientv3.CreateRevision(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId)), "=", 0),
|
||||||
|
).Then(
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/index/maxid/%d", etcdPrefix, poolId), fmt.Sprintf("%d", nextId)),
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName), string(inodeIdxJson)),
|
||||||
|
clientv3.OpPut(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, poolId, nextId), string(inodeCfgJson)),
|
||||||
|
).Commit()
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to commit transaction in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (txnResp.Succeeded)
|
||||||
|
{
|
||||||
|
imageId = nextId
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Start over if the transaction fails
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctxVars["name"] = volName
|
||||||
|
volumeIdJson, _ := json.Marshal(ctxVars)
|
||||||
|
return &csi.CreateVolumeResponse{
|
||||||
|
Volume: &csi.Volume{
|
||||||
|
// Ugly, but VolumeContext isn't passed to DeleteVolume :-(
|
||||||
|
VolumeId: string(volumeIdJson),
|
||||||
|
CapacityBytes: volSize,
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteVolume deletes the given volume
|
||||||
|
func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequest) (*csi.DeleteVolumeResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received controller delete volume request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
if (req == nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
|
||||||
|
}
|
||||||
|
|
||||||
|
ctxVars := make(map[string]string)
|
||||||
|
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
|
||||||
|
}
|
||||||
|
volName := ctxVars["name"]
|
||||||
|
|
||||||
|
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
||||||
|
if (len(etcdUrl) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
|
}
|
||||||
|
|
||||||
|
cli, err := clientv3.New(clientv3.Config{
|
||||||
|
DialTimeout: ETCD_TIMEOUT,
|
||||||
|
Endpoints: etcdUrl,
|
||||||
|
})
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to connect to etcd at "+strings.Join(etcdUrl, ",")+": "+err.Error())
|
||||||
|
}
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
|
// Find inode by name
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err := cli.Get(ctx, etcdPrefix+"/index/image/"+volName)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||||
|
}
|
||||||
|
var idx InodeIndex
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &idx)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid /index/image/"+volName+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get inode config
|
||||||
|
inodeCfgKey := fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
resp, err = cli.Get(ctx, inodeCfgKey)
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to read key from etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (len(resp.Kvs) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "volume "+volName+" does not exist")
|
||||||
|
}
|
||||||
|
var inodeCfg InodeConfig
|
||||||
|
err = json.Unmarshal(resp.Kvs[0].Value, &inodeCfg)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "invalid "+inodeCfgKey+" key in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete inode data by invoking vitastor-rm
|
||||||
|
args := []string{
|
||||||
|
"--etcd_address", strings.Join(etcdUrl, ","),
|
||||||
|
"--pool", fmt.Sprintf("%d", idx.PoolId),
|
||||||
|
"--inode", fmt.Sprintf("%d", idx.Id),
|
||||||
|
}
|
||||||
|
if (ctxVars["configPath"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--config_path", ctxVars["configPath"])
|
||||||
|
}
|
||||||
|
c := exec.Command("/usr/bin/vitastor-rm", args...)
|
||||||
|
var stderr bytes.Buffer
|
||||||
|
c.Stdout = nil
|
||||||
|
c.Stderr = &stderr
|
||||||
|
err = c.Run()
|
||||||
|
stderrStr := string(stderr.Bytes())
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("vitastor-rm failed: %s, status %s\n", stderrStr, err)
|
||||||
|
return nil, status.Error(codes.Internal, stderrStr+" (status "+err.Error()+")")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete inode config in etcd
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), ETCD_TIMEOUT)
|
||||||
|
txnResp, err := cli.Txn(ctx).Then(
|
||||||
|
clientv3.OpDelete(fmt.Sprintf("%s/index/image/%s", etcdPrefix, volName)),
|
||||||
|
clientv3.OpDelete(fmt.Sprintf("%s/config/inode/%d/%d", etcdPrefix, idx.PoolId, idx.Id)),
|
||||||
|
).Commit()
|
||||||
|
cancel()
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: "+err.Error())
|
||||||
|
}
|
||||||
|
if (!txnResp.Succeeded)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "failed to delete keys in etcd: transaction failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
return &csi.DeleteVolumeResponse{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ControllerPublishVolume return Unimplemented error
|
||||||
|
func (cs *ControllerServer) ControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (*csi.ControllerPublishVolumeResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ControllerUnpublishVolume return Unimplemented error
|
||||||
|
func (cs *ControllerServer) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (*csi.ControllerUnpublishVolumeResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateVolumeCapabilities checks whether the volume capabilities requested are supported.
|
||||||
|
func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req *csi.ValidateVolumeCapabilitiesRequest) (*csi.ValidateVolumeCapabilitiesResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received controller validate volume capability request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
if (req == nil)
|
||||||
|
{
|
||||||
|
return nil, status.Errorf(codes.InvalidArgument, "request is nil")
|
||||||
|
}
|
||||||
|
volumeID := req.GetVolumeId()
|
||||||
|
if (volumeID == "")
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "volumeId is nil")
|
||||||
|
}
|
||||||
|
volumeCapabilities := req.GetVolumeCapabilities()
|
||||||
|
if (volumeCapabilities == nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "volumeCapabilities is nil")
|
||||||
|
}
|
||||||
|
|
||||||
|
var volumeCapabilityAccessModes []*csi.VolumeCapability_AccessMode
|
||||||
|
for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
|
||||||
|
csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
|
||||||
|
csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
|
||||||
|
} {
|
||||||
|
volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
|
||||||
|
}
|
||||||
|
|
||||||
|
capabilitySupport := false
|
||||||
|
for _, capability := range volumeCapabilities
|
||||||
|
{
|
||||||
|
for _, volumeCapabilityAccessMode := range volumeCapabilityAccessModes
|
||||||
|
{
|
||||||
|
if (volumeCapabilityAccessMode.Mode == capability.AccessMode.Mode)
|
||||||
|
{
|
||||||
|
capabilitySupport = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!capabilitySupport)
|
||||||
|
{
|
||||||
|
return nil, status.Errorf(codes.NotFound, "%v not supported", req.GetVolumeCapabilities())
|
||||||
|
}
|
||||||
|
|
||||||
|
return &csi.ValidateVolumeCapabilitiesResponse{
|
||||||
|
Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
|
||||||
|
VolumeCapabilities: req.VolumeCapabilities,
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListVolumes returns a list of volumes
|
||||||
|
func (cs *ControllerServer) ListVolumes(ctx context.Context, req *csi.ListVolumesRequest) (*csi.ListVolumesResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetCapacity returns the capacity of the storage pool
|
||||||
|
func (cs *ControllerServer) GetCapacity(ctx context.Context, req *csi.GetCapacityRequest) (*csi.GetCapacityResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ControllerGetCapabilities returns the capabilities of the controller service.
|
||||||
|
func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *csi.ControllerGetCapabilitiesRequest) (*csi.ControllerGetCapabilitiesResponse, error)
|
||||||
|
{
|
||||||
|
functionControllerServerCapabilities := func(cap csi.ControllerServiceCapability_RPC_Type) *csi.ControllerServiceCapability
|
||||||
|
{
|
||||||
|
return &csi.ControllerServiceCapability{
|
||||||
|
Type: &csi.ControllerServiceCapability_Rpc{
|
||||||
|
Rpc: &csi.ControllerServiceCapability_RPC{
|
||||||
|
Type: cap,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var controllerServerCapabilities []*csi.ControllerServiceCapability
|
||||||
|
for _, capability := range []csi.ControllerServiceCapability_RPC_Type{
|
||||||
|
csi.ControllerServiceCapability_RPC_CREATE_DELETE_VOLUME,
|
||||||
|
csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
|
||||||
|
csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
|
||||||
|
csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
|
||||||
|
} {
|
||||||
|
controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
|
||||||
|
}
|
||||||
|
|
||||||
|
return &csi.ControllerGetCapabilitiesResponse{
|
||||||
|
Capabilities: controllerServerCapabilities,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateSnapshot create snapshot of an existing PV
|
||||||
|
func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteSnapshot delete provided snapshot of a PV
|
||||||
|
func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListSnapshots list the snapshots of a PV
|
||||||
|
func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ControllerExpandVolume resizes a volume
|
||||||
|
func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ControllerGetVolume get volume info
|
||||||
|
func (cs *ControllerServer) ControllerGetVolume(ctx context.Context, req *csi.ControllerGetVolumeRequest) (*csi.ControllerGetVolumeResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
137
csi/src/grpc.go
Normal file
137
csi/src/grpc.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2017 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package vitastor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/golang/glog"
|
||||||
|
"golang.org/x/net/context"
|
||||||
|
"google.golang.org/grpc"
|
||||||
|
|
||||||
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
|
"github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Defines Non blocking GRPC server interfaces
|
||||||
|
type NonBlockingGRPCServer interface {
|
||||||
|
// Start services at the endpoint
|
||||||
|
Start(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer)
|
||||||
|
// Waits for the service to stop
|
||||||
|
Wait()
|
||||||
|
// Stops the service gracefully
|
||||||
|
Stop()
|
||||||
|
// Stops the service forcefully
|
||||||
|
ForceStop()
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewNonBlockingGRPCServer() NonBlockingGRPCServer {
|
||||||
|
return &nonBlockingGRPCServer{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NonBlocking server
|
||||||
|
type nonBlockingGRPCServer struct {
|
||||||
|
wg sync.WaitGroup
|
||||||
|
server *grpc.Server
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *nonBlockingGRPCServer) Start(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer) {
|
||||||
|
|
||||||
|
s.wg.Add(1)
|
||||||
|
|
||||||
|
go s.serve(endpoint, ids, cs, ns)
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *nonBlockingGRPCServer) Wait() {
|
||||||
|
s.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *nonBlockingGRPCServer) Stop() {
|
||||||
|
s.server.GracefulStop()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *nonBlockingGRPCServer) ForceStop() {
|
||||||
|
s.server.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *nonBlockingGRPCServer) serve(endpoint string, ids csi.IdentityServer, cs csi.ControllerServer, ns csi.NodeServer) {
|
||||||
|
|
||||||
|
proto, addr, err := ParseEndpoint(endpoint)
|
||||||
|
if err != nil {
|
||||||
|
glog.Fatal(err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
if proto == "unix" {
|
||||||
|
addr = "/" + addr
|
||||||
|
if err := os.Remove(addr); err != nil && !os.IsNotExist(err) {
|
||||||
|
glog.Fatalf("Failed to remove %s, error: %s", addr, err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
listener, err := net.Listen(proto, addr)
|
||||||
|
if err != nil {
|
||||||
|
glog.Fatalf("Failed to listen: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
opts := []grpc.ServerOption{
|
||||||
|
grpc.UnaryInterceptor(logGRPC),
|
||||||
|
}
|
||||||
|
server := grpc.NewServer(opts...)
|
||||||
|
s.server = server
|
||||||
|
|
||||||
|
if ids != nil {
|
||||||
|
csi.RegisterIdentityServer(server, ids)
|
||||||
|
}
|
||||||
|
if cs != nil {
|
||||||
|
csi.RegisterControllerServer(server, cs)
|
||||||
|
}
|
||||||
|
if ns != nil {
|
||||||
|
csi.RegisterNodeServer(server, ns)
|
||||||
|
}
|
||||||
|
|
||||||
|
glog.Infof("Listening for connections on address: %#v", listener.Addr())
|
||||||
|
|
||||||
|
server.Serve(listener)
|
||||||
|
}
|
||||||
|
|
||||||
|
func ParseEndpoint(ep string) (string, string, error) {
|
||||||
|
if strings.HasPrefix(strings.ToLower(ep), "unix://") || strings.HasPrefix(strings.ToLower(ep), "tcp://") {
|
||||||
|
s := strings.SplitN(ep, "://", 2)
|
||||||
|
if s[1] != "" {
|
||||||
|
return s[0], s[1], nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "", "", fmt.Errorf("Invalid endpoint: %v", ep)
|
||||||
|
}
|
||||||
|
|
||||||
|
func logGRPC(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
|
||||||
|
glog.V(3).Infof("GRPC call: %s", info.FullMethod)
|
||||||
|
glog.V(5).Infof("GRPC request: %s", protosanitizer.StripSecrets(req))
|
||||||
|
resp, err := handler(ctx, req)
|
||||||
|
if err != nil {
|
||||||
|
glog.Errorf("GRPC error: %v", err)
|
||||||
|
} else {
|
||||||
|
glog.V(5).Infof("GRPC response: %s", protosanitizer.StripSecrets(resp))
|
||||||
|
}
|
||||||
|
return resp, err
|
||||||
|
}
|
60
csi/src/identityserver.go
Normal file
60
csi/src/identityserver.go
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
package vitastor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
|
||||||
|
"k8s.io/klog"
|
||||||
|
|
||||||
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
|
)
|
||||||
|
|
||||||
|
// IdentityServer struct of Vitastor CSI driver with supported methods of CSI identity server spec.
|
||||||
|
type IdentityServer struct
|
||||||
|
{
|
||||||
|
*Driver
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewIdentityServer create new instance identity
|
||||||
|
func NewIdentityServer(driver *Driver) *IdentityServer
|
||||||
|
{
|
||||||
|
return &IdentityServer{
|
||||||
|
Driver: driver,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetPluginInfo returns metadata of the plugin
|
||||||
|
func (is *IdentityServer) GetPluginInfo(ctx context.Context, req *csi.GetPluginInfoRequest) (*csi.GetPluginInfoResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received identity plugin info request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
return &csi.GetPluginInfoResponse{
|
||||||
|
Name: vitastorCSIDriverName,
|
||||||
|
VendorVersion: vitastorCSIDriverVersion,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetPluginCapabilities returns available capabilities of the plugin
|
||||||
|
func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.GetPluginCapabilitiesRequest) (*csi.GetPluginCapabilitiesResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received identity plugin capabilities request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
return &csi.GetPluginCapabilitiesResponse{
|
||||||
|
Capabilities: []*csi.PluginCapability{
|
||||||
|
{
|
||||||
|
Type: &csi.PluginCapability_Service_{
|
||||||
|
Service: &csi.PluginCapability_Service{
|
||||||
|
Type: csi.PluginCapability_Service_CONTROLLER_SERVICE,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Probe returns the health and readiness of the plugin
|
||||||
|
func (is *IdentityServer) Probe(ctx context.Context, req *csi.ProbeRequest) (*csi.ProbeResponse, error)
|
||||||
|
{
|
||||||
|
return &csi.ProbeResponse{}, nil
|
||||||
|
}
|
279
csi/src/nodeserver.go
Normal file
279
csi/src/nodeserver.go
Normal file
@@ -0,0 +1,279 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
package vitastor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"google.golang.org/grpc/codes"
|
||||||
|
"google.golang.org/grpc/status"
|
||||||
|
"k8s.io/utils/mount"
|
||||||
|
utilexec "k8s.io/utils/exec"
|
||||||
|
|
||||||
|
"github.com/container-storage-interface/spec/lib/go/csi"
|
||||||
|
"github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
|
||||||
|
"k8s.io/klog"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NodeServer struct of Vitastor CSI driver with supported methods of CSI node server spec.
|
||||||
|
type NodeServer struct
|
||||||
|
{
|
||||||
|
*Driver
|
||||||
|
mounter mount.Interface
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewNodeServer create new instance node
|
||||||
|
func NewNodeServer(driver *Driver) *NodeServer
|
||||||
|
{
|
||||||
|
return &NodeServer{
|
||||||
|
Driver: driver,
|
||||||
|
mounter: mount.New(""),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeStageVolume mounts the volume to a staging path on the node.
|
||||||
|
func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error)
|
||||||
|
{
|
||||||
|
return &csi.NodeStageVolumeResponse{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeUnstageVolume unstages the volume from the staging path
|
||||||
|
func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error)
|
||||||
|
{
|
||||||
|
return &csi.NodeUnstageVolumeResponse{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func Contains(list []string, s string) bool
|
||||||
|
{
|
||||||
|
for i := 0; i < len(list); i++
|
||||||
|
{
|
||||||
|
if (list[i] == s)
|
||||||
|
{
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodePublishVolume mounts the volume mounted to the staging path to the target path
|
||||||
|
func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
|
||||||
|
targetPath := req.GetTargetPath()
|
||||||
|
|
||||||
|
// Check that it's not already mounted
|
||||||
|
free, error := mount.IsNotMountPoint(ns.mounter, targetPath)
|
||||||
|
if (error != nil)
|
||||||
|
{
|
||||||
|
if (os.IsNotExist(error))
|
||||||
|
{
|
||||||
|
error := os.MkdirAll(targetPath, 0777)
|
||||||
|
if (error != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, error.Error())
|
||||||
|
}
|
||||||
|
free = true
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, error.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!free)
|
||||||
|
{
|
||||||
|
return &csi.NodePublishVolumeResponse{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
ctxVars := make(map[string]string)
|
||||||
|
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
|
||||||
|
}
|
||||||
|
volName := ctxVars["name"]
|
||||||
|
|
||||||
|
_, etcdUrl, etcdPrefix := GetConnectionParams(ctxVars)
|
||||||
|
if (len(etcdUrl) == 0)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Map NBD device
|
||||||
|
// FIXME: Check if already mapped
|
||||||
|
args := []string{
|
||||||
|
"map", "--etcd_address", strings.Join(etcdUrl, ","),
|
||||||
|
"--etcd_prefix", etcdPrefix,
|
||||||
|
"--image", volName,
|
||||||
|
};
|
||||||
|
if (ctxVars["configPath"] != "")
|
||||||
|
{
|
||||||
|
args = append(args, "--config_path", ctxVars["configPath"])
|
||||||
|
}
|
||||||
|
if (req.GetReadonly())
|
||||||
|
{
|
||||||
|
args = append(args, "--readonly", "1")
|
||||||
|
}
|
||||||
|
c := exec.Command("/usr/bin/vitastor-nbd", args...)
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
c.Stdout, c.Stderr = &stdout, &stderr
|
||||||
|
err = c.Run()
|
||||||
|
stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("vitastor-nbd map failed: %s, status %s\n", stdoutStr+stderrStr, err)
|
||||||
|
return nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
|
||||||
|
}
|
||||||
|
devicePath := strings.TrimSpace(stdoutStr)
|
||||||
|
|
||||||
|
// Check existing format
|
||||||
|
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
|
||||||
|
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("failed to get disk format for path %s, error: %v", err)
|
||||||
|
// unmap NBD device
|
||||||
|
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
|
||||||
|
if (unmapErr != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Format the device (ext4 or xfs)
|
||||||
|
fsType := req.GetVolumeCapability().GetMount().GetFsType()
|
||||||
|
isBlock := req.GetVolumeCapability().GetBlock() != nil
|
||||||
|
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
|
||||||
|
opt = append(opt, "_netdev")
|
||||||
|
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
|
||||||
|
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
|
||||||
|
!Contains(opt, "ro"))
|
||||||
|
{
|
||||||
|
opt = append(opt, "ro")
|
||||||
|
}
|
||||||
|
if (fsType == "xfs")
|
||||||
|
{
|
||||||
|
opt = append(opt, "nouuid")
|
||||||
|
}
|
||||||
|
readOnly := Contains(opt, "ro")
|
||||||
|
if (existingFormat == "" && !readOnly)
|
||||||
|
{
|
||||||
|
args := []string{}
|
||||||
|
switch fsType
|
||||||
|
{
|
||||||
|
case "ext4":
|
||||||
|
args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
|
||||||
|
case "xfs":
|
||||||
|
args = []string{"-K", devicePath}
|
||||||
|
}
|
||||||
|
if (len(args) > 0)
|
||||||
|
{
|
||||||
|
cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
|
||||||
|
if (cmdErr != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
|
||||||
|
// unmap NBD device
|
||||||
|
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
|
||||||
|
if (unmapErr != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
|
||||||
|
}
|
||||||
|
return nil, status.Error(codes.Internal, cmdErr.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (isBlock)
|
||||||
|
{
|
||||||
|
opt = append(opt, "bind")
|
||||||
|
err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
|
||||||
|
}
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf(
|
||||||
|
"failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
|
||||||
|
devicePath, targetPath, volName, err,
|
||||||
|
)
|
||||||
|
// unmap NBD device
|
||||||
|
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
|
||||||
|
if (unmapErr != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
|
||||||
|
}
|
||||||
|
return nil, status.Error(codes.Internal, err.Error())
|
||||||
|
}
|
||||||
|
return &csi.NodePublishVolumeResponse{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeUnpublishVolume unmounts the volume from the target path
|
||||||
|
func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received node unpublish volume request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
targetPath := req.GetTargetPath()
|
||||||
|
devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
if (os.IsNotExist(err))
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "Target path not found")
|
||||||
|
}
|
||||||
|
return nil, status.Error(codes.Internal, err.Error())
|
||||||
|
}
|
||||||
|
if (devicePath == "")
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.NotFound, "Volume not mounted")
|
||||||
|
}
|
||||||
|
// unmount
|
||||||
|
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Internal, err.Error())
|
||||||
|
}
|
||||||
|
// unmap NBD device
|
||||||
|
if (refCount == 1)
|
||||||
|
{
|
||||||
|
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
|
||||||
|
if (unmapErr != nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return &csi.NodeUnpublishVolumeResponse{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeGetVolumeStats returns volume capacity statistics available for the volume
|
||||||
|
func (ns *NodeServer) NodeGetVolumeStats(ctx context.Context, req *csi.NodeGetVolumeStatsRequest) (*csi.NodeGetVolumeStatsResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeExpandVolume expanding the file system on the node
|
||||||
|
func (ns *NodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpandVolumeRequest) (*csi.NodeExpandVolumeResponse, error)
|
||||||
|
{
|
||||||
|
return nil, status.Error(codes.Unimplemented, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeGetCapabilities returns the supported capabilities of the node server
|
||||||
|
func (ns *NodeServer) NodeGetCapabilities(ctx context.Context, req *csi.NodeGetCapabilitiesRequest) (*csi.NodeGetCapabilitiesResponse, error)
|
||||||
|
{
|
||||||
|
return &csi.NodeGetCapabilitiesResponse{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeGetInfo returns NodeGetInfoResponse for CO.
|
||||||
|
func (ns *NodeServer) NodeGetInfo(ctx context.Context, req *csi.NodeGetInfoRequest) (*csi.NodeGetInfoResponse, error)
|
||||||
|
{
|
||||||
|
klog.Infof("received node get info request %+v", protosanitizer.StripSecrets(req))
|
||||||
|
return &csi.NodeGetInfoResponse{
|
||||||
|
NodeId: ns.NodeID,
|
||||||
|
}, nil
|
||||||
|
}
|
36
csi/src/server.go
Normal file
36
csi/src/server.go
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
package vitastor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"k8s.io/klog"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Driver struct
|
||||||
|
{
|
||||||
|
*Config
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDriver create new instance driver
|
||||||
|
func NewDriver(config *Config) (*Driver, error)
|
||||||
|
{
|
||||||
|
if (config == nil)
|
||||||
|
{
|
||||||
|
klog.Errorf("Vitastor CSI driver initialization failed")
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
driver := &Driver{
|
||||||
|
Config: config,
|
||||||
|
}
|
||||||
|
klog.Infof("Vitastor CSI driver initialized")
|
||||||
|
return driver, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start server
|
||||||
|
func (driver *Driver) Run()
|
||||||
|
{
|
||||||
|
server := NewNonBlockingGRPCServer()
|
||||||
|
server.Start(driver.Endpoint, NewIdentityServer(driver), NewControllerServer(driver), NewNodeServer(driver))
|
||||||
|
server.Wait()
|
||||||
|
}
|
39
csi/vitastor-csi.go
Normal file
39
csi/vitastor-csi.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"k8s.io/klog"
|
||||||
|
"vitastor.io/csi/src"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main()
|
||||||
|
{
|
||||||
|
var config = vitastor.NewConfig()
|
||||||
|
flag.StringVar(&config.Endpoint, "endpoint", "", "CSI endpoint")
|
||||||
|
flag.StringVar(&config.NodeID, "node", "", "Node ID")
|
||||||
|
flag.Parse()
|
||||||
|
if (config.Endpoint == "")
|
||||||
|
{
|
||||||
|
config.Endpoint = os.Getenv("CSI_ENDPOINT")
|
||||||
|
}
|
||||||
|
if (config.NodeID == "")
|
||||||
|
{
|
||||||
|
config.NodeID = os.Getenv("NODE_ID")
|
||||||
|
}
|
||||||
|
if (config.Endpoint == "" && config.NodeID == "")
|
||||||
|
{
|
||||||
|
fmt.Fprintf(os.Stderr, "Please set -endpoint and -node / CSI_ENDPOINT & NODE_ID env vars\n")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
drv, err := vitastor.NewDriver(config)
|
||||||
|
if (err != nil)
|
||||||
|
{
|
||||||
|
klog.Fatalln(err)
|
||||||
|
}
|
||||||
|
drv.Run()
|
||||||
|
}
|
14
debian/changelog
vendored
14
debian/changelog
vendored
@@ -1,8 +1,18 @@
|
|||||||
vitastor (0.6.2-1) unstable; urgency=medium
|
vitastor (0.6.4-1) unstable; urgency=medium
|
||||||
|
|
||||||
|
* RDMA support
|
||||||
* Bugfixes
|
* Bugfixes
|
||||||
|
|
||||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Tue, 02 Feb 2021 23:01:24 +0300
|
-- Vitaliy Filippov <vitalif@yourcmc.ru> Sat, 01 May 2021 18:46:10 +0300
|
||||||
|
|
||||||
|
vitastor (0.6.0-1) unstable; urgency=medium
|
||||||
|
|
||||||
|
* Snapshots and Copy-on-Write clones
|
||||||
|
* Image metadata in etcd (name, size)
|
||||||
|
* Image I/O and space statistics in etcd
|
||||||
|
* Write throttling for smoothing random write workloads in SSD+HDD configurations
|
||||||
|
|
||||||
|
-- Vitaliy Filippov <vitalif@yourcmc.ru> Sun, 11 Apr 2021 00:49:18 +0300
|
||||||
|
|
||||||
vitastor (0.5.1-1) unstable; urgency=medium
|
vitastor (0.5.1-1) unstable; urgency=medium
|
||||||
|
|
||||||
|
2
debian/control
vendored
2
debian/control
vendored
@@ -2,7 +2,7 @@ Source: vitastor
|
|||||||
Section: admin
|
Section: admin
|
||||||
Priority: optional
|
Priority: optional
|
||||||
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
|
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||||
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev
|
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev
|
||||||
Standards-Version: 4.5.0
|
Standards-Version: 4.5.0
|
||||||
Homepage: https://vitastor.io/
|
Homepage: https://vitastor.io/
|
||||||
Rules-Requires-Root: no
|
Rules-Requires-Root: no
|
||||||
|
14
debian/vitastor.Dockerfile
vendored
14
debian/vitastor.Dockerfile
vendored
@@ -22,7 +22,7 @@ RUN apt-get -y build-dep qemu
|
|||||||
RUN apt-get -y build-dep fio
|
RUN apt-get -y build-dep fio
|
||||||
RUN apt-get --download-only source qemu
|
RUN apt-get --download-only source qemu
|
||||||
RUN apt-get --download-only source fio
|
RUN apt-get --download-only source fio
|
||||||
RUN apt-get -y install libjerasure-dev cmake
|
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev
|
||||||
|
|
||||||
ADD . /root/vitastor
|
ADD . /root/vitastor
|
||||||
RUN set -e -x; \
|
RUN set -e -x; \
|
||||||
@@ -40,10 +40,10 @@ RUN set -e -x; \
|
|||||||
mkdir -p /root/packages/vitastor-$REL; \
|
mkdir -p /root/packages/vitastor-$REL; \
|
||||||
rm -rf /root/packages/vitastor-$REL/*; \
|
rm -rf /root/packages/vitastor-$REL/*; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
cp -r /root/vitastor vitastor-0.6.2; \
|
cp -r /root/vitastor vitastor-0.6.4; \
|
||||||
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.2/qemu; \
|
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.4/qemu; \
|
||||||
ln -s /root/fio-build/fio-*/ vitastor-0.6.2/fio; \
|
ln -s /root/fio-build/fio-*/ vitastor-0.6.4/fio; \
|
||||||
cd vitastor-0.6.2; \
|
cd vitastor-0.6.4; \
|
||||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
sh copy-qemu-includes.sh; \
|
sh copy-qemu-includes.sh; \
|
||||||
@@ -59,8 +59,8 @@ RUN set -e -x; \
|
|||||||
echo "dep:fio=$FIO" > debian/substvars; \
|
echo "dep:fio=$FIO" > debian/substvars; \
|
||||||
echo "dep:qemu=$QEMU" >> debian/substvars; \
|
echo "dep:qemu=$QEMU" >> debian/substvars; \
|
||||||
cd /root/packages/vitastor-$REL; \
|
cd /root/packages/vitastor-$REL; \
|
||||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.2.orig.tar.xz vitastor-0.6.2; \
|
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.4.orig.tar.xz vitastor-0.6.4; \
|
||||||
cd vitastor-0.6.2; \
|
cd vitastor-0.6.4; \
|
||||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||||
|
16
mon/mon.js
16
mon/mon.js
@@ -41,6 +41,12 @@ const etcd_allow = new RegExp('^'+[
|
|||||||
const etcd_tree = {
|
const etcd_tree = {
|
||||||
config: {
|
config: {
|
||||||
/* global: {
|
/* global: {
|
||||||
|
// WARNING: NOT ALL OF THESE ARE ACTUALLY CONFIGURABLE HERE
|
||||||
|
// THIS IS JUST A POOR'S MAN CONFIG DOCUMENTATION
|
||||||
|
// etcd connection
|
||||||
|
config_path: "/etc/vitastor/vitastor.conf",
|
||||||
|
etcd_address: "10.0.115.10:2379/v3",
|
||||||
|
etcd_prefix: "/vitastor",
|
||||||
// mon
|
// mon
|
||||||
etcd_mon_ttl: 30, // min: 10
|
etcd_mon_ttl: 30, // min: 10
|
||||||
etcd_mon_timeout: 1000, // ms. min: 0
|
etcd_mon_timeout: 1000, // ms. min: 0
|
||||||
@@ -50,7 +56,17 @@ const etcd_tree = {
|
|||||||
osd_out_time: 600, // seconds. min: 0
|
osd_out_time: 600, // seconds. min: 0
|
||||||
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
||||||
// client and osd
|
// client and osd
|
||||||
|
tcp_header_buffer_size: 65536,
|
||||||
use_sync_send_recv: false,
|
use_sync_send_recv: false,
|
||||||
|
use_rdma: true,
|
||||||
|
rdma_device: null, // for example, "rocep5s0f0"
|
||||||
|
rdma_port_num: 1,
|
||||||
|
rdma_gid_index: 0,
|
||||||
|
rdma_mtu: 4096,
|
||||||
|
rdma_max_sge: 128,
|
||||||
|
rdma_max_send: 32,
|
||||||
|
rdma_max_recv: 8,
|
||||||
|
rdma_max_msg: 1048576,
|
||||||
log_level: 0,
|
log_level: 0,
|
||||||
block_size: 131072,
|
block_size: 131072,
|
||||||
disk_alignment: 4096,
|
disk_alignment: 4096,
|
||||||
|
@@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
|
|||||||
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||||
tar --transform 's#^#vitastor-0.6.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.2$(rpm --eval '%dist').tar.gz *
|
tar --transform 's#^#vitastor-0.6.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.4$(rpm --eval '%dist').tar.gz *
|
||||||
|
@@ -17,6 +17,7 @@ RUN rpm --nomd5 -i fio*.src.rpm
|
|||||||
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
|
||||||
RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing qemu-kvm.spec
|
RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing qemu-kvm.spec
|
||||||
RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing fio.spec
|
RUN cd ~/rpmbuild/SPECS && yum-builddep -y --enablerepo='*' --disablerepo=centos-sclo-rh --disablerepo=centos-sclo-rh-source --disablerepo=centos-sclo-sclo-testing fio.spec
|
||||||
|
RUN yum -y install rdma-core-devel
|
||||||
|
|
||||||
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
||||||
|
|
||||||
@@ -37,7 +38,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.6.2.el7.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.6.4.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.6.2
|
Version: 0.6.4
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.6.2.el7.tar.gz
|
Source0: vitastor-0.6.4.el7.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
@@ -14,6 +14,7 @@ BuildRequires: rh-nodejs12
|
|||||||
BuildRequires: rh-nodejs12-npm
|
BuildRequires: rh-nodejs12-npm
|
||||||
BuildRequires: jerasure-devel
|
BuildRequires: jerasure-devel
|
||||||
BuildRequires: gf-complete-devel
|
BuildRequires: gf-complete-devel
|
||||||
|
BuildRequires: libibverbs-devel
|
||||||
BuildRequires: cmake
|
BuildRequires: cmake
|
||||||
Requires: fio = 3.7-1.el7
|
Requires: fio = 3.7-1.el7
|
||||||
Requires: qemu-kvm = 2.0.0-1.el7.6
|
Requires: qemu-kvm = 2.0.0-1.el7.6
|
||||||
@@ -61,8 +62,8 @@ cp -r mon %buildroot/usr/lib/vitastor/mon
|
|||||||
%_libdir/libfio_vitastor.so
|
%_libdir/libfio_vitastor.so
|
||||||
%_libdir/libfio_vitastor_blk.so
|
%_libdir/libfio_vitastor_blk.so
|
||||||
%_libdir/libfio_vitastor_sec.so
|
%_libdir/libfio_vitastor_sec.so
|
||||||
%_libdir/libvitastor_blk.so
|
%_libdir/libvitastor_blk.so*
|
||||||
%_libdir/libvitastor_client.so
|
%_libdir/libvitastor_client.so*
|
||||||
/usr/lib/vitastor
|
/usr/lib/vitastor
|
||||||
|
|
||||||
|
|
||||||
|
@@ -15,6 +15,7 @@ RUN rpm --nomd5 -i qemu*.src.rpm
|
|||||||
RUN rpm --nomd5 -i fio*.src.rpm
|
RUN rpm --nomd5 -i fio*.src.rpm
|
||||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec qemu-kvm.spec
|
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec qemu-kvm.spec
|
||||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec && dnf install -y cmake
|
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec && dnf install -y cmake
|
||||||
|
RUN yum -y install libibverbs-devel
|
||||||
|
|
||||||
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
|
||||||
|
|
||||||
@@ -35,7 +36,7 @@ ADD . /root/vitastor
|
|||||||
RUN set -e; \
|
RUN set -e; \
|
||||||
cd /root/vitastor/rpm; \
|
cd /root/vitastor/rpm; \
|
||||||
sh build-tarball.sh; \
|
sh build-tarball.sh; \
|
||||||
cp /root/vitastor-0.6.2.el8.tar.gz ~/rpmbuild/SOURCES; \
|
cp /root/vitastor-0.6.4.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||||
cd ~/rpmbuild/SPECS/; \
|
cd ~/rpmbuild/SPECS/; \
|
||||||
rpmbuild -ba vitastor.spec; \
|
rpmbuild -ba vitastor.spec; \
|
||||||
|
@@ -1,11 +1,11 @@
|
|||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 0.6.2
|
Version: 0.6.4
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-0.6.2.el8.tar.gz
|
Source0: vitastor-0.6.4.el8.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
@@ -13,6 +13,7 @@ BuildRequires: gcc-toolset-9-gcc-c++
|
|||||||
BuildRequires: nodejs >= 10
|
BuildRequires: nodejs >= 10
|
||||||
BuildRequires: jerasure-devel
|
BuildRequires: jerasure-devel
|
||||||
BuildRequires: gf-complete-devel
|
BuildRequires: gf-complete-devel
|
||||||
|
BuildRequires: libibverbs-devel
|
||||||
BuildRequires: cmake
|
BuildRequires: cmake
|
||||||
Requires: fio = 3.7-3.el8
|
Requires: fio = 3.7-3.el8
|
||||||
Requires: qemu-kvm = 4.2.0-29.el8.6
|
Requires: qemu-kvm = 4.2.0-29.el8.6
|
||||||
@@ -58,8 +59,8 @@ cp -r mon %buildroot/usr/lib/vitastor
|
|||||||
%_libdir/libfio_vitastor.so
|
%_libdir/libfio_vitastor.so
|
||||||
%_libdir/libfio_vitastor_blk.so
|
%_libdir/libfio_vitastor_blk.so
|
||||||
%_libdir/libfio_vitastor_sec.so
|
%_libdir/libfio_vitastor_sec.so
|
||||||
%_libdir/libvitastor_blk.so
|
%_libdir/libvitastor_blk.so*
|
||||||
%_libdir/libvitastor_client.so
|
%_libdir/libvitastor_client.so*
|
||||||
/usr/lib/vitastor
|
/usr/lib/vitastor
|
||||||
|
|
||||||
|
|
||||||
|
@@ -13,7 +13,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
|||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVERSION="0.6.2")
|
add_definitions(-DVERSION="0.6.4")
|
||||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||||
@@ -37,11 +37,16 @@ string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CM
|
|||||||
find_package(PkgConfig)
|
find_package(PkgConfig)
|
||||||
pkg_check_modules(LIBURING REQUIRED liburing)
|
pkg_check_modules(LIBURING REQUIRED liburing)
|
||||||
pkg_check_modules(GLIB REQUIRED glib-2.0)
|
pkg_check_modules(GLIB REQUIRED glib-2.0)
|
||||||
|
pkg_check_modules(IBVERBS libibverbs)
|
||||||
|
if (IBVERBS_LIBRARIES)
|
||||||
|
add_definitions(-DWITH_RDMA)
|
||||||
|
endif (IBVERBS_LIBRARIES)
|
||||||
|
|
||||||
include_directories(
|
include_directories(
|
||||||
../
|
../
|
||||||
/usr/include/jerasure
|
/usr/include/jerasure
|
||||||
${LIBURING_INCLUDE_DIRS}
|
${LIBURING_INCLUDE_DIRS}
|
||||||
|
${IBVERBS_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
|
|
||||||
# libvitastor_blk.so
|
# libvitastor_blk.so
|
||||||
@@ -52,7 +57,10 @@ add_library(vitastor_blk SHARED
|
|||||||
target_link_libraries(vitastor_blk
|
target_link_libraries(vitastor_blk
|
||||||
${LIBURING_LIBRARIES}
|
${LIBURING_LIBRARIES}
|
||||||
tcmalloc_minimal
|
tcmalloc_minimal
|
||||||
|
# for timerfd_manager
|
||||||
|
vitastor_common
|
||||||
)
|
)
|
||||||
|
set_target_properties(vitastor_blk PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||||
|
|
||||||
# libfio_vitastor_blk.so
|
# libfio_vitastor_blk.so
|
||||||
add_library(fio_vitastor_blk SHARED
|
add_library(fio_vitastor_blk SHARED
|
||||||
@@ -64,10 +72,14 @@ target_link_libraries(fio_vitastor_blk
|
|||||||
)
|
)
|
||||||
|
|
||||||
# libvitastor_common.a
|
# libvitastor_common.a
|
||||||
|
set(MSGR_RDMA "")
|
||||||
|
if (IBVERBS_LIBRARIES)
|
||||||
|
set(MSGR_RDMA "msgr_rdma.cpp")
|
||||||
|
endif (IBVERBS_LIBRARIES)
|
||||||
add_library(vitastor_common STATIC
|
add_library(vitastor_common STATIC
|
||||||
epoll_manager.cpp etcd_state_client.cpp
|
epoll_manager.cpp etcd_state_client.cpp
|
||||||
messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
||||||
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
|
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp ${MSGR_RDMA}
|
||||||
)
|
)
|
||||||
target_compile_options(vitastor_common PUBLIC -fPIC)
|
target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||||
|
|
||||||
@@ -81,6 +93,7 @@ target_link_libraries(vitastor-osd
|
|||||||
vitastor_common
|
vitastor_common
|
||||||
vitastor_blk
|
vitastor_blk
|
||||||
Jerasure
|
Jerasure
|
||||||
|
${IBVERBS_LIBRARIES}
|
||||||
)
|
)
|
||||||
|
|
||||||
# libfio_vitastor_sec.so
|
# libfio_vitastor_sec.so
|
||||||
@@ -100,7 +113,9 @@ target_link_libraries(vitastor_client
|
|||||||
vitastor_common
|
vitastor_common
|
||||||
tcmalloc_minimal
|
tcmalloc_minimal
|
||||||
${LIBURING_LIBRARIES}
|
${LIBURING_LIBRARIES}
|
||||||
|
${IBVERBS_LIBRARIES}
|
||||||
)
|
)
|
||||||
|
set_target_properties(vitastor_client PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||||
|
|
||||||
# libfio_vitastor.so
|
# libfio_vitastor.so
|
||||||
add_library(fio_vitastor SHARED
|
add_library(fio_vitastor SHARED
|
||||||
@@ -174,6 +189,7 @@ add_executable(stub_uring_osd
|
|||||||
target_link_libraries(stub_uring_osd
|
target_link_libraries(stub_uring_osd
|
||||||
vitastor_common
|
vitastor_common
|
||||||
${LIBURING_LIBRARIES}
|
${LIBURING_LIBRARIES}
|
||||||
|
${IBVERBS_LIBRARIES}
|
||||||
tcmalloc_minimal
|
tcmalloc_minimal
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -43,11 +43,6 @@ int blockstore_t::read_bitmap(object_id oid, uint64_t target_version, void *bitm
|
|||||||
return impl->read_bitmap(oid, target_version, bitmap, result_version);
|
return impl->read_bitmap(oid, target_version, bitmap, result_version);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unordered_map<object_id, uint64_t> & blockstore_t::get_unstable_writes()
|
|
||||||
{
|
|
||||||
return impl->unstable_writes;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
|
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
|
||||||
{
|
{
|
||||||
return impl->inode_space_stats;
|
return impl->inode_space_stats;
|
||||||
|
@@ -183,9 +183,6 @@ public:
|
|||||||
// Simplified synchronous operation: get object bitmap & current version
|
// Simplified synchronous operation: get object bitmap & current version
|
||||||
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
|
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
|
||||||
|
|
||||||
// Unstable writes are added here (map of object_id -> version)
|
|
||||||
std::unordered_map<object_id, uint64_t> & get_unstable_writes();
|
|
||||||
|
|
||||||
// Get per-inode space usage statistics
|
// Get per-inode space usage statistics
|
||||||
std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
std::map<uint64_t, uint64_t> & get_inode_space_stats();
|
||||||
|
|
||||||
|
@@ -16,6 +16,8 @@
|
|||||||
|
|
||||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
||||||
{
|
{
|
||||||
|
config = osd_messenger_t::read_config(config);
|
||||||
|
|
||||||
this->ringloop = ringloop;
|
this->ringloop = ringloop;
|
||||||
this->tfd = tfd;
|
this->tfd = tfd;
|
||||||
this->config = config;
|
this->config = config;
|
||||||
@@ -49,10 +51,11 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
|||||||
msgr.exec_op = [this](osd_op_t *op)
|
msgr.exec_op = [this](osd_op_t *op)
|
||||||
{
|
{
|
||||||
// Garbage in
|
// Garbage in
|
||||||
printf("Incoming garbage from peer %d\n", op->peer_fd);
|
fprintf(stderr, "Incoming garbage from peer %d\n", op->peer_fd);
|
||||||
msgr.stop_client(op->peer_fd);
|
msgr.stop_client(op->peer_fd);
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
|
msgr.parse_config(this->config);
|
||||||
msgr.init();
|
msgr.init();
|
||||||
|
|
||||||
st_cli.tfd = tfd;
|
st_cli.tfd = tfd;
|
||||||
@@ -108,6 +111,115 @@ cluster_op_t::~cluster_op_t()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cluster_client_t::calc_wait(cluster_op_t *op)
|
||||||
|
{
|
||||||
|
op->prev_wait = 0;
|
||||||
|
if (op->opcode == OSD_OP_WRITE)
|
||||||
|
{
|
||||||
|
for (auto prev = op->prev; prev; prev = prev->prev)
|
||||||
|
{
|
||||||
|
if (prev->opcode == OSD_OP_SYNC ||
|
||||||
|
prev->opcode == OSD_OP_WRITE && !(op->flags & OP_FLUSH_BUFFER) && (prev->flags & OP_FLUSH_BUFFER))
|
||||||
|
{
|
||||||
|
op->prev_wait++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!op->prev_wait && pgs_loaded)
|
||||||
|
continue_rw(op);
|
||||||
|
}
|
||||||
|
else if (op->opcode == OSD_OP_SYNC)
|
||||||
|
{
|
||||||
|
for (auto prev = op->prev; prev; prev = prev->prev)
|
||||||
|
{
|
||||||
|
if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE)
|
||||||
|
{
|
||||||
|
op->prev_wait++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!op->prev_wait && pgs_loaded)
|
||||||
|
continue_sync(op);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (auto prev = op->prev; prev; prev = prev->prev)
|
||||||
|
{
|
||||||
|
if (prev->opcode == OSD_OP_WRITE && prev->flags & OP_FLUSH_BUFFER)
|
||||||
|
{
|
||||||
|
op->prev_wait++;
|
||||||
|
}
|
||||||
|
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ)
|
||||||
|
{
|
||||||
|
// Flushes are always in the beginning
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!op->prev_wait && pgs_loaded)
|
||||||
|
continue_rw(op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
|
||||||
|
{
|
||||||
|
if (opcode == OSD_OP_WRITE)
|
||||||
|
{
|
||||||
|
while (next)
|
||||||
|
{
|
||||||
|
auto n2 = next->next;
|
||||||
|
if (next->opcode == OSD_OP_SYNC ||
|
||||||
|
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||||
|
next->opcode == OSD_OP_READ && (flags & OP_FLUSH_BUFFER))
|
||||||
|
{
|
||||||
|
next->prev_wait += inc;
|
||||||
|
if (!next->prev_wait)
|
||||||
|
{
|
||||||
|
if (next->opcode == OSD_OP_SYNC)
|
||||||
|
continue_sync(next);
|
||||||
|
else
|
||||||
|
continue_rw(next);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
next = n2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (opcode == OSD_OP_SYNC)
|
||||||
|
{
|
||||||
|
while (next)
|
||||||
|
{
|
||||||
|
auto n2 = next->next;
|
||||||
|
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
|
||||||
|
{
|
||||||
|
next->prev_wait += inc;
|
||||||
|
if (!next->prev_wait)
|
||||||
|
{
|
||||||
|
if (next->opcode == OSD_OP_SYNC)
|
||||||
|
continue_sync(next);
|
||||||
|
else
|
||||||
|
continue_rw(next);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
next = n2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void cluster_client_t::erase_op(cluster_op_t *op)
|
||||||
|
{
|
||||||
|
uint64_t opcode = op->opcode, flags = op->flags;
|
||||||
|
cluster_op_t *next = op->next;
|
||||||
|
if (op->prev)
|
||||||
|
op->prev->next = op->next;
|
||||||
|
if (op->next)
|
||||||
|
op->next->prev = op->prev;
|
||||||
|
if (op_queue_head == op)
|
||||||
|
op_queue_head = op->next;
|
||||||
|
if (op_queue_tail == op)
|
||||||
|
op_queue_tail = op->prev;
|
||||||
|
op->next = op->prev = NULL;
|
||||||
|
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||||
|
if (!immediate_commit)
|
||||||
|
inc_wait(opcode, flags, next, -1);
|
||||||
|
}
|
||||||
|
|
||||||
void cluster_client_t::continue_ops(bool up_retry)
|
void cluster_client_t::continue_ops(bool up_retry)
|
||||||
{
|
{
|
||||||
if (!pgs_loaded)
|
if (!pgs_loaded)
|
||||||
@@ -118,60 +230,25 @@ void cluster_client_t::continue_ops(bool up_retry)
|
|||||||
if (continuing_ops)
|
if (continuing_ops)
|
||||||
{
|
{
|
||||||
// Attempt to reenter the function
|
// Attempt to reenter the function
|
||||||
continuing_ops = 2;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
restart:
|
restart:
|
||||||
continuing_ops = 1;
|
continuing_ops = 1;
|
||||||
op_queue_pos = 0;
|
for (auto op = op_queue_head; op; )
|
||||||
bool has_flushes = false, has_writes = false;
|
|
||||||
while (op_queue_pos < op_queue.size())
|
|
||||||
{
|
{
|
||||||
auto op = op_queue[op_queue_pos];
|
cluster_op_t *next_op = op->next;
|
||||||
bool rm = false, is_flush = op->flags & OP_FLUSH_BUFFER;
|
|
||||||
auto opcode = op->opcode;
|
|
||||||
if (!op->up_wait || up_retry)
|
if (!op->up_wait || up_retry)
|
||||||
{
|
{
|
||||||
op->up_wait = false;
|
op->up_wait = false;
|
||||||
if (opcode == OSD_OP_READ || opcode == OSD_OP_WRITE)
|
if (!op->prev_wait)
|
||||||
{
|
{
|
||||||
if (is_flush || !has_flushes)
|
if (op->opcode == OSD_OP_SYNC)
|
||||||
{
|
continue_sync(op);
|
||||||
// Regular writes can't proceed before buffer flushes
|
else
|
||||||
rm = continue_rw(op);
|
continue_rw(op);
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (opcode == OSD_OP_SYNC)
|
|
||||||
{
|
|
||||||
if (!has_writes)
|
|
||||||
{
|
|
||||||
// SYNC can't proceed before previous writes
|
|
||||||
rm = continue_sync(op);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (opcode == OSD_OP_WRITE)
|
op = next_op;
|
||||||
{
|
|
||||||
has_writes = has_writes || !rm;
|
|
||||||
if (is_flush)
|
|
||||||
{
|
|
||||||
has_flushes = has_writes || !rm;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (opcode == OSD_OP_SYNC)
|
|
||||||
{
|
|
||||||
// Postpone writes until previous SYNC completes
|
|
||||||
// ...so dirty_writes can't contain anything newer than SYNC
|
|
||||||
has_flushes = has_writes || !rm;
|
|
||||||
}
|
|
||||||
if (rm)
|
|
||||||
{
|
|
||||||
op_queue.erase(op_queue.begin()+op_queue_pos, op_queue.begin()+op_queue_pos+1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
op_queue_pos++;
|
|
||||||
}
|
|
||||||
if (continuing_ops == 2)
|
if (continuing_ops == 2)
|
||||||
{
|
{
|
||||||
goto restart;
|
goto restart;
|
||||||
@@ -213,11 +290,8 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
|||||||
{
|
{
|
||||||
throw std::runtime_error("Bad block size");
|
throw std::runtime_error("Bad block size");
|
||||||
}
|
}
|
||||||
if (config["immediate_commit"] == "all")
|
// Cluster-wide immediate_commit mode
|
||||||
{
|
immediate_commit = (config["immediate_commit"] == "all");
|
||||||
// Cluster-wide immediate_commit mode
|
|
||||||
immediate_commit = true;
|
|
||||||
}
|
|
||||||
if (config.find("client_max_dirty_bytes") != config.end())
|
if (config.find("client_max_dirty_bytes") != config.end())
|
||||||
{
|
{
|
||||||
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
||||||
@@ -281,7 +355,7 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
|||||||
{
|
{
|
||||||
// At this point, all pool operations should have been suspended
|
// At this point, all pool operations should have been suspended
|
||||||
// And now they have to be resliced!
|
// And now they have to be resliced!
|
||||||
for (auto op: op_queue)
|
for (auto op = op_queue_head; op; op = op->next)
|
||||||
{
|
{
|
||||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
|
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
|
||||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||||
@@ -362,9 +436,17 @@ void cluster_client_t::execute(cluster_op_t *op)
|
|||||||
{
|
{
|
||||||
delete sync_op;
|
delete sync_op;
|
||||||
};
|
};
|
||||||
op_queue.push_back(sync_op);
|
sync_op->prev = op_queue_tail;
|
||||||
|
if (op_queue_tail)
|
||||||
|
{
|
||||||
|
op_queue_tail->next = sync_op;
|
||||||
|
op_queue_tail = sync_op;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
op_queue_tail = op_queue_head = sync_op;
|
||||||
dirty_bytes = 0;
|
dirty_bytes = 0;
|
||||||
dirty_ops = 0;
|
dirty_ops = 0;
|
||||||
|
calc_wait(sync_op);
|
||||||
}
|
}
|
||||||
dirty_bytes += op->len;
|
dirty_bytes += op->len;
|
||||||
dirty_ops++;
|
dirty_ops++;
|
||||||
@@ -374,8 +456,23 @@ void cluster_client_t::execute(cluster_op_t *op)
|
|||||||
dirty_bytes = 0;
|
dirty_bytes = 0;
|
||||||
dirty_ops = 0;
|
dirty_ops = 0;
|
||||||
}
|
}
|
||||||
op_queue.push_back(op);
|
op->prev = op_queue_tail;
|
||||||
continue_ops();
|
if (op_queue_tail)
|
||||||
|
{
|
||||||
|
op_queue_tail->next = op;
|
||||||
|
op_queue_tail = op;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
op_queue_tail = op_queue_head = op;
|
||||||
|
if (!immediate_commit)
|
||||||
|
calc_wait(op);
|
||||||
|
else if (pgs_loaded)
|
||||||
|
{
|
||||||
|
if (op->opcode == OSD_OP_SYNC)
|
||||||
|
continue_sync(op);
|
||||||
|
else
|
||||||
|
continue_rw(op);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
|
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
|
||||||
@@ -474,12 +571,16 @@ void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
|
|||||||
}
|
}
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
op_queue.insert(op_queue.begin(), op);
|
op->next = op_queue_head;
|
||||||
if (continuing_ops)
|
if (op_queue_head)
|
||||||
{
|
{
|
||||||
continuing_ops = 2;
|
op_queue_head->prev = op;
|
||||||
op_queue_pos++;
|
op_queue_head = op;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
op_queue_tail = op_queue_head = op;
|
||||||
|
inc_wait(op->opcode, op->flags, op->next, 1);
|
||||||
|
continue_rw(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
int cluster_client_t::continue_rw(cluster_op_t *op)
|
int cluster_client_t::continue_rw(cluster_op_t *op)
|
||||||
@@ -496,7 +597,7 @@ resume_0:
|
|||||||
if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
|
if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@@ -504,7 +605,7 @@ resume_0:
|
|||||||
if (!pool_id)
|
if (!pool_id)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
|
if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
|
||||||
@@ -520,7 +621,7 @@ resume_0:
|
|||||||
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
|
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
|
||||||
{
|
{
|
||||||
op->retval = -EINVAL;
|
op->retval = -EINVAL;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
||||||
@@ -603,13 +704,13 @@ resume_3:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
op->retval = op->len;
|
op->retval = op->len;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
else if (op->retval != 0 && op->retval != -EPIPE)
|
else if (op->retval != 0 && op->retval != -EPIPE)
|
||||||
{
|
{
|
||||||
// Fatal error (not -EPIPE)
|
// Fatal error (not -EPIPE)
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -849,17 +950,18 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
|
|||||||
{
|
{
|
||||||
// Sync is not required in the immediate_commit mode or if there are no dirty_osds
|
// Sync is not required in the immediate_commit mode or if there are no dirty_osds
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
// Check that all OSD connections are still alive
|
// Check that all OSD connections are still alive
|
||||||
for (auto sync_osd: dirty_osds)
|
for (auto do_it = dirty_osds.begin(); do_it != dirty_osds.end(); )
|
||||||
{
|
{
|
||||||
|
osd_num_t sync_osd = *do_it;
|
||||||
auto peer_it = msgr.osd_peer_fds.find(sync_osd);
|
auto peer_it = msgr.osd_peer_fds.find(sync_osd);
|
||||||
if (peer_it == msgr.osd_peer_fds.end())
|
if (peer_it == msgr.osd_peer_fds.end())
|
||||||
{
|
dirty_osds.erase(do_it++);
|
||||||
return 0;
|
else
|
||||||
}
|
do_it++;
|
||||||
}
|
}
|
||||||
// Post sync to affected OSDs
|
// Post sync to affected OSDs
|
||||||
for (auto & prev_op: dirty_buffers)
|
for (auto & prev_op: dirty_buffers)
|
||||||
@@ -924,7 +1026,7 @@ resume_1:
|
|||||||
uw_it++;
|
uw_it++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
erase_op(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -970,8 +1072,8 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
if (part->op.reply.hdr.retval != expected)
|
if (part->op.reply.hdr.retval != expected)
|
||||||
{
|
{
|
||||||
// Operation failed, retry
|
// Operation failed, retry
|
||||||
printf(
|
fprintf(
|
||||||
"%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||||
);
|
);
|
||||||
if (part->op.reply.hdr.retval == -EPIPE)
|
if (part->op.reply.hdr.retval == -EPIPE)
|
||||||
@@ -1008,7 +1110,10 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
|||||||
}
|
}
|
||||||
if (op->inflight_count == 0)
|
if (op->inflight_count == 0)
|
||||||
{
|
{
|
||||||
continue_ops();
|
if (op->opcode == OSD_OP_SYNC)
|
||||||
|
continue_sync(op);
|
||||||
|
else
|
||||||
|
continue_rw(op);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -36,7 +36,7 @@ struct cluster_op_t
|
|||||||
std::function<void(cluster_op_t*)> callback;
|
std::function<void(cluster_op_t*)> callback;
|
||||||
~cluster_op_t();
|
~cluster_op_t();
|
||||||
protected:
|
protected:
|
||||||
int flags = 0;
|
uint64_t flags = 0;
|
||||||
int state = 0;
|
int state = 0;
|
||||||
uint64_t cur_inode; // for snapshot reads
|
uint64_t cur_inode; // for snapshot reads
|
||||||
void *buf = NULL;
|
void *buf = NULL;
|
||||||
@@ -47,6 +47,8 @@ protected:
|
|||||||
std::vector<cluster_op_part_t> parts;
|
std::vector<cluster_op_part_t> parts;
|
||||||
void *bitmap_buf = NULL, *part_bitmaps = NULL;
|
void *bitmap_buf = NULL, *part_bitmaps = NULL;
|
||||||
unsigned bitmap_buf_size = 0;
|
unsigned bitmap_buf_size = 0;
|
||||||
|
cluster_op_t *prev = NULL, *next = NULL;
|
||||||
|
int prev_wait = 0;
|
||||||
friend class cluster_client_t;
|
friend class cluster_client_t;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -66,7 +68,8 @@ class cluster_client_t
|
|||||||
uint64_t bs_block_size = 0;
|
uint64_t bs_block_size = 0;
|
||||||
uint32_t bs_bitmap_granularity = 0, bs_bitmap_size = 0;
|
uint32_t bs_bitmap_granularity = 0, bs_bitmap_size = 0;
|
||||||
std::map<pool_id_t, uint64_t> pg_counts;
|
std::map<pool_id_t, uint64_t> pg_counts;
|
||||||
bool immediate_commit = false;
|
// WARNING: initially true so execute() doesn't create fake sync
|
||||||
|
bool immediate_commit = true;
|
||||||
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
|
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
|
||||||
uint64_t client_max_dirty_bytes = 0;
|
uint64_t client_max_dirty_bytes = 0;
|
||||||
uint64_t client_max_dirty_ops = 0;
|
uint64_t client_max_dirty_ops = 0;
|
||||||
@@ -76,7 +79,7 @@ class cluster_client_t
|
|||||||
int retry_timeout_id = 0;
|
int retry_timeout_id = 0;
|
||||||
uint64_t op_id = 1;
|
uint64_t op_id = 1;
|
||||||
std::vector<cluster_op_t*> offline_ops;
|
std::vector<cluster_op_t*> offline_ops;
|
||||||
std::vector<cluster_op_t*> op_queue;
|
cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
|
||||||
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
||||||
std::set<osd_num_t> dirty_osds;
|
std::set<osd_num_t> dirty_osds;
|
||||||
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
||||||
@@ -88,7 +91,6 @@ class cluster_client_t
|
|||||||
ring_consumer_t consumer;
|
ring_consumer_t consumer;
|
||||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||||
int continuing_ops = 0;
|
int continuing_ops = 0;
|
||||||
int op_queue_pos = 0;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
etcd_state_client_t st_cli;
|
etcd_state_client_t st_cli;
|
||||||
@@ -117,4 +119,7 @@ protected:
|
|||||||
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
||||||
void handle_op_part(cluster_op_part_t *part);
|
void handle_op_part(cluster_op_part_t *part);
|
||||||
void copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part);
|
void copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part);
|
||||||
|
void erase_op(cluster_op_t *op);
|
||||||
|
void calc_wait(cluster_op_t *op);
|
||||||
|
void inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc);
|
||||||
};
|
};
|
||||||
|
@@ -35,7 +35,7 @@ etcd_kv_t etcd_state_client_t::parse_etcd_kv(const json11::Json & kv_json)
|
|||||||
kv.value = json_text == "" ? json11::Json() : json11::Json::parse(json_text, json_err);
|
kv.value = json_text == "" ? json11::Json() : json11::Json::parse(json_text, json_err);
|
||||||
if (json_err != "")
|
if (json_err != "")
|
||||||
{
|
{
|
||||||
printf("Bad JSON in etcd key %s: %s (value: %s)\n", kv.key.c_str(), json_err.c_str(), json_text.c_str());
|
fprintf(stderr, "Bad JSON in etcd key %s: %s (value: %s)\n", kv.key.c_str(), json_err.c_str(), json_text.c_str());
|
||||||
kv.key = "";
|
kv.key = "";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -50,6 +50,11 @@ void etcd_state_client_t::etcd_txn(json11::Json txn, int timeout, std::function<
|
|||||||
|
|
||||||
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
|
void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback)
|
||||||
{
|
{
|
||||||
|
if (!etcd_addresses.size())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "etcd_address is missing in Vitastor configuration\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
|
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
|
||||||
std::string etcd_api_path;
|
std::string etcd_api_path;
|
||||||
int pos = etcd_address.find('/');
|
int pos = etcd_address.find('/');
|
||||||
@@ -76,16 +81,16 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
|
|||||||
addr = addr.substr(7);
|
addr = addr.substr(7);
|
||||||
else if (strtolower(addr.substr(0, 8)) == "https://")
|
else if (strtolower(addr.substr(0, 8)) == "https://")
|
||||||
{
|
{
|
||||||
printf("HTTPS is unsupported for etcd. Either use plain HTTP or setup a local proxy for etcd interaction\n");
|
fprintf(stderr, "HTTPS is unsupported for etcd. Either use plain HTTP or setup a local proxy for etcd interaction\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (addr.find('/') < 0)
|
if (addr.find('/') == std::string::npos)
|
||||||
addr += "/v3";
|
addr += "/v3";
|
||||||
this->etcd_addresses.push_back(addr);
|
this->etcd_addresses.push_back(addr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void etcd_state_client_t::parse_config(json11::Json & config)
|
void etcd_state_client_t::parse_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
this->etcd_addresses.clear();
|
this->etcd_addresses.clear();
|
||||||
if (config["etcd_address"].is_string())
|
if (config["etcd_address"].is_string())
|
||||||
@@ -122,6 +127,11 @@ void etcd_state_client_t::parse_config(json11::Json & config)
|
|||||||
|
|
||||||
void etcd_state_client_t::start_etcd_watcher()
|
void etcd_state_client_t::start_etcd_watcher()
|
||||||
{
|
{
|
||||||
|
if (!etcd_addresses.size())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "etcd_address is missing in Vitastor configuration\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
|
std::string etcd_address = etcd_addresses[rand() % etcd_addresses.size()];
|
||||||
std::string etcd_api_path;
|
std::string etcd_api_path;
|
||||||
int pos = etcd_address.find('/');
|
int pos = etcd_address.find('/');
|
||||||
@@ -139,7 +149,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||||||
json11::Json data = json11::Json::parse(msg->body, json_err);
|
json11::Json data = json11::Json::parse(msg->body, json_err);
|
||||||
if (json_err != "")
|
if (json_err != "")
|
||||||
{
|
{
|
||||||
printf("Bad JSON in etcd event: %s, ignoring event\n", json_err.c_str());
|
fprintf(stderr, "Bad JSON in etcd event: %s, ignoring event\n", json_err.c_str());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -165,7 +175,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||||||
{
|
{
|
||||||
if (this->log_level > 3)
|
if (this->log_level > 3)
|
||||||
{
|
{
|
||||||
printf("Incoming event: %s -> %s\n", kv.first.c_str(), kv.second.value.dump().c_str());
|
fprintf(stderr, "Incoming event: %s -> %s\n", kv.first.c_str(), kv.second.value.dump().c_str());
|
||||||
}
|
}
|
||||||
parse_state(kv.second);
|
parse_state(kv.second);
|
||||||
}
|
}
|
||||||
@@ -240,7 +250,7 @@ void etcd_state_client_t::load_global_config()
|
|||||||
{
|
{
|
||||||
if (err != "")
|
if (err != "")
|
||||||
{
|
{
|
||||||
printf("Error reading OSD configuration from etcd: %s\n", err.c_str());
|
fprintf(stderr, "Error reading OSD configuration from etcd: %s\n", err.c_str());
|
||||||
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
|
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
|
||||||
{
|
{
|
||||||
load_global_config();
|
load_global_config();
|
||||||
@@ -313,7 +323,7 @@ void etcd_state_client_t::load_pgs()
|
|||||||
{
|
{
|
||||||
if (err != "")
|
if (err != "")
|
||||||
{
|
{
|
||||||
printf("Error loading PGs from etcd: %s\n", err.c_str());
|
fprintf(stderr, "Error loading PGs from etcd: %s\n", err.c_str());
|
||||||
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
|
tfd->set_timer(ETCD_SLOW_TIMEOUT, false, [this](int timer_id)
|
||||||
{
|
{
|
||||||
load_pgs();
|
load_pgs();
|
||||||
@@ -342,7 +352,7 @@ void etcd_state_client_t::load_pgs()
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
void etcd_state_client_t::parse_config(json11::Json & config)
|
void etcd_state_client_t::parse_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -376,7 +386,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
||||||
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
||||||
{
|
{
|
||||||
printf("Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
fprintf(stderr, "Pool ID %s is invalid (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
pc.id = pool_id;
|
pc.id = pool_id;
|
||||||
@@ -384,7 +394,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
pc.name = pool_item.second["name"].string_value();
|
pc.name = pool_item.second["name"].string_value();
|
||||||
if (pc.name == "")
|
if (pc.name == "")
|
||||||
{
|
{
|
||||||
printf("Pool %u has empty name, skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has empty name, skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Failure Domain
|
// Failure Domain
|
||||||
@@ -398,7 +408,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
pc.scheme = POOL_SCHEME_JERASURE;
|
pc.scheme = POOL_SCHEME_JERASURE;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// PG Size
|
// PG Size
|
||||||
@@ -408,7 +418,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) ||
|
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) ||
|
||||||
pool_item.second["pg_size"].uint64_value() > 256)
|
pool_item.second["pg_size"].uint64_value() > 256)
|
||||||
{
|
{
|
||||||
printf("Pool %u has invalid pg_size, skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid pg_size, skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Parity Chunks
|
// Parity Chunks
|
||||||
@@ -417,7 +427,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
{
|
{
|
||||||
if (pc.parity_chunks > 1)
|
if (pc.parity_chunks > 1)
|
||||||
{
|
{
|
||||||
printf("Pool %u has invalid parity_chunks (must be 1), skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid parity_chunks (must be 1), skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
pc.parity_chunks = 1;
|
pc.parity_chunks = 1;
|
||||||
@@ -425,7 +435,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
if (pc.scheme == POOL_SCHEME_JERASURE &&
|
if (pc.scheme == POOL_SCHEME_JERASURE &&
|
||||||
(pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
|
(pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
|
||||||
{
|
{
|
||||||
printf("Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// PG MinSize
|
// PG MinSize
|
||||||
@@ -434,14 +444,14 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) &&
|
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) &&
|
||||||
pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
|
pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
|
||||||
{
|
{
|
||||||
printf("Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// PG Count
|
// PG Count
|
||||||
pc.pg_count = pool_item.second["pg_count"].uint64_value();
|
pc.pg_count = pool_item.second["pg_count"].uint64_value();
|
||||||
if (pc.pg_count < 1)
|
if (pc.pg_count < 1)
|
||||||
{
|
{
|
||||||
printf("Pool %u has invalid pg_count, skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid pg_count, skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Max OSD Combinations
|
// Max OSD Combinations
|
||||||
@@ -450,7 +460,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
pc.max_osd_combinations = 10000;
|
pc.max_osd_combinations = 10000;
|
||||||
if (pc.max_osd_combinations > 0 && pc.max_osd_combinations < 100)
|
if (pc.max_osd_combinations > 0 && pc.max_osd_combinations < 100)
|
||||||
{
|
{
|
||||||
printf("Pool %u has invalid max_osd_combinations (must be at least 100), skipping pool\n", pool_id);
|
fprintf(stderr, "Pool %u has invalid max_osd_combinations (must be at least 100), skipping pool\n", pool_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// PG Stripe Size
|
// PG Stripe Size
|
||||||
@@ -468,7 +478,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
{
|
{
|
||||||
if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
|
if (pg_item.second.target_set.size() != parsed_cfg.pg_size)
|
||||||
{
|
{
|
||||||
printf("Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
|
fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
|
||||||
pool_id, pg_item.first, pg_item.second.target_set.size(), parsed_cfg.pg_size);
|
pool_id, pg_item.first, pg_item.second.target_set.size(), parsed_cfg.pg_size);
|
||||||
pg_item.second.pause = true;
|
pg_item.second.pause = true;
|
||||||
}
|
}
|
||||||
@@ -491,7 +501,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
sscanf(pool_item.first.c_str(), "%u%c", &pool_id, &null_byte);
|
||||||
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
if (!pool_id || pool_id >= POOL_ID_MAX || null_byte != 0)
|
||||||
{
|
{
|
||||||
printf("Pool ID %s is invalid in PG configuration (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
fprintf(stderr, "Pool ID %s is invalid in PG configuration (must be a number less than 0x%x), skipping pool\n", pool_item.first.c_str(), POOL_ID_MAX);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (auto & pg_item: pool_item.second.object_items())
|
for (auto & pg_item: pool_item.second.object_items())
|
||||||
@@ -500,7 +510,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
sscanf(pg_item.first.c_str(), "%u%c", &pg_num, &null_byte);
|
sscanf(pg_item.first.c_str(), "%u%c", &pg_num, &null_byte);
|
||||||
if (!pg_num || null_byte != 0)
|
if (!pg_num || null_byte != 0)
|
||||||
{
|
{
|
||||||
printf("Bad key in pool %u PG configuration: %s (must be a number), skipped\n", pool_id, pg_item.first.c_str());
|
fprintf(stderr, "Bad key in pool %u PG configuration: %s (must be a number), skipped\n", pool_id, pg_item.first.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
auto & parsed_cfg = this->pool_config[pool_id].pg_config[pg_num];
|
auto & parsed_cfg = this->pool_config[pool_id].pg_config[pg_num];
|
||||||
@@ -514,7 +524,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
}
|
}
|
||||||
if (parsed_cfg.target_set.size() != pool_config[pool_id].pg_size)
|
if (parsed_cfg.target_set.size() != pool_config[pool_id].pg_size)
|
||||||
{
|
{
|
||||||
printf("Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
|
fprintf(stderr, "Pool %u PG %u configuration is invalid: osd_set size %lu != pool pg_size %lu\n",
|
||||||
pool_id, pg_num, parsed_cfg.target_set.size(), pool_config[pool_id].pg_size);
|
pool_id, pg_num, parsed_cfg.target_set.size(), pool_config[pool_id].pg_size);
|
||||||
parsed_cfg.pause = true;
|
parsed_cfg.pause = true;
|
||||||
}
|
}
|
||||||
@@ -527,8 +537,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
{
|
{
|
||||||
if (pg_it->second.exists && pg_it->first != ++n)
|
if (pg_it->second.exists && pg_it->first != ++n)
|
||||||
{
|
{
|
||||||
printf(
|
fprintf(
|
||||||
"Invalid pool %u PG configuration: PG numbers don't cover whole 1..%lu range\n",
|
stderr, "Invalid pool %u PG configuration: PG numbers don't cover whole 1..%lu range\n",
|
||||||
pool_item.second.id, pool_item.second.pg_config.size()
|
pool_item.second.id, pool_item.second.pg_config.size()
|
||||||
);
|
);
|
||||||
for (pg_it = pool_item.second.pg_config.begin(); pg_it != pool_item.second.pg_config.end(); pg_it++)
|
for (pg_it = pool_item.second.pg_config.begin(); pg_it != pool_item.second.pg_config.end(); pg_it++)
|
||||||
@@ -551,7 +561,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
sscanf(key.c_str() + etcd_prefix.length()+12, "%u/%u%c", &pool_id, &pg_num, &null_byte);
|
sscanf(key.c_str() + etcd_prefix.length()+12, "%u/%u%c", &pool_id, &pg_num, &null_byte);
|
||||||
if (!pool_id || pool_id >= POOL_ID_MAX || !pg_num || null_byte != 0)
|
if (!pool_id || pool_id >= POOL_ID_MAX || !pg_num || null_byte != 0)
|
||||||
{
|
{
|
||||||
printf("Bad etcd key %s, ignoring\n", key.c_str());
|
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -590,7 +600,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
sscanf(key.c_str() + etcd_prefix.length()+10, "%u/%u%c", &pool_id, &pg_num, &null_byte);
|
sscanf(key.c_str() + etcd_prefix.length()+10, "%u/%u%c", &pool_id, &pg_num, &null_byte);
|
||||||
if (!pool_id || pool_id >= POOL_ID_MAX || !pg_num || null_byte != 0)
|
if (!pool_id || pool_id >= POOL_ID_MAX || !pg_num || null_byte != 0)
|
||||||
{
|
{
|
||||||
printf("Bad etcd key %s, ignoring\n", key.c_str());
|
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
||||||
}
|
}
|
||||||
else if (value.is_null())
|
else if (value.is_null())
|
||||||
{
|
{
|
||||||
@@ -614,7 +624,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
}
|
}
|
||||||
if (i >= pg_state_bit_count)
|
if (i >= pg_state_bit_count)
|
||||||
{
|
{
|
||||||
printf("Unexpected pool %u PG %u state keyword in etcd: %s\n", pool_id, pg_num, e.dump().c_str());
|
fprintf(stderr, "Unexpected pool %u PG %u state keyword in etcd: %s\n", pool_id, pg_num, e.dump().c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -623,7 +633,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
(state & PG_PEERING) && state != PG_PEERING ||
|
(state & PG_PEERING) && state != PG_PEERING ||
|
||||||
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
|
(state & PG_INCOMPLETE) && state != PG_INCOMPLETE)
|
||||||
{
|
{
|
||||||
printf("Unexpected pool %u PG %u state in etcd: primary=%lu, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
|
fprintf(stderr, "Unexpected pool %u PG %u state in etcd: primary=%lu, state=%s\n", pool_id, pg_num, cur_primary, value["state"].dump().c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
this->pool_config[pool_id].pg_config[pg_num].cur_primary = cur_primary;
|
this->pool_config[pool_id].pg_config[pg_num].cur_primary = cur_primary;
|
||||||
@@ -661,7 +671,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
sscanf(key.c_str() + etcd_prefix.length()+14, "%lu/%lu%c", &pool_id, &inode_num, &null_byte);
|
sscanf(key.c_str() + etcd_prefix.length()+14, "%lu/%lu%c", &pool_id, &inode_num, &null_byte);
|
||||||
if (!pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)) || null_byte != 0)
|
if (!pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)) || null_byte != 0)
|
||||||
{
|
{
|
||||||
printf("Bad etcd key %s, ignoring\n", key.c_str());
|
fprintf(stderr, "Bad etcd key %s, ignoring\n", key.c_str());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -696,8 +706,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
|||||||
parent_inode_num |= pool_id << (64-POOL_ID_BITS);
|
parent_inode_num |= pool_id << (64-POOL_ID_BITS);
|
||||||
else if (parent_pool_id >= POOL_ID_MAX)
|
else if (parent_pool_id >= POOL_ID_MAX)
|
||||||
{
|
{
|
||||||
printf(
|
fprintf(
|
||||||
"Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
|
stderr, "Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
|
||||||
inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
|
inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
|
||||||
);
|
);
|
||||||
parent_inode_num = 0;
|
parent_inode_num = 0;
|
||||||
|
@@ -106,7 +106,7 @@ public:
|
|||||||
void load_global_config();
|
void load_global_config();
|
||||||
void load_pgs();
|
void load_pgs();
|
||||||
void parse_state(const etcd_kv_t & kv);
|
void parse_state(const etcd_kv_t & kv);
|
||||||
void parse_config(json11::Json & config);
|
void parse_config(const json11::Json & config);
|
||||||
inode_watch_t* watch_inode(std::string name);
|
inode_watch_t* watch_inode(std::string name);
|
||||||
void close_watch(inode_watch_t* watch);
|
void close_watch(inode_watch_t* watch);
|
||||||
~etcd_state_client_t();
|
~etcd_state_client_t();
|
||||||
|
@@ -24,7 +24,6 @@
|
|||||||
#include <netinet/tcp.h>
|
#include <netinet/tcp.h>
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#include "epoll_manager.h"
|
#include "epoll_manager.h"
|
||||||
#include "cluster_client.h"
|
#include "cluster_client.h"
|
||||||
@@ -46,6 +45,7 @@ struct sec_data
|
|||||||
struct sec_options
|
struct sec_options
|
||||||
{
|
{
|
||||||
int __pad;
|
int __pad;
|
||||||
|
char *config_path = NULL;
|
||||||
char *etcd_host = NULL;
|
char *etcd_host = NULL;
|
||||||
char *etcd_prefix = NULL;
|
char *etcd_prefix = NULL;
|
||||||
char *image = NULL;
|
char *image = NULL;
|
||||||
@@ -53,9 +53,23 @@ struct sec_options
|
|||||||
uint64_t inode = 0;
|
uint64_t inode = 0;
|
||||||
int cluster_log = 0;
|
int cluster_log = 0;
|
||||||
int trace = 0;
|
int trace = 0;
|
||||||
|
int use_rdma = 0;
|
||||||
|
char *rdma_device = NULL;
|
||||||
|
int rdma_port_num = 0;
|
||||||
|
int rdma_gid_index = 0;
|
||||||
|
int rdma_mtu = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
static struct fio_option options[] = {
|
static struct fio_option options[] = {
|
||||||
|
{
|
||||||
|
.name = "conf",
|
||||||
|
.lname = "Vitastor config path",
|
||||||
|
.type = FIO_OPT_STR_STORE,
|
||||||
|
.off1 = offsetof(struct sec_options, config_path),
|
||||||
|
.help = "Vitastor config path",
|
||||||
|
.category = FIO_OPT_C_ENGINE,
|
||||||
|
.group = FIO_OPT_G_FILENAME,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
.name = "etcd",
|
.name = "etcd",
|
||||||
.lname = "etcd address",
|
.lname = "etcd address",
|
||||||
@@ -121,6 +135,55 @@ static struct fio_option options[] = {
|
|||||||
.category = FIO_OPT_C_ENGINE,
|
.category = FIO_OPT_C_ENGINE,
|
||||||
.group = FIO_OPT_G_FILENAME,
|
.group = FIO_OPT_G_FILENAME,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.name = "use_rdma",
|
||||||
|
.lname = "Use RDMA",
|
||||||
|
.type = FIO_OPT_BOOL,
|
||||||
|
.off1 = offsetof(struct sec_options, use_rdma),
|
||||||
|
.help = "Use RDMA",
|
||||||
|
.def = "-1",
|
||||||
|
.category = FIO_OPT_C_ENGINE,
|
||||||
|
.group = FIO_OPT_G_FILENAME,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.name = "rdma_device",
|
||||||
|
.lname = "RDMA device name",
|
||||||
|
.type = FIO_OPT_STR_STORE,
|
||||||
|
.off1 = offsetof(struct sec_options, rdma_device),
|
||||||
|
.help = "RDMA device name",
|
||||||
|
.category = FIO_OPT_C_ENGINE,
|
||||||
|
.group = FIO_OPT_G_FILENAME,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.name = "rdma_port_num",
|
||||||
|
.lname = "RDMA port number",
|
||||||
|
.type = FIO_OPT_INT,
|
||||||
|
.off1 = offsetof(struct sec_options, rdma_port_num),
|
||||||
|
.help = "RDMA port number",
|
||||||
|
.def = "0",
|
||||||
|
.category = FIO_OPT_C_ENGINE,
|
||||||
|
.group = FIO_OPT_G_FILENAME,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.name = "rdma_gid_index",
|
||||||
|
.lname = "RDMA gid index",
|
||||||
|
.type = FIO_OPT_INT,
|
||||||
|
.off1 = offsetof(struct sec_options, rdma_gid_index),
|
||||||
|
.help = "RDMA gid index",
|
||||||
|
.def = "0",
|
||||||
|
.category = FIO_OPT_C_ENGINE,
|
||||||
|
.group = FIO_OPT_G_FILENAME,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.name = "rdma_mtu",
|
||||||
|
.lname = "RDMA path MTU",
|
||||||
|
.type = FIO_OPT_INT,
|
||||||
|
.off1 = offsetof(struct sec_options, rdma_mtu),
|
||||||
|
.help = "RDMA path MTU",
|
||||||
|
.def = "0",
|
||||||
|
.category = FIO_OPT_C_ENGINE,
|
||||||
|
.group = FIO_OPT_G_FILENAME,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
.name = NULL,
|
.name = NULL,
|
||||||
},
|
},
|
||||||
@@ -131,12 +194,6 @@ static int sec_setup(struct thread_data *td)
|
|||||||
sec_options *o = (sec_options*)td->eo;
|
sec_options *o = (sec_options*)td->eo;
|
||||||
sec_data *bsd;
|
sec_data *bsd;
|
||||||
|
|
||||||
if (!o->etcd_host)
|
|
||||||
{
|
|
||||||
td_verror(td, EINVAL, "etcd address is missing");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
bsd = new sec_data;
|
bsd = new sec_data;
|
||||||
if (!bsd)
|
if (!bsd)
|
||||||
{
|
{
|
||||||
@@ -152,11 +209,26 @@ static int sec_setup(struct thread_data *td)
|
|||||||
td->o.open_files++;
|
td->o.open_files++;
|
||||||
}
|
}
|
||||||
|
|
||||||
json11::Json cfg = json11::Json::object {
|
json11::Json::object cfg;
|
||||||
{ "etcd_address", std::string(o->etcd_host) },
|
if (o->config_path)
|
||||||
{ "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/vitastor") },
|
cfg["config_path"] = std::string(o->config_path);
|
||||||
{ "log_level", o->cluster_log },
|
if (o->etcd_host)
|
||||||
};
|
cfg["etcd_address"] = std::string(o->etcd_host);
|
||||||
|
if (o->etcd_prefix)
|
||||||
|
cfg["etcd_prefix"] = std::string(o->etcd_prefix);
|
||||||
|
if (o->rdma_device)
|
||||||
|
cfg["rdma_device"] = std::string(o->rdma_device);
|
||||||
|
if (o->rdma_port_num)
|
||||||
|
cfg["rdma_port_num"] = o->rdma_port_num;
|
||||||
|
if (o->rdma_gid_index)
|
||||||
|
cfg["rdma_gid_index"] = o->rdma_gid_index;
|
||||||
|
if (o->rdma_mtu)
|
||||||
|
cfg["rdma_mtu"] = o->rdma_mtu;
|
||||||
|
if (o->cluster_log)
|
||||||
|
cfg["log_level"] = o->cluster_log;
|
||||||
|
if (o->use_rdma != -1)
|
||||||
|
cfg["use_rdma"] = o->use_rdma;
|
||||||
|
json11::Json cfg_json(cfg);
|
||||||
|
|
||||||
if (!o->image)
|
if (!o->image)
|
||||||
{
|
{
|
||||||
@@ -181,7 +253,7 @@ static int sec_setup(struct thread_data *td)
|
|||||||
}
|
}
|
||||||
bsd->ringloop = new ring_loop_t(512);
|
bsd->ringloop = new ring_loop_t(512);
|
||||||
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
|
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
|
||||||
bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg);
|
bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg_json);
|
||||||
if (o->image)
|
if (o->image)
|
||||||
{
|
{
|
||||||
while (!bsd->cli->is_ready())
|
while (!bsd->cli->is_ready())
|
||||||
|
@@ -12,6 +12,31 @@
|
|||||||
|
|
||||||
void osd_messenger_t::init()
|
void osd_messenger_t::init()
|
||||||
{
|
{
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (use_rdma)
|
||||||
|
{
|
||||||
|
rdma_context = msgr_rdma_context_t::create(
|
||||||
|
rdma_device != "" ? rdma_device.c_str() : NULL,
|
||||||
|
rdma_port_num, rdma_gid_index, rdma_mtu
|
||||||
|
);
|
||||||
|
if (!rdma_context)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "[OSD %lu] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
|
||||||
|
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
||||||
|
fprintf(stderr, "[OSD %lu] RDMA initialized successfully\n", osd_num);
|
||||||
|
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
|
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
||||||
|
{
|
||||||
|
handle_rdma_events();
|
||||||
|
});
|
||||||
|
handle_rdma_events();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||||
{
|
{
|
||||||
std::vector<int> to_stop;
|
std::vector<int> to_stop;
|
||||||
@@ -19,7 +44,7 @@ void osd_messenger_t::init()
|
|||||||
for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
|
for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
|
||||||
{
|
{
|
||||||
auto cl = cl_it->second;
|
auto cl = cl_it->second;
|
||||||
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED)
|
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
|
||||||
{
|
{
|
||||||
// Do not run keepalive on regular clients
|
// Do not run keepalive on regular clients
|
||||||
continue;
|
continue;
|
||||||
@@ -30,7 +55,7 @@ void osd_messenger_t::init()
|
|||||||
if (!cl->ping_time_remaining)
|
if (!cl->ping_time_remaining)
|
||||||
{
|
{
|
||||||
// Ping timed out, stop the client
|
// Ping timed out, stop the client
|
||||||
printf("Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
fprintf(stderr, "Ping timed out for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||||
to_stop.push_back(cl->peer_fd);
|
to_stop.push_back(cl->peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -57,7 +82,7 @@ void osd_messenger_t::init()
|
|||||||
delete op;
|
delete op;
|
||||||
if (fail_fd >= 0)
|
if (fail_fd >= 0)
|
||||||
{
|
{
|
||||||
printf("Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
|
||||||
stop_client(fail_fd, true);
|
stop_client(fail_fd, true);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -94,32 +119,58 @@ osd_messenger_t::~osd_messenger_t()
|
|||||||
{
|
{
|
||||||
stop_client(clients.begin()->first, true);
|
stop_client(clients.begin()->first, true);
|
||||||
}
|
}
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (rdma_context)
|
||||||
|
{
|
||||||
|
delete rdma_context;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_messenger_t::parse_config(const json11::Json & config)
|
void osd_messenger_t::parse_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (!config["use_rdma"].is_null())
|
||||||
|
{
|
||||||
|
// RDMA is on by default in RDMA-enabled builds
|
||||||
|
this->use_rdma = config["use_rdma"].bool_value() || config["use_rdma"].uint64_value() != 0;
|
||||||
|
}
|
||||||
|
this->rdma_device = config["rdma_device"].string_value();
|
||||||
|
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
|
||||||
|
if (!this->rdma_port_num)
|
||||||
|
this->rdma_port_num = 1;
|
||||||
|
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
|
||||||
|
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
|
||||||
|
this->rdma_max_sge = config["rdma_max_sge"].uint64_value();
|
||||||
|
if (!this->rdma_max_sge)
|
||||||
|
this->rdma_max_sge = 128;
|
||||||
|
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||||
|
if (!this->rdma_max_send)
|
||||||
|
this->rdma_max_send = 32;
|
||||||
|
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||||
|
if (!this->rdma_max_recv)
|
||||||
|
this->rdma_max_recv = 8;
|
||||||
|
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||||
|
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||||
|
this->rdma_max_msg = 1024*1024;
|
||||||
|
#endif
|
||||||
|
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
|
||||||
|
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
|
||||||
|
this->receive_buffer_size = 65536;
|
||||||
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
||||||
config["use_sync_send_recv"].uint64_value();
|
config["use_sync_send_recv"].uint64_value();
|
||||||
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
||||||
if (!this->peer_connect_interval)
|
if (!this->peer_connect_interval)
|
||||||
{
|
this->peer_connect_interval = 5;
|
||||||
this->peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
|
|
||||||
}
|
|
||||||
this->peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
|
this->peer_connect_timeout = config["peer_connect_timeout"].uint64_value();
|
||||||
if (!this->peer_connect_timeout)
|
if (!this->peer_connect_timeout)
|
||||||
{
|
this->peer_connect_timeout = 5;
|
||||||
this->peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
|
|
||||||
}
|
|
||||||
this->osd_idle_timeout = config["osd_idle_timeout"].uint64_value();
|
this->osd_idle_timeout = config["osd_idle_timeout"].uint64_value();
|
||||||
if (!this->osd_idle_timeout)
|
if (!this->osd_idle_timeout)
|
||||||
{
|
this->osd_idle_timeout = 5;
|
||||||
this->osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
|
||||||
}
|
|
||||||
this->osd_ping_timeout = config["osd_ping_timeout"].uint64_value();
|
this->osd_ping_timeout = config["osd_ping_timeout"].uint64_value();
|
||||||
if (!this->osd_ping_timeout)
|
if (!this->osd_ping_timeout)
|
||||||
{
|
this->osd_ping_timeout = 5;
|
||||||
this->osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
|
||||||
}
|
|
||||||
this->log_level = config["log_level"].uint64_value();
|
this->log_level = config["log_level"].uint64_value();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -254,7 +305,7 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
|
|||||||
if (epoll_events & EPOLLRDHUP)
|
if (epoll_events & EPOLLRDHUP)
|
||||||
{
|
{
|
||||||
// Stop client
|
// Stop client
|
||||||
printf("[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
|
fprintf(stderr, "[OSD %lu] client %d disconnected\n", this->osd_num, peer_fd);
|
||||||
stop_client(peer_fd, true);
|
stop_client(peer_fd, true);
|
||||||
}
|
}
|
||||||
else if (epoll_events & EPOLLIN)
|
else if (epoll_events & EPOLLIN)
|
||||||
@@ -279,7 +330,7 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
|
|||||||
wp.connecting = false;
|
wp.connecting = false;
|
||||||
if (peer_fd < 0)
|
if (peer_fd < 0)
|
||||||
{
|
{
|
||||||
printf("Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
|
fprintf(stderr, "Failed to connect to peer OSD %lu address %s port %d: %s\n", peer_osd, wp.cur_addr.c_str(), wp.cur_port, strerror(-peer_fd));
|
||||||
if (wp.address_changed)
|
if (wp.address_changed)
|
||||||
{
|
{
|
||||||
wp.address_changed = false;
|
wp.address_changed = false;
|
||||||
@@ -306,7 +357,7 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
|
|||||||
}
|
}
|
||||||
if (log_level > 0)
|
if (log_level > 0)
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] Connected with peer OSD %lu (client %d)\n", osd_num, peer_osd, peer_fd);
|
fprintf(stderr, "[OSD %lu] Connected with peer OSD %lu (client %d)\n", osd_num, peer_osd, peer_fd);
|
||||||
}
|
}
|
||||||
wanted_peers.erase(peer_osd);
|
wanted_peers.erase(peer_osd);
|
||||||
repeer_pgs(peer_osd);
|
repeer_pgs(peer_osd);
|
||||||
@@ -326,6 +377,24 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (rdma_context)
|
||||||
|
{
|
||||||
|
cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, rdma_max_msg);
|
||||||
|
if (cl->rdma_conn)
|
||||||
|
{
|
||||||
|
json11::Json payload = json11::Json::object {
|
||||||
|
{ "connect_rdma", cl->rdma_conn->addr.to_string() },
|
||||||
|
{ "rdma_max_msg", cl->rdma_conn->max_msg },
|
||||||
|
};
|
||||||
|
std::string payload_str = payload.dump();
|
||||||
|
op->req.show_conf.json_len = payload_str.size();
|
||||||
|
op->buf = malloc_or_die(payload_str.size());
|
||||||
|
op->iov.push_back(op->buf, payload_str.size());
|
||||||
|
memcpy(op->buf, payload_str.c_str(), payload_str.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
op->callback = [this, cl](osd_op_t *op)
|
op->callback = [this, cl](osd_op_t *op)
|
||||||
{
|
{
|
||||||
std::string json_err;
|
std::string json_err;
|
||||||
@@ -334,7 +403,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
printf("Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl->osd_num, op->reply.hdr.retval);
|
fprintf(stderr, "Failed to get config from OSD %lu (retval=%ld), disconnecting peer\n", cl->osd_num, op->reply.hdr.retval);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -342,18 +411,18 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||||||
if (json_err != "")
|
if (json_err != "")
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
printf("Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl->osd_num, json_err.c_str());
|
fprintf(stderr, "Failed to get config from OSD %lu: bad JSON: %s, disconnecting peer\n", cl->osd_num, json_err.c_str());
|
||||||
}
|
}
|
||||||
else if (config["osd_num"].uint64_value() != cl->osd_num)
|
else if (config["osd_num"].uint64_value() != cl->osd_num)
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
printf("Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl->osd_num);
|
fprintf(stderr, "Connected to OSD %lu instead of OSD %lu, peer state is outdated, disconnecting peer\n", config["osd_num"].uint64_value(), cl->osd_num);
|
||||||
}
|
}
|
||||||
else if (config["protocol_version"].uint64_value() != OSD_PROTOCOL_VERSION)
|
else if (config["protocol_version"].uint64_value() != OSD_PROTOCOL_VERSION)
|
||||||
{
|
{
|
||||||
err = true;
|
err = true;
|
||||||
printf(
|
fprintf(
|
||||||
"OSD %lu protocol version is %lu, but only version %u is supported.\n"
|
stderr, "OSD %lu protocol version is %lu, but only version %u is supported.\n"
|
||||||
" If you need to upgrade from 0.5.x please request it via the issue tracker.\n",
|
" If you need to upgrade from 0.5.x please request it via the issue tracker.\n",
|
||||||
cl->osd_num, config["protocol_version"].uint64_value(), OSD_PROTOCOL_VERSION
|
cl->osd_num, config["protocol_version"].uint64_value(), OSD_PROTOCOL_VERSION
|
||||||
);
|
);
|
||||||
@@ -361,12 +430,50 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||||||
}
|
}
|
||||||
if (err)
|
if (err)
|
||||||
{
|
{
|
||||||
osd_num_t osd_num = cl->osd_num;
|
osd_num_t peer_osd = cl->osd_num;
|
||||||
stop_client(op->peer_fd);
|
stop_client(op->peer_fd);
|
||||||
on_connect_peer(osd_num, -1);
|
on_connect_peer(peer_osd, -1);
|
||||||
delete op;
|
delete op;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (config["rdma_address"].is_string())
|
||||||
|
{
|
||||||
|
msgr_rdma_address_t addr;
|
||||||
|
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
|
||||||
|
cl->rdma_conn->connect(&addr) != 0)
|
||||||
|
{
|
||||||
|
fprintf(
|
||||||
|
stderr, "Failed to connect to OSD %lu (address %s) using RDMA\n",
|
||||||
|
cl->osd_num, config["rdma_address"].string_value().c_str()
|
||||||
|
);
|
||||||
|
delete cl->rdma_conn;
|
||||||
|
cl->rdma_conn = NULL;
|
||||||
|
// FIXME: Keep TCP connection in this case
|
||||||
|
osd_num_t peer_osd = cl->osd_num;
|
||||||
|
stop_client(cl->peer_fd);
|
||||||
|
on_connect_peer(peer_osd, -1);
|
||||||
|
delete op;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
uint64_t server_max_msg = config["rdma_max_msg"].uint64_value();
|
||||||
|
if (cl->rdma_conn->max_msg > server_max_msg)
|
||||||
|
{
|
||||||
|
cl->rdma_conn->max_msg = server_max_msg;
|
||||||
|
}
|
||||||
|
if (log_level > 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
|
||||||
|
}
|
||||||
|
cl->peer_state = PEER_RDMA;
|
||||||
|
tfd->set_fd_handler(cl->peer_fd, false, NULL);
|
||||||
|
// Add the initial receive request
|
||||||
|
try_recv_rdma(cl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
osd_peer_fds[cl->osd_num] = cl->peer_fd;
|
osd_peer_fds[cl->osd_num] = cl->peer_fd;
|
||||||
on_connect_peer(cl->osd_num, cl->peer_fd);
|
on_connect_peer(cl->osd_num, cl->peer_fd);
|
||||||
delete op;
|
delete op;
|
||||||
@@ -384,7 +491,7 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
|||||||
{
|
{
|
||||||
assert(peer_fd != 0);
|
assert(peer_fd != 0);
|
||||||
char peer_str[256];
|
char peer_str[256];
|
||||||
printf("[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
|
fprintf(stderr, "[OSD %lu] new client %d: connection from %s port %d\n", this->osd_num, peer_fd,
|
||||||
inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
|
inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
|
||||||
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
int one = 1;
|
int one = 1;
|
||||||
@@ -408,3 +515,59 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
|||||||
throw std::runtime_error(std::string("accept: ") + strerror(errno));
|
throw std::runtime_error(std::string("accept: ") + strerror(errno));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
bool osd_messenger_t::is_rdma_enabled()
|
||||||
|
{
|
||||||
|
return rdma_context != NULL;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
||||||
|
{
|
||||||
|
const char *config_path = config["config_path"].string_value() != ""
|
||||||
|
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
|
||||||
|
int fd = open(config_path, O_RDONLY);
|
||||||
|
if (fd < 0)
|
||||||
|
{
|
||||||
|
if (errno != ENOENT)
|
||||||
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
struct stat st;
|
||||||
|
if (fstat(fd, &st) != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
|
close(fd);
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
std::string buf;
|
||||||
|
buf.resize(st.st_size);
|
||||||
|
int done = 0;
|
||||||
|
while (done < st.st_size)
|
||||||
|
{
|
||||||
|
int r = read(fd, (void*)buf.data()+done, st.st_size-done);
|
||||||
|
if (r < 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||||
|
close(fd);
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
done += r;
|
||||||
|
}
|
||||||
|
close(fd);
|
||||||
|
std::string json_err;
|
||||||
|
json11::Json::object file_config = json11::Json::parse(buf, json_err).object_items();
|
||||||
|
if (json_err != "")
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
file_config.erase("config_path");
|
||||||
|
file_config.erase("osd_num");
|
||||||
|
for (auto kv: config.object_items())
|
||||||
|
{
|
||||||
|
file_config[kv.first] = kv.second;
|
||||||
|
}
|
||||||
|
return file_config;
|
||||||
|
}
|
||||||
|
@@ -18,20 +18,32 @@
|
|||||||
#include "timerfd_manager.h"
|
#include "timerfd_manager.h"
|
||||||
#include <ringloop.h>
|
#include <ringloop.h>
|
||||||
|
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
#include "msgr_rdma.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#define CL_READ_HDR 1
|
#define CL_READ_HDR 1
|
||||||
#define CL_READ_DATA 2
|
#define CL_READ_DATA 2
|
||||||
#define CL_READ_REPLY_DATA 3
|
#define CL_READ_REPLY_DATA 3
|
||||||
#define CL_WRITE_READY 1
|
#define CL_WRITE_READY 1
|
||||||
#define CL_WRITE_REPLY 2
|
|
||||||
|
|
||||||
#define PEER_CONNECTING 1
|
#define PEER_CONNECTING 1
|
||||||
#define PEER_CONNECTED 2
|
#define PEER_CONNECTED 2
|
||||||
#define PEER_STOPPED 3
|
#define PEER_RDMA_CONNECTING 3
|
||||||
|
#define PEER_RDMA 4
|
||||||
|
#define PEER_STOPPED 5
|
||||||
|
|
||||||
#define DEFAULT_PEER_CONNECT_INTERVAL 5
|
|
||||||
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
|
|
||||||
#define DEFAULT_OSD_PING_TIMEOUT 5
|
|
||||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||||
|
#define VITASTOR_CONFIG_PATH "/etc/vitastor/vitastor.conf"
|
||||||
|
|
||||||
|
#define MSGR_SENDP_HDR 1
|
||||||
|
#define MSGR_SENDP_FREE 2
|
||||||
|
|
||||||
|
struct msgr_sendp_t
|
||||||
|
{
|
||||||
|
osd_op_t *op;
|
||||||
|
int flags;
|
||||||
|
};
|
||||||
|
|
||||||
struct osd_client_t
|
struct osd_client_t
|
||||||
{
|
{
|
||||||
@@ -48,6 +60,10 @@ struct osd_client_t
|
|||||||
|
|
||||||
void *in_buf = NULL;
|
void *in_buf = NULL;
|
||||||
|
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
msgr_rdma_connection_t *rdma_conn = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
// Read state
|
// Read state
|
||||||
int read_ready = 0;
|
int read_ready = 0;
|
||||||
osd_op_t *read_op = NULL;
|
osd_op_t *read_op = NULL;
|
||||||
@@ -70,7 +86,7 @@ struct osd_client_t
|
|||||||
msghdr write_msg = { 0 };
|
msghdr write_msg = { 0 };
|
||||||
int write_state = 0;
|
int write_state = 0;
|
||||||
std::vector<iovec> send_list, next_send_list;
|
std::vector<iovec> send_list, next_send_list;
|
||||||
std::vector<osd_op_t*> outbox, next_outbox;
|
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||||
|
|
||||||
~osd_client_t()
|
~osd_client_t()
|
||||||
{
|
{
|
||||||
@@ -104,15 +120,23 @@ struct osd_messenger_t
|
|||||||
protected:
|
protected:
|
||||||
int keepalive_timer_id = -1;
|
int keepalive_timer_id = -1;
|
||||||
|
|
||||||
// FIXME: make receive_buffer_size configurable
|
uint32_t receive_buffer_size = 0;
|
||||||
int receive_buffer_size = 64*1024;
|
int peer_connect_interval = 0;
|
||||||
int peer_connect_interval = DEFAULT_PEER_CONNECT_INTERVAL;
|
int peer_connect_timeout = 0;
|
||||||
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
|
int osd_idle_timeout = 0;
|
||||||
int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
int osd_ping_timeout = 0;
|
||||||
int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
|
||||||
int log_level = 0;
|
int log_level = 0;
|
||||||
bool use_sync_send_recv = false;
|
bool use_sync_send_recv = false;
|
||||||
|
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
bool use_rdma = true;
|
||||||
|
std::string rdma_device;
|
||||||
|
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
|
||||||
|
msgr_rdma_context_t *rdma_context = NULL;
|
||||||
|
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 8;
|
||||||
|
uint64_t rdma_max_msg = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
std::vector<int> read_ready_clients;
|
std::vector<int> read_ready_clients;
|
||||||
std::vector<int> write_ready_clients;
|
std::vector<int> write_ready_clients;
|
||||||
std::vector<std::function<void()>> set_immediate;
|
std::vector<std::function<void()>> set_immediate;
|
||||||
@@ -141,6 +165,13 @@ public:
|
|||||||
void accept_connections(int listen_fd);
|
void accept_connections(int listen_fd);
|
||||||
~osd_messenger_t();
|
~osd_messenger_t();
|
||||||
|
|
||||||
|
static json11::Json read_config(const json11::Json & config);
|
||||||
|
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
bool is_rdma_enabled();
|
||||||
|
bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg);
|
||||||
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void try_connect_peer(uint64_t osd_num);
|
void try_connect_peer(uint64_t osd_num);
|
||||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||||
@@ -156,8 +187,15 @@ protected:
|
|||||||
void handle_send(int result, osd_client_t *cl);
|
void handle_send(int result, osd_client_t *cl);
|
||||||
|
|
||||||
bool handle_read(int result, osd_client_t *cl);
|
bool handle_read(int result, osd_client_t *cl);
|
||||||
|
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
||||||
bool handle_finished_read(osd_client_t *cl);
|
bool handle_finished_read(osd_client_t *cl);
|
||||||
void handle_op_hdr(osd_client_t *cl);
|
void handle_op_hdr(osd_client_t *cl);
|
||||||
bool handle_reply_hdr(osd_client_t *cl);
|
bool handle_reply_hdr(osd_client_t *cl);
|
||||||
void handle_reply_ready(osd_op_t *op);
|
void handle_reply_ready(osd_op_t *op);
|
||||||
|
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
bool try_send_rdma(osd_client_t *cl);
|
||||||
|
bool try_recv_rdma(osd_client_t *cl);
|
||||||
|
void handle_rdma_events();
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
@@ -42,3 +42,8 @@ void osd_messenger_t::read_requests()
|
|||||||
void osd_messenger_t::send_replies()
|
void osd_messenger_t::send_replies()
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
||||||
|
{
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
@@ -76,7 +76,7 @@ struct osd_op_buf_list_t
|
|||||||
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
memcpy(buf, inline_buf, sizeof(iovec) * old);
|
memcpy(buf, inline_buf, sizeof(iovec) * old);
|
||||||
@@ -87,7 +87,7 @@ struct osd_op_buf_list_t
|
|||||||
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -109,7 +109,7 @@ struct osd_op_buf_list_t
|
|||||||
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
buf = (iovec*)malloc(sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
memcpy(buf, inline_buf, sizeof(iovec)*old);
|
memcpy(buf, inline_buf, sizeof(iovec)*old);
|
||||||
@@ -120,7 +120,7 @@ struct osd_op_buf_list_t
|
|||||||
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
buf = (iovec*)realloc(buf, sizeof(iovec) * alloc);
|
||||||
if (!buf)
|
if (!buf)
|
||||||
{
|
{
|
||||||
printf("Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
fprintf(stderr, "Failed to allocate %lu bytes\n", sizeof(iovec) * alloc);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
521
src/msgr_rdma.cpp
Normal file
521
src/msgr_rdma.cpp
Normal file
@@ -0,0 +1,521 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "msgr_rdma.h"
|
||||||
|
#include "messenger.h"
|
||||||
|
|
||||||
|
std::string msgr_rdma_address_t::to_string()
|
||||||
|
{
|
||||||
|
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
|
||||||
|
sprintf(
|
||||||
|
msg, "%04x:%06x:%06x:%016lx%016lx", lid, qpn, psn,
|
||||||
|
htobe64(((uint64_t*)&gid)[0]), htobe64(((uint64_t*)&gid)[1])
|
||||||
|
);
|
||||||
|
return std::string(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool msgr_rdma_address_t::from_string(const char *str, msgr_rdma_address_t *dest)
|
||||||
|
{
|
||||||
|
uint64_t* gid = (uint64_t*)&dest->gid;
|
||||||
|
int n = sscanf(
|
||||||
|
str, "%hx:%x:%x:%16lx%16lx", &dest->lid, &dest->qpn, &dest->psn, gid, gid+1
|
||||||
|
);
|
||||||
|
gid[0] = be64toh(gid[0]);
|
||||||
|
gid[1] = be64toh(gid[1]);
|
||||||
|
return n == 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
msgr_rdma_context_t::~msgr_rdma_context_t()
|
||||||
|
{
|
||||||
|
if (cq)
|
||||||
|
ibv_destroy_cq(cq);
|
||||||
|
if (channel)
|
||||||
|
ibv_destroy_comp_channel(channel);
|
||||||
|
if (mr)
|
||||||
|
ibv_dereg_mr(mr);
|
||||||
|
if (pd)
|
||||||
|
ibv_dealloc_pd(pd);
|
||||||
|
if (context)
|
||||||
|
ibv_close_device(context);
|
||||||
|
}
|
||||||
|
|
||||||
|
msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
||||||
|
{
|
||||||
|
ctx->used_max_cqe -= max_send+max_recv;
|
||||||
|
if (qp)
|
||||||
|
ibv_destroy_qp(qp);
|
||||||
|
}
|
||||||
|
|
||||||
|
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu)
|
||||||
|
{
|
||||||
|
int res;
|
||||||
|
ibv_device **dev_list = NULL;
|
||||||
|
msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
|
||||||
|
ctx->mtu = mtu;
|
||||||
|
|
||||||
|
dev_list = ibv_get_device_list(NULL);
|
||||||
|
if (!dev_list)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
if (!ib_devname)
|
||||||
|
{
|
||||||
|
ctx->dev = *dev_list;
|
||||||
|
if (!ctx->dev)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "No RDMA devices found\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
for (i = 0; dev_list[i]; ++i)
|
||||||
|
if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname))
|
||||||
|
break;
|
||||||
|
ctx->dev = dev_list[i];
|
||||||
|
if (!ctx->dev)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "RDMA device %s not found\n", ib_devname);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->context = ibv_open_device(ctx->dev);
|
||||||
|
if (!ctx->context)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't get RDMA context for %s\n", ibv_get_device_name(ctx->dev));
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->ib_port = ib_port;
|
||||||
|
ctx->gid_index = gid_index;
|
||||||
|
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
ctx->my_lid = ctx->portinfo.lid;
|
||||||
|
if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && !ctx->my_lid)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->pd = ibv_alloc_pd(ctx->context);
|
||||||
|
if (!ctx->pd)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't allocate RDMA protection domain\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't query RDMA device for its features\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
|
||||||
|
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
|
||||||
|
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
|
||||||
|
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
|
||||||
|
if (!ctx->mr)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't register RDMA memory region\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->channel = ibv_create_comp_channel(ctx->context);
|
||||||
|
if (!ctx->channel)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't create RDMA completion channel\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->max_cqe = 4096;
|
||||||
|
ctx->cq = ibv_create_cq(ctx->context, ctx->max_cqe, NULL, ctx->channel, 0);
|
||||||
|
if (!ctx->cq)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't create RDMA completion queue\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dev_list)
|
||||||
|
ibv_free_device_list(dev_list);
|
||||||
|
return ctx;
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
delete ctx;
|
||||||
|
if (dev_list)
|
||||||
|
ibv_free_device_list(dev_list);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send,
|
||||||
|
uint32_t max_recv, uint32_t max_sge, uint32_t max_msg)
|
||||||
|
{
|
||||||
|
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
|
||||||
|
|
||||||
|
max_sge = max_sge > ctx->attrx.orig_attr.max_sge ? ctx->attrx.orig_attr.max_sge : max_sge;
|
||||||
|
|
||||||
|
conn->ctx = ctx;
|
||||||
|
conn->max_send = max_send;
|
||||||
|
conn->max_recv = max_recv;
|
||||||
|
conn->max_sge = max_sge;
|
||||||
|
conn->max_msg = max_msg;
|
||||||
|
|
||||||
|
ctx->used_max_cqe += max_send+max_recv;
|
||||||
|
if (ctx->used_max_cqe > ctx->max_cqe)
|
||||||
|
{
|
||||||
|
// Resize CQ
|
||||||
|
// Mellanox ConnectX-4 supports up to 4194303 CQEs, so it's fine to put everything into a single CQ
|
||||||
|
int new_max_cqe = ctx->max_cqe;
|
||||||
|
while (ctx->used_max_cqe > new_max_cqe)
|
||||||
|
{
|
||||||
|
new_max_cqe *= 2;
|
||||||
|
}
|
||||||
|
if (ibv_resize_cq(ctx->cq, new_max_cqe) != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't resize RDMA completion queue to %d entries\n", new_max_cqe);
|
||||||
|
delete conn;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
ctx->max_cqe = new_max_cqe;
|
||||||
|
}
|
||||||
|
|
||||||
|
ibv_qp_init_attr init_attr = {
|
||||||
|
.send_cq = ctx->cq,
|
||||||
|
.recv_cq = ctx->cq,
|
||||||
|
.cap = {
|
||||||
|
.max_send_wr = max_send,
|
||||||
|
.max_recv_wr = max_recv,
|
||||||
|
.max_send_sge = max_sge,
|
||||||
|
.max_recv_sge = max_sge,
|
||||||
|
},
|
||||||
|
.qp_type = IBV_QPT_RC,
|
||||||
|
};
|
||||||
|
conn->qp = ibv_create_qp(ctx->pd, &init_attr);
|
||||||
|
if (!conn->qp)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't create RDMA queue pair\n");
|
||||||
|
delete conn;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
conn->addr.lid = ctx->my_lid;
|
||||||
|
conn->addr.gid = ctx->my_gid;
|
||||||
|
conn->addr.qpn = conn->qp->qp_num;
|
||||||
|
conn->addr.psn = lrand48() & 0xffffff;
|
||||||
|
|
||||||
|
ibv_qp_attr attr = {
|
||||||
|
.qp_state = IBV_QPS_INIT,
|
||||||
|
.qp_access_flags = 0,
|
||||||
|
.pkey_index = 0,
|
||||||
|
.port_num = ctx->ib_port,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to switch RDMA queue pair to INIT state\n");
|
||||||
|
delete conn;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return conn;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
|
||||||
|
{
|
||||||
|
switch (mtu)
|
||||||
|
{
|
||||||
|
case 256: return IBV_MTU_256;
|
||||||
|
case 512: return IBV_MTU_512;
|
||||||
|
case 1024: return IBV_MTU_1024;
|
||||||
|
case 2048: return IBV_MTU_2048;
|
||||||
|
case 4096: return IBV_MTU_4096;
|
||||||
|
}
|
||||||
|
return IBV_MTU_4096;
|
||||||
|
}
|
||||||
|
|
||||||
|
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
|
||||||
|
{
|
||||||
|
auto conn = this;
|
||||||
|
ibv_qp_attr attr = {
|
||||||
|
.qp_state = IBV_QPS_RTR,
|
||||||
|
.path_mtu = mtu_to_ibv_mtu(conn->ctx->mtu),
|
||||||
|
.rq_psn = dest->psn,
|
||||||
|
.sq_psn = conn->addr.psn,
|
||||||
|
.dest_qp_num = dest->qpn,
|
||||||
|
.ah_attr = {
|
||||||
|
.grh = {
|
||||||
|
.dgid = dest->gid,
|
||||||
|
.sgid_index = conn->ctx->gid_index,
|
||||||
|
.hop_limit = 1, // FIXME can it vary?
|
||||||
|
},
|
||||||
|
.dlid = dest->lid,
|
||||||
|
.sl = 0, // service level
|
||||||
|
.src_path_bits = 0,
|
||||||
|
.is_global = (uint8_t)(dest->gid.global.interface_id ? 1 : 0),
|
||||||
|
.port_num = conn->ctx->ib_port,
|
||||||
|
},
|
||||||
|
.max_rd_atomic = 1,
|
||||||
|
.max_dest_rd_atomic = 1,
|
||||||
|
// Timeout and min_rnr_timer actual values seem to be 4.096us*2^(timeout+1)
|
||||||
|
.min_rnr_timer = 1,
|
||||||
|
.timeout = 14,
|
||||||
|
.retry_cnt = 7,
|
||||||
|
.rnr_retry = 7,
|
||||||
|
};
|
||||||
|
// FIXME No idea if ibv_modify_qp is a blocking operation or not. No idea if it has a timeout and what it is.
|
||||||
|
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
|
||||||
|
IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to switch RDMA queue pair to RTR (ready-to-receive) state\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
attr.qp_state = IBV_QPS_RTS;
|
||||||
|
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
|
||||||
|
IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to switch RDMA queue pair to RTS (ready-to-send) state\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_msg)
|
||||||
|
{
|
||||||
|
// Try to connect to the peer using RDMA
|
||||||
|
msgr_rdma_address_t addr;
|
||||||
|
if (msgr_rdma_address_t::from_string(rdma_address.c_str(), &addr))
|
||||||
|
{
|
||||||
|
if (client_max_msg > rdma_max_msg)
|
||||||
|
{
|
||||||
|
client_max_msg = rdma_max_msg;
|
||||||
|
}
|
||||||
|
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge, client_max_msg);
|
||||||
|
if (rdma_conn)
|
||||||
|
{
|
||||||
|
int r = rdma_conn->connect(&addr);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
delete rdma_conn;
|
||||||
|
fprintf(
|
||||||
|
stderr, "Failed to connect RDMA queue pair to %s (client %d)\n",
|
||||||
|
addr.to_string().c_str(), peer_fd
|
||||||
|
);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Remember connection, but switch to RDMA only after sending the configuration response
|
||||||
|
auto cl = clients.at(peer_fd);
|
||||||
|
cl->rdma_conn = rdma_conn;
|
||||||
|
cl->peer_state = PEER_RDMA_CONNECTING;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||||
|
{
|
||||||
|
ibv_send_wr *bad_wr = NULL;
|
||||||
|
ibv_send_wr wr = {
|
||||||
|
.wr_id = (uint64_t)(cl->peer_fd*2+1),
|
||||||
|
.sg_list = sge,
|
||||||
|
.num_sge = op_sge,
|
||||||
|
.opcode = IBV_WR_SEND,
|
||||||
|
.send_flags = IBV_SEND_SIGNALED,
|
||||||
|
};
|
||||||
|
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||||
|
if (err || bad_wr)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "RDMA send failed: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
cl->rdma_conn->cur_send++;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||||
|
{
|
||||||
|
auto rc = cl->rdma_conn;
|
||||||
|
if (!cl->send_list.size() || rc->cur_send > 0)
|
||||||
|
{
|
||||||
|
// Only send one batch at a time
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
uint64_t op_size = 0, op_sge = 0;
|
||||||
|
ibv_sge sge[rc->max_sge];
|
||||||
|
while (rc->send_pos < cl->send_list.size())
|
||||||
|
{
|
||||||
|
iovec & iov = cl->send_list[rc->send_pos];
|
||||||
|
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
|
||||||
|
{
|
||||||
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
|
op_sge = 0;
|
||||||
|
op_size = 0;
|
||||||
|
if (rc->cur_send >= rc->max_send)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg
|
||||||
|
? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size);
|
||||||
|
sge[op_sge++] = {
|
||||||
|
.addr = (uintptr_t)(iov.iov_base+rc->send_buf_pos),
|
||||||
|
.length = len,
|
||||||
|
.lkey = rc->ctx->mr->lkey,
|
||||||
|
};
|
||||||
|
op_size += len;
|
||||||
|
rc->send_buf_pos += len;
|
||||||
|
if (rc->send_buf_pos >= iov.iov_len)
|
||||||
|
{
|
||||||
|
rc->send_pos++;
|
||||||
|
rc->send_buf_pos = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (op_sge > 0)
|
||||||
|
{
|
||||||
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||||
|
{
|
||||||
|
ibv_recv_wr *bad_wr = NULL;
|
||||||
|
ibv_recv_wr wr = {
|
||||||
|
.wr_id = (uint64_t)(cl->peer_fd*2),
|
||||||
|
.sg_list = sge,
|
||||||
|
.num_sge = op_sge,
|
||||||
|
};
|
||||||
|
int err = ibv_post_recv(cl->rdma_conn->qp, &wr, &bad_wr);
|
||||||
|
if (err || bad_wr)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "RDMA receive failed: %s\n", strerror(err));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
cl->rdma_conn->cur_recv++;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||||
|
{
|
||||||
|
auto rc = cl->rdma_conn;
|
||||||
|
while (rc->cur_recv < rc->max_recv)
|
||||||
|
{
|
||||||
|
void *buf = malloc_or_die(rc->max_msg);
|
||||||
|
rc->recv_buffers.push_back(buf);
|
||||||
|
ibv_sge sge = {
|
||||||
|
.addr = (uintptr_t)buf,
|
||||||
|
.length = (uint32_t)rc->max_msg,
|
||||||
|
.lkey = rc->ctx->mr->lkey,
|
||||||
|
};
|
||||||
|
try_recv_rdma_wr(cl, &sge, 1);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define RDMA_EVENTS_AT_ONCE 32
|
||||||
|
|
||||||
|
void osd_messenger_t::handle_rdma_events()
|
||||||
|
{
|
||||||
|
// Request next notification
|
||||||
|
ibv_cq *ev_cq;
|
||||||
|
void *ev_ctx;
|
||||||
|
// FIXME: This is inefficient as it calls read()...
|
||||||
|
if (ibv_get_cq_event(rdma_context->channel, &ev_cq, &ev_ctx) == 0)
|
||||||
|
{
|
||||||
|
ibv_ack_cq_events(rdma_context->cq, 1);
|
||||||
|
}
|
||||||
|
if (ibv_req_notify_cq(rdma_context->cq, 0) != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to request RDMA completion notification, exiting\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
ibv_wc wc[RDMA_EVENTS_AT_ONCE];
|
||||||
|
int event_count;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
event_count = ibv_poll_cq(rdma_context->cq, RDMA_EVENTS_AT_ONCE, wc);
|
||||||
|
for (int i = 0; i < event_count; i++)
|
||||||
|
{
|
||||||
|
int client_id = wc[i].wr_id >> 1;
|
||||||
|
bool is_send = wc[i].wr_id & 1;
|
||||||
|
auto cl_it = clients.find(client_id);
|
||||||
|
if (cl_it == clients.end())
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
osd_client_t *cl = cl_it->second;
|
||||||
|
if (wc[i].status != IBV_WC_SUCCESS)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "RDMA work request failed for client %d", client_id);
|
||||||
|
if (cl->osd_num)
|
||||||
|
{
|
||||||
|
fprintf(stderr, " (OSD %lu)", cl->osd_num);
|
||||||
|
}
|
||||||
|
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
|
||||||
|
stop_client(client_id);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!is_send)
|
||||||
|
{
|
||||||
|
cl->rdma_conn->cur_recv--;
|
||||||
|
handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len);
|
||||||
|
free(cl->rdma_conn->recv_buffers[0]);
|
||||||
|
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
|
||||||
|
try_recv_rdma(cl);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cl->rdma_conn->cur_send--;
|
||||||
|
if (!cl->rdma_conn->cur_send)
|
||||||
|
{
|
||||||
|
// Wait for the whole batch
|
||||||
|
for (int i = 0; i < cl->rdma_conn->send_pos; i++)
|
||||||
|
{
|
||||||
|
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
|
||||||
|
{
|
||||||
|
// Reply fully sent
|
||||||
|
delete cl->outbox[i].op;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cl->rdma_conn->send_pos > 0)
|
||||||
|
{
|
||||||
|
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+cl->rdma_conn->send_pos);
|
||||||
|
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+cl->rdma_conn->send_pos);
|
||||||
|
cl->rdma_conn->send_pos = 0;
|
||||||
|
}
|
||||||
|
if (cl->rdma_conn->send_buf_pos > 0)
|
||||||
|
{
|
||||||
|
cl->send_list[0].iov_base += cl->rdma_conn->send_buf_pos;
|
||||||
|
cl->send_list[0].iov_len -= cl->rdma_conn->send_buf_pos;
|
||||||
|
cl->rdma_conn->send_buf_pos = 0;
|
||||||
|
}
|
||||||
|
try_send_rdma(cl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (event_count > 0);
|
||||||
|
for (auto cb: set_immediate)
|
||||||
|
{
|
||||||
|
cb();
|
||||||
|
}
|
||||||
|
set_immediate.clear();
|
||||||
|
}
|
58
src/msgr_rdma.h
Normal file
58
src/msgr_rdma.h
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#include <infiniband/verbs.h>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
struct msgr_rdma_address_t
|
||||||
|
{
|
||||||
|
ibv_gid gid;
|
||||||
|
uint16_t lid;
|
||||||
|
uint32_t qpn;
|
||||||
|
uint32_t psn;
|
||||||
|
|
||||||
|
std::string to_string();
|
||||||
|
static bool from_string(const char *str, msgr_rdma_address_t *dest);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct msgr_rdma_context_t
|
||||||
|
{
|
||||||
|
ibv_context *context = NULL;
|
||||||
|
ibv_device *dev = NULL;
|
||||||
|
ibv_device_attr_ex attrx;
|
||||||
|
ibv_pd *pd = NULL;
|
||||||
|
ibv_mr *mr = NULL;
|
||||||
|
ibv_comp_channel *channel = NULL;
|
||||||
|
ibv_cq *cq = NULL;
|
||||||
|
ibv_port_attr portinfo;
|
||||||
|
uint8_t ib_port;
|
||||||
|
uint8_t gid_index;
|
||||||
|
uint16_t my_lid;
|
||||||
|
ibv_gid my_gid;
|
||||||
|
uint32_t mtu;
|
||||||
|
int max_cqe = 0;
|
||||||
|
int used_max_cqe = 0;
|
||||||
|
|
||||||
|
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu);
|
||||||
|
~msgr_rdma_context_t();
|
||||||
|
};
|
||||||
|
|
||||||
|
struct msgr_rdma_connection_t
|
||||||
|
{
|
||||||
|
msgr_rdma_context_t *ctx = NULL;
|
||||||
|
ibv_qp *qp = NULL;
|
||||||
|
msgr_rdma_address_t addr;
|
||||||
|
int max_send = 0, max_recv = 0, max_sge = 0;
|
||||||
|
int cur_send = 0, cur_recv = 0;
|
||||||
|
uint64_t max_msg = 0;
|
||||||
|
|
||||||
|
int send_pos = 0, send_buf_pos = 0;
|
||||||
|
int recv_pos = 0, recv_buf_pos = 0;
|
||||||
|
std::vector<void*> recv_buffers;
|
||||||
|
|
||||||
|
~msgr_rdma_connection_t();
|
||||||
|
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
|
||||||
|
int connect(msgr_rdma_address_t *dest);
|
||||||
|
};
|
@@ -72,7 +72,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
|||||||
// this is a client socket, so don't panic on error. just disconnect it
|
// this is a client socket, so don't panic on error. just disconnect it
|
||||||
if (result != 0)
|
if (result != 0)
|
||||||
{
|
{
|
||||||
printf("Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
fprintf(stderr, "Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||||
}
|
}
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
return false;
|
return false;
|
||||||
@@ -91,48 +91,9 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
|||||||
{
|
{
|
||||||
if (cl->read_iov.iov_base == cl->in_buf)
|
if (cl->read_iov.iov_base == cl->in_buf)
|
||||||
{
|
{
|
||||||
// Compose operation(s) from the buffer
|
if (!handle_read_buffer(cl, cl->in_buf, result))
|
||||||
int remain = result;
|
|
||||||
void *curbuf = cl->in_buf;
|
|
||||||
while (remain > 0)
|
|
||||||
{
|
{
|
||||||
if (!cl->read_op)
|
goto fin;
|
||||||
{
|
|
||||||
cl->read_op = new osd_op_t;
|
|
||||||
cl->read_op->peer_fd = cl->peer_fd;
|
|
||||||
cl->read_op->op_type = OSD_OP_IN;
|
|
||||||
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
|
|
||||||
cl->read_remaining = OSD_PACKET_SIZE;
|
|
||||||
cl->read_state = CL_READ_HDR;
|
|
||||||
}
|
|
||||||
while (cl->recv_list.done < cl->recv_list.count && remain > 0)
|
|
||||||
{
|
|
||||||
iovec* cur = cl->recv_list.get_iovec();
|
|
||||||
if (cur->iov_len > remain)
|
|
||||||
{
|
|
||||||
memcpy(cur->iov_base, curbuf, remain);
|
|
||||||
cl->read_remaining -= remain;
|
|
||||||
cur->iov_len -= remain;
|
|
||||||
cur->iov_base += remain;
|
|
||||||
remain = 0;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
memcpy(cur->iov_base, curbuf, cur->iov_len);
|
|
||||||
curbuf += cur->iov_len;
|
|
||||||
cl->read_remaining -= cur->iov_len;
|
|
||||||
remain -= cur->iov_len;
|
|
||||||
cur->iov_len = 0;
|
|
||||||
cl->recv_list.done++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (cl->recv_list.done >= cl->recv_list.count)
|
|
||||||
{
|
|
||||||
if (!handle_finished_read(cl))
|
|
||||||
{
|
|
||||||
goto fin;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -159,6 +120,52 @@ fin:
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
|
||||||
|
{
|
||||||
|
// Compose operation(s) from the buffer
|
||||||
|
while (remain > 0)
|
||||||
|
{
|
||||||
|
if (!cl->read_op)
|
||||||
|
{
|
||||||
|
cl->read_op = new osd_op_t;
|
||||||
|
cl->read_op->peer_fd = cl->peer_fd;
|
||||||
|
cl->read_op->op_type = OSD_OP_IN;
|
||||||
|
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
|
||||||
|
cl->read_remaining = OSD_PACKET_SIZE;
|
||||||
|
cl->read_state = CL_READ_HDR;
|
||||||
|
}
|
||||||
|
while (cl->recv_list.done < cl->recv_list.count && remain > 0)
|
||||||
|
{
|
||||||
|
iovec* cur = cl->recv_list.get_iovec();
|
||||||
|
if (cur->iov_len > remain)
|
||||||
|
{
|
||||||
|
memcpy(cur->iov_base, curbuf, remain);
|
||||||
|
cl->read_remaining -= remain;
|
||||||
|
cur->iov_len -= remain;
|
||||||
|
cur->iov_base += remain;
|
||||||
|
remain = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
memcpy(cur->iov_base, curbuf, cur->iov_len);
|
||||||
|
curbuf += cur->iov_len;
|
||||||
|
cl->read_remaining -= cur->iov_len;
|
||||||
|
remain -= cur->iov_len;
|
||||||
|
cur->iov_len = 0;
|
||||||
|
cl->recv_list.done++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cl->recv_list.done >= cl->recv_list.count)
|
||||||
|
{
|
||||||
|
if (!handle_finished_read(cl))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
cl->recv_list.reset();
|
cl->recv_list.reset();
|
||||||
@@ -170,7 +177,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
|||||||
handle_op_hdr(cl);
|
handle_op_hdr(cl);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("Received garbage: magic=%lx id=%lu opcode=%lx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
fprintf(stderr, "Received garbage: magic=%lx id=%lu opcode=%lx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -254,6 +261,16 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
|||||||
}
|
}
|
||||||
cl->read_remaining = cur_op->req.rw.len;
|
cl->read_remaining = cur_op->req.rw.len;
|
||||||
}
|
}
|
||||||
|
else if (cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
|
||||||
|
{
|
||||||
|
if (cur_op->req.show_conf.json_len > 0)
|
||||||
|
{
|
||||||
|
cur_op->buf = malloc_or_die(cur_op->req.show_conf.json_len+1);
|
||||||
|
((uint8_t*)cur_op->buf)[cur_op->req.show_conf.json_len] = 0;
|
||||||
|
cl->recv_list.push_back(cur_op->buf, cur_op->req.show_conf.json_len);
|
||||||
|
}
|
||||||
|
cl->read_remaining = cur_op->req.show_conf.json_len;
|
||||||
|
}
|
||||||
if (cl->read_remaining > 0)
|
if (cl->read_remaining > 0)
|
||||||
{
|
{
|
||||||
// Read data
|
// Read data
|
||||||
@@ -275,7 +292,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
|||||||
if (req_it == cl->sent_ops.end())
|
if (req_it == cl->sent_ops.end())
|
||||||
{
|
{
|
||||||
// Command out of sync. Drop connection
|
// Command out of sync. Drop connection
|
||||||
printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cl->read_op->req.hdr.id);
|
fprintf(stderr, "Client %d command out of sync: id %lu\n", cl->peer_fd, cl->read_op->req.hdr.id);
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -290,7 +307,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
|||||||
if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
|
if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
|
||||||
{
|
{
|
||||||
// Check reply length to not overflow the buffer
|
// Check reply length to not overflow the buffer
|
||||||
printf("Client %d read reply of different length: expected %u+%u, got %ld+%u\n",
|
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %ld+%u\n",
|
||||||
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
|
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
|
||||||
cl->sent_ops[op->req.hdr.id] = op;
|
cl->sent_ops[op->req.hdr.id] = op;
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
@@ -338,11 +355,11 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
|||||||
}
|
}
|
||||||
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
|
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
|
||||||
{
|
{
|
||||||
assert(!op->iov.count);
|
|
||||||
delete cl->read_op;
|
delete cl->read_op;
|
||||||
cl->read_op = op;
|
cl->read_op = op;
|
||||||
cl->read_state = CL_READ_REPLY_DATA;
|
cl->read_state = CL_READ_REPLY_DATA;
|
||||||
cl->read_remaining = op->reply.hdr.retval;
|
cl->read_remaining = op->reply.hdr.retval;
|
||||||
|
free(op->buf);
|
||||||
op->buf = malloc_or_die(op->reply.hdr.retval);
|
op->buf = malloc_or_die(op->reply.hdr.retval);
|
||||||
cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
|
cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
|
||||||
}
|
}
|
||||||
|
@@ -46,7 +46,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||||||
to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
|
to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
|
||||||
cl->sent_ops[cur_op->req.hdr.id] = cur_op;
|
cl->sent_ops[cur_op->req.hdr.id] = cur_op;
|
||||||
}
|
}
|
||||||
to_outbox.push_back(NULL);
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
|
||||||
// Bitmap
|
// Bitmap
|
||||||
if (cur_op->op_type == OSD_OP_IN &&
|
if (cur_op->op_type == OSD_OP_IN &&
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SEC_READ &&
|
cur_op->req.hdr.opcode == OSD_OP_SEC_READ &&
|
||||||
@@ -56,7 +56,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||||||
.iov_base = cur_op->bitmap,
|
.iov_base = cur_op->bitmap,
|
||||||
.iov_len = cur_op->reply.sec_rw.attr_len,
|
.iov_len = cur_op->reply.sec_rw.attr_len,
|
||||||
});
|
});
|
||||||
to_outbox.push_back(NULL);
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||||
}
|
}
|
||||||
else if (cur_op->op_type == OSD_OP_OUT &&
|
else if (cur_op->op_type == OSD_OP_OUT &&
|
||||||
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
|
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
|
||||||
@@ -66,7 +66,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||||||
.iov_base = cur_op->bitmap,
|
.iov_base = cur_op->bitmap,
|
||||||
.iov_len = cur_op->req.sec_rw.attr_len,
|
.iov_len = cur_op->req.sec_rw.attr_len,
|
||||||
});
|
});
|
||||||
to_outbox.push_back(NULL);
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||||
}
|
}
|
||||||
// Operation data
|
// Operation data
|
||||||
if ((cur_op->op_type == OSD_OP_IN
|
if ((cur_op->op_type == OSD_OP_IN
|
||||||
@@ -78,13 +78,14 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
|
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
|
cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
|
||||||
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
|
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
|
||||||
|
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)) && cur_op->iov.count > 0)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < cur_op->iov.count; i++)
|
for (int i = 0; i < cur_op->iov.count; i++)
|
||||||
{
|
{
|
||||||
assert(cur_op->iov.buf[i].iov_base);
|
assert(cur_op->iov.buf[i].iov_base);
|
||||||
to_send_list.push_back(cur_op->iov.buf[i]);
|
to_send_list.push_back(cur_op->iov.buf[i]);
|
||||||
to_outbox.push_back(NULL);
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||||
@@ -93,13 +94,19 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||||||
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
|
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
|
||||||
else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
|
else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
|
||||||
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
|
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
|
||||||
to_outbox.push_back(NULL);
|
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||||
}
|
}
|
||||||
if (cur_op->op_type == OSD_OP_IN)
|
if (cur_op->op_type == OSD_OP_IN)
|
||||||
{
|
{
|
||||||
// To free it later
|
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_FREE;
|
||||||
to_outbox[to_outbox.size()-1] = cur_op;
|
|
||||||
}
|
}
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (cl->peer_state == PEER_RDMA)
|
||||||
|
{
|
||||||
|
try_send_rdma(cl);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
if (!ringloop)
|
if (!ringloop)
|
||||||
{
|
{
|
||||||
// FIXME: It's worse because it doesn't allow batching
|
// FIXME: It's worse because it doesn't allow batching
|
||||||
@@ -220,7 +227,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|||||||
if (result < 0 && result != -EAGAIN)
|
if (result < 0 && result != -EAGAIN)
|
||||||
{
|
{
|
||||||
// this is a client socket, so don't panic. just disconnect it
|
// this is a client socket, so don't panic. just disconnect it
|
||||||
printf("Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||||
stop_client(cl->peer_fd);
|
stop_client(cl->peer_fd);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -232,10 +239,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|||||||
iovec & iov = cl->send_list[done];
|
iovec & iov = cl->send_list[done];
|
||||||
if (iov.iov_len <= result)
|
if (iov.iov_len <= result)
|
||||||
{
|
{
|
||||||
if (cl->outbox[done])
|
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
||||||
{
|
{
|
||||||
// Reply fully sent
|
// Reply fully sent
|
||||||
delete cl->outbox[done];
|
delete cl->outbox[done].op;
|
||||||
}
|
}
|
||||||
result -= iov.iov_len;
|
result -= iov.iov_len;
|
||||||
done++;
|
done++;
|
||||||
@@ -260,6 +267,21 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|||||||
cl->next_outbox.clear();
|
cl->next_outbox.clear();
|
||||||
}
|
}
|
||||||
cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
|
cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (cl->rdma_conn && !cl->outbox.size() && cl->peer_state == PEER_RDMA_CONNECTING)
|
||||||
|
{
|
||||||
|
// FIXME: Do something better than just forgetting the FD
|
||||||
|
// FIXME: Ignore pings during RDMA state transition
|
||||||
|
if (log_level > 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
|
||||||
|
}
|
||||||
|
cl->peer_state = PEER_RDMA;
|
||||||
|
tfd->set_fd_handler(cl->peer_fd, false, NULL);
|
||||||
|
// Add the initial receive request
|
||||||
|
try_recv_rdma(cl);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
if (cl->write_state != 0)
|
if (cl->write_state != 0)
|
||||||
{
|
{
|
||||||
|
@@ -58,11 +58,11 @@ void osd_messenger_t::stop_client(int peer_fd, bool force)
|
|||||||
{
|
{
|
||||||
if (cl->osd_num)
|
if (cl->osd_num)
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
|
fprintf(stderr, "[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl->osd_num);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
fprintf(stderr, "[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
// First set state to STOPPED so another stop_client() call doesn't try to free it again
|
||||||
@@ -122,6 +122,12 @@ void osd_messenger_t::stop_client(int peer_fd, bool force)
|
|||||||
// And close the FD only when everything is done
|
// And close the FD only when everything is done
|
||||||
// ...because peer_fd number can get reused after close()
|
// ...because peer_fd number can get reused after close()
|
||||||
close(peer_fd);
|
close(peer_fd);
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (cl->rdma_conn)
|
||||||
|
{
|
||||||
|
delete cl->rdma_conn;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
// Find the item again because it can be invalidated at this point
|
// Find the item again because it can be invalidated at this point
|
||||||
it = clients.find(peer_fd);
|
it = clients.find(peer_fd);
|
||||||
|
@@ -10,6 +10,7 @@
|
|||||||
#include <netinet/tcp.h>
|
#include <netinet/tcp.h>
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
#include <sys/un.h>
|
#include <sys/un.h>
|
||||||
|
#include <sys/epoll.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
@@ -26,7 +27,10 @@ const char *exe_name = NULL;
|
|||||||
class nbd_proxy
|
class nbd_proxy
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
|
std::string image_name;
|
||||||
uint64_t inode = 0;
|
uint64_t inode = 0;
|
||||||
|
uint64_t device_size = 0;
|
||||||
|
inode_watch_t *watch = NULL;
|
||||||
|
|
||||||
ring_loop_t *ringloop = NULL;
|
ring_loop_t *ringloop = NULL;
|
||||||
epoll_manager_t *epmgr = NULL;
|
epoll_manager_t *epmgr = NULL;
|
||||||
@@ -111,9 +115,9 @@ public:
|
|||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"Vitastor NBD proxy\n"
|
"Vitastor NBD proxy\n"
|
||||||
"(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
|
"(c) Vitaliy Filippov, 2020-2021 (VNPL-1.1)\n\n"
|
||||||
"USAGE:\n"
|
"USAGE:\n"
|
||||||
" %s map --etcd_address <etcd_address> --pool <pool> --inode <inode> --size <size in bytes>\n"
|
" %s map [--etcd_address <etcd_address>] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
|
||||||
" %s unmap /dev/nbd0\n"
|
" %s unmap /dev/nbd0\n"
|
||||||
" %s list [--json]\n",
|
" %s list [--json]\n",
|
||||||
exe_name, exe_name, exe_name
|
exe_name, exe_name, exe_name
|
||||||
@@ -143,26 +147,49 @@ public:
|
|||||||
void start(json11::Json cfg)
|
void start(json11::Json cfg)
|
||||||
{
|
{
|
||||||
// Check options
|
// Check options
|
||||||
if (cfg["etcd_address"].string_value() == "")
|
if (cfg["image"].string_value() != "")
|
||||||
{
|
{
|
||||||
fprintf(stderr, "etcd_address is missing\n");
|
// Use image name
|
||||||
exit(1);
|
image_name = cfg["image"].string_value();
|
||||||
|
inode = 0;
|
||||||
}
|
}
|
||||||
if (!cfg["size"].uint64_value())
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr, "device size is missing\n");
|
// Use pool, inode number and size
|
||||||
exit(1);
|
if (!cfg["size"].uint64_value())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "device size is missing\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
device_size = cfg["size"].uint64_value();
|
||||||
|
inode = cfg["inode"].uint64_value();
|
||||||
|
uint64_t pool = cfg["pool"].uint64_value();
|
||||||
|
if (pool)
|
||||||
|
{
|
||||||
|
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
|
||||||
|
}
|
||||||
|
if (!(inode >> (64-POOL_ID_BITS)))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "pool is missing\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
inode = cfg["inode"].uint64_value();
|
// Create client
|
||||||
uint64_t pool = cfg["pool"].uint64_value();
|
ringloop = new ring_loop_t(512);
|
||||||
if (pool)
|
epmgr = new epoll_manager_t(ringloop);
|
||||||
|
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
||||||
|
if (!inode)
|
||||||
{
|
{
|
||||||
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
|
// Load image metadata
|
||||||
}
|
while (!cli->is_ready())
|
||||||
if (!(inode >> (64-POOL_ID_BITS)))
|
{
|
||||||
{
|
ringloop->loop();
|
||||||
fprintf(stderr, "pool is missing\n");
|
if (cli->is_ready())
|
||||||
exit(1);
|
break;
|
||||||
|
ringloop->wait();
|
||||||
|
}
|
||||||
|
watch = cli->st_cli.watch_inode(image_name);
|
||||||
|
device_size = watch->cfg.size;
|
||||||
}
|
}
|
||||||
// Initialize NBD
|
// Initialize NBD
|
||||||
int sockfd[2];
|
int sockfd[2];
|
||||||
@@ -174,9 +201,10 @@ public:
|
|||||||
fcntl(sockfd[0], F_SETFL, fcntl(sockfd[0], F_GETFL, 0) | O_NONBLOCK);
|
fcntl(sockfd[0], F_SETFL, fcntl(sockfd[0], F_GETFL, 0) | O_NONBLOCK);
|
||||||
nbd_fd = sockfd[0];
|
nbd_fd = sockfd[0];
|
||||||
load_module();
|
load_module();
|
||||||
|
bool bg = cfg["foreground"].is_null();
|
||||||
if (!cfg["dev_num"].is_null())
|
if (!cfg["dev_num"].is_null())
|
||||||
{
|
{
|
||||||
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), cfg["size"].uint64_value(), NBD_FLAG_SEND_FLUSH, 30) < 0)
|
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, 30, bg) < 0)
|
||||||
{
|
{
|
||||||
perror("run_nbd");
|
perror("run_nbd");
|
||||||
exit(1);
|
exit(1);
|
||||||
@@ -188,7 +216,7 @@ public:
|
|||||||
int i = 0;
|
int i = 0;
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
int r = run_nbd(sockfd, i, cfg["size"].uint64_value(), NBD_FLAG_SEND_FLUSH, 30);
|
int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, 30, bg);
|
||||||
if (r == 0)
|
if (r == 0)
|
||||||
{
|
{
|
||||||
printf("/dev/nbd%d\n", i);
|
printf("/dev/nbd%d\n", i);
|
||||||
@@ -211,14 +239,10 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (cfg["foreground"].is_null())
|
if (bg)
|
||||||
{
|
{
|
||||||
daemonize();
|
daemonize();
|
||||||
}
|
}
|
||||||
// Create client
|
|
||||||
ringloop = new ring_loop_t(512);
|
|
||||||
epmgr = new epoll_manager_t(ringloop);
|
|
||||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
|
||||||
// Initialize read state
|
// Initialize read state
|
||||||
read_state = CL_READ_HDR;
|
read_state = CL_READ_HDR;
|
||||||
recv_buf = malloc_or_die(receive_buffer_size);
|
recv_buf = malloc_or_die(receive_buffer_size);
|
||||||
@@ -232,21 +256,47 @@ public:
|
|||||||
};
|
};
|
||||||
ringloop->register_consumer(&consumer);
|
ringloop->register_consumer(&consumer);
|
||||||
// Add FD to epoll
|
// Add FD to epoll
|
||||||
epmgr->tfd->set_fd_handler(sockfd[0], false, [this](int peer_fd, int epoll_events)
|
bool stop = false;
|
||||||
|
epmgr->tfd->set_fd_handler(sockfd[0], false, [this, &stop](int peer_fd, int epoll_events)
|
||||||
{
|
{
|
||||||
read_ready++;
|
if (epoll_events & EPOLLRDHUP)
|
||||||
submit_read();
|
{
|
||||||
|
close(peer_fd);
|
||||||
|
stop = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
read_ready++;
|
||||||
|
submit_read();
|
||||||
|
}
|
||||||
});
|
});
|
||||||
while (1)
|
while (!stop)
|
||||||
{
|
{
|
||||||
ringloop->loop();
|
ringloop->loop();
|
||||||
ringloop->wait();
|
ringloop->wait();
|
||||||
}
|
}
|
||||||
|
stop = false;
|
||||||
|
cluster_op_t *close_sync = new cluster_op_t;
|
||||||
|
close_sync->opcode = OSD_OP_SYNC;
|
||||||
|
close_sync->callback = [this, &stop](cluster_op_t *op)
|
||||||
|
{
|
||||||
|
stop = true;
|
||||||
|
delete op;
|
||||||
|
};
|
||||||
|
cli->execute(close_sync);
|
||||||
|
while (!stop)
|
||||||
|
{
|
||||||
|
ringloop->loop();
|
||||||
|
ringloop->wait();
|
||||||
|
}
|
||||||
|
delete cli;
|
||||||
|
delete epmgr;
|
||||||
|
delete ringloop;
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_module()
|
void load_module()
|
||||||
{
|
{
|
||||||
if (access("/sys/module/nbd", F_OK))
|
if (access("/sys/module/nbd", F_OK) == 0)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -388,7 +438,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
int run_nbd(int sockfd[2], int dev_num, uint64_t size, uint64_t flags, unsigned timeout)
|
int run_nbd(int sockfd[2], int dev_num, uint64_t size, uint64_t flags, unsigned timeout, bool bg)
|
||||||
{
|
{
|
||||||
// Check handle size
|
// Check handle size
|
||||||
assert(sizeof(cur_req.handle) == 8);
|
assert(sizeof(cur_req.handle) == 8);
|
||||||
@@ -436,11 +486,14 @@ protected:
|
|||||||
{
|
{
|
||||||
// Run in child
|
// Run in child
|
||||||
close(sockfd[0]);
|
close(sockfd[0]);
|
||||||
|
if (bg)
|
||||||
|
{
|
||||||
|
daemonize();
|
||||||
|
}
|
||||||
r = ioctl(nbd, NBD_DO_IT);
|
r = ioctl(nbd, NBD_DO_IT);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "NBD device terminated with error: %s\n", strerror(errno));
|
fprintf(stderr, "NBD device terminated with error: %s\n", strerror(errno));
|
||||||
kill(getppid(), SIGTERM);
|
|
||||||
}
|
}
|
||||||
close(sockfd[1]);
|
close(sockfd[1]);
|
||||||
ioctl(nbd, NBD_CLEAR_QUE);
|
ioctl(nbd, NBD_CLEAR_QUE);
|
||||||
@@ -610,7 +663,7 @@ protected:
|
|||||||
if (req_type == NBD_CMD_READ || req_type == NBD_CMD_WRITE)
|
if (req_type == NBD_CMD_READ || req_type == NBD_CMD_WRITE)
|
||||||
{
|
{
|
||||||
op->opcode = req_type == NBD_CMD_READ ? OSD_OP_READ : OSD_OP_WRITE;
|
op->opcode = req_type == NBD_CMD_READ ? OSD_OP_READ : OSD_OP_WRITE;
|
||||||
op->inode = inode;
|
op->inode = inode ? inode : watch->cfg.num;
|
||||||
op->offset = be64toh(cur_req.from);
|
op->offset = be64toh(cur_req.from);
|
||||||
op->len = be32toh(cur_req.len);
|
op->len = be32toh(cur_req.len);
|
||||||
buf = malloc_or_die(sizeof(nbd_reply) + op->len);
|
buf = malloc_or_die(sizeof(nbd_reply) + op->len);
|
||||||
@@ -657,7 +710,15 @@ protected:
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
cli->execute(cur_op);
|
if (cur_op->opcode == OSD_OP_WRITE && watch->cfg.readonly)
|
||||||
|
{
|
||||||
|
cur_op->retval = -EROFS;
|
||||||
|
std::function<void(cluster_op_t*)>(cur_op->callback)(cur_op);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cli->execute(cur_op);
|
||||||
|
}
|
||||||
cur_op = NULL;
|
cur_op = NULL;
|
||||||
cur_buf = &cur_req;
|
cur_buf = &cur_req;
|
||||||
cur_left = sizeof(nbd_request);
|
cur_left = sizeof(nbd_request);
|
||||||
|
143
src/osd.cpp
143
src/osd.cpp
@@ -10,31 +10,39 @@
|
|||||||
#include "osd.h"
|
#include "osd.h"
|
||||||
#include "http_client.h"
|
#include "http_client.h"
|
||||||
|
|
||||||
osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
|
static blockstore_config_t json_to_bs(const json11::Json::object & config)
|
||||||
{
|
{
|
||||||
bs_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
|
blockstore_config_t bs;
|
||||||
bs_bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
|
for (auto kv: config)
|
||||||
if (!bs_block_size)
|
{
|
||||||
bs_block_size = DEFAULT_BLOCK_SIZE;
|
if (kv.second.is_string())
|
||||||
if (!bs_bitmap_granularity)
|
bs[kv.first] = kv.second.string_value();
|
||||||
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
else
|
||||||
clean_entry_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
|
bs[kv.first] = kv.second.dump();
|
||||||
|
}
|
||||||
|
return bs;
|
||||||
|
}
|
||||||
|
|
||||||
|
osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
||||||
|
{
|
||||||
zero_buffer_size = 1<<20;
|
zero_buffer_size = 1<<20;
|
||||||
zero_buffer = malloc_or_die(zero_buffer_size);
|
zero_buffer = malloc_or_die(zero_buffer_size);
|
||||||
memset(zero_buffer, 0, zero_buffer_size);
|
memset(zero_buffer, 0, zero_buffer_size);
|
||||||
|
|
||||||
this->config = config;
|
|
||||||
this->ringloop = ringloop;
|
this->ringloop = ringloop;
|
||||||
|
|
||||||
|
this->config = msgr.read_config(config).object_items();
|
||||||
|
if (this->config.find("log_level") == this->config.end())
|
||||||
|
this->config["log_level"] = 1;
|
||||||
|
parse_config(this->config);
|
||||||
|
|
||||||
epmgr = new epoll_manager_t(ringloop);
|
epmgr = new epoll_manager_t(ringloop);
|
||||||
// FIXME: Use timerfd_interval based directly on io_uring
|
// FIXME: Use timerfd_interval based directly on io_uring
|
||||||
this->tfd = epmgr->tfd;
|
this->tfd = epmgr->tfd;
|
||||||
|
|
||||||
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
|
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
|
||||||
this->bs = new blockstore_t(config, ringloop, tfd);
|
auto bs_cfg = json_to_bs(this->config);
|
||||||
|
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||||
parse_config(config);
|
|
||||||
|
|
||||||
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
@@ -45,11 +53,11 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
|
|||||||
print_slow();
|
print_slow();
|
||||||
});
|
});
|
||||||
|
|
||||||
c_cli.tfd = this->tfd;
|
msgr.tfd = this->tfd;
|
||||||
c_cli.ringloop = this->ringloop;
|
msgr.ringloop = this->ringloop;
|
||||||
c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
|
msgr.exec_op = [this](osd_op_t *op) { exec_op(op); };
|
||||||
c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
|
msgr.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
|
||||||
c_cli.init();
|
msgr.init();
|
||||||
|
|
||||||
init_cluster();
|
init_cluster();
|
||||||
|
|
||||||
@@ -66,62 +74,71 @@ osd_t::~osd_t()
|
|||||||
free(zero_buffer);
|
free(zero_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::parse_config(blockstore_config_t & config)
|
void osd_t::parse_config(const json11::Json & config)
|
||||||
{
|
{
|
||||||
if (config.find("log_level") == config.end())
|
st_cli.parse_config(config);
|
||||||
config["log_level"] = "1";
|
msgr.parse_config(config);
|
||||||
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
// OSD number
|
||||||
// Initial startup configuration
|
osd_num = config["osd_num"].uint64_value();
|
||||||
json11::Json json_config = json11::Json(config);
|
|
||||||
st_cli.parse_config(json_config);
|
|
||||||
etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
|
|
||||||
if (etcd_report_interval <= 0)
|
|
||||||
etcd_report_interval = 30;
|
|
||||||
osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
|
|
||||||
if (!osd_num)
|
if (!osd_num)
|
||||||
throw std::runtime_error("osd_num is required in the configuration");
|
throw std::runtime_error("osd_num is required in the configuration");
|
||||||
c_cli.osd_num = osd_num;
|
msgr.osd_num = osd_num;
|
||||||
|
// Vital Blockstore parameters
|
||||||
|
bs_block_size = config["block_size"].uint64_value();
|
||||||
|
if (!bs_block_size)
|
||||||
|
bs_block_size = DEFAULT_BLOCK_SIZE;
|
||||||
|
bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
|
||||||
|
if (!bs_bitmap_granularity)
|
||||||
|
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||||
|
clean_entry_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
|
||||||
|
// Bind address
|
||||||
|
bind_address = config["bind_address"].string_value();
|
||||||
|
if (bind_address == "")
|
||||||
|
bind_address = "0.0.0.0";
|
||||||
|
bind_port = config["bind_port"].uint64_value();
|
||||||
|
if (bind_port <= 0 || bind_port > 65535)
|
||||||
|
bind_port = 0;
|
||||||
|
// OSD configuration
|
||||||
|
log_level = config["log_level"].uint64_value();
|
||||||
|
etcd_report_interval = config["etcd_report_interval"].uint64_value();
|
||||||
|
if (etcd_report_interval <= 0)
|
||||||
|
etcd_report_interval = 30;
|
||||||
|
readonly = config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes";
|
||||||
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
|
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
|
||||||
no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
|
no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
|
||||||
no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
|
no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
|
||||||
// Cluster configuration
|
allow_test_ops = config["allow_test_ops"] == "true" || config["allow_test_ops"] == "1" || config["allow_test_ops"] == "yes";
|
||||||
bind_address = config["bind_address"];
|
|
||||||
if (bind_address == "")
|
|
||||||
bind_address = "0.0.0.0";
|
|
||||||
bind_port = stoull_full(config["bind_port"]);
|
|
||||||
if (bind_port <= 0 || bind_port > 65535)
|
|
||||||
bind_port = 0;
|
|
||||||
if (config["immediate_commit"] == "all")
|
if (config["immediate_commit"] == "all")
|
||||||
immediate_commit = IMMEDIATE_ALL;
|
immediate_commit = IMMEDIATE_ALL;
|
||||||
else if (config["immediate_commit"] == "small")
|
else if (config["immediate_commit"] == "small")
|
||||||
immediate_commit = IMMEDIATE_SMALL;
|
immediate_commit = IMMEDIATE_SMALL;
|
||||||
if (config.find("autosync_interval") != config.end())
|
else
|
||||||
|
immediate_commit = IMMEDIATE_NONE;
|
||||||
|
if (!config["autosync_interval"].is_null())
|
||||||
{
|
{
|
||||||
autosync_interval = strtoull(config["autosync_interval"].c_str(), NULL, 10);
|
// Allow to set it to 0
|
||||||
|
autosync_interval = config["autosync_interval"].uint64_value();
|
||||||
if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
|
if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
|
||||||
autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
|
autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
|
||||||
}
|
}
|
||||||
if (config.find("client_queue_depth") != config.end())
|
if (!config["client_queue_depth"].is_null())
|
||||||
{
|
{
|
||||||
client_queue_depth = strtoull(config["client_queue_depth"].c_str(), NULL, 10);
|
client_queue_depth = config["client_queue_depth"].uint64_value();
|
||||||
if (client_queue_depth < 128)
|
if (client_queue_depth < 128)
|
||||||
client_queue_depth = 128;
|
client_queue_depth = 128;
|
||||||
}
|
}
|
||||||
recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
|
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
|
||||||
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
||||||
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||||
recovery_sync_batch = strtoull(config["recovery_sync_batch"].c_str(), NULL, 10);
|
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
|
print_stats_interval = config["print_stats_interval"].uint64_value();
|
||||||
readonly = true;
|
|
||||||
print_stats_interval = strtoull(config["print_stats_interval"].c_str(), NULL, 10);
|
|
||||||
if (!print_stats_interval)
|
if (!print_stats_interval)
|
||||||
print_stats_interval = 3;
|
print_stats_interval = 3;
|
||||||
slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10);
|
slow_log_interval = config["slow_log_interval"].uint64_value();
|
||||||
if (!slow_log_interval)
|
if (!slow_log_interval)
|
||||||
slow_log_interval = 10;
|
slow_log_interval = 10;
|
||||||
c_cli.parse_config(json_config);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::bind_socket()
|
void osd_t::bind_socket()
|
||||||
@@ -174,7 +191,7 @@ void osd_t::bind_socket()
|
|||||||
|
|
||||||
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
|
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
|
||||||
{
|
{
|
||||||
c_cli.accept_connections(listen_fd);
|
msgr.accept_connections(listen_fd);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -191,8 +208,8 @@ bool osd_t::shutdown()
|
|||||||
void osd_t::loop()
|
void osd_t::loop()
|
||||||
{
|
{
|
||||||
handle_peers();
|
handle_peers();
|
||||||
c_cli.read_requests();
|
msgr.read_requests();
|
||||||
c_cli.send_replies();
|
msgr.send_replies();
|
||||||
ringloop->submit();
|
ringloop->submit();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -276,7 +293,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
|||||||
|
|
||||||
void osd_t::reset_stats()
|
void osd_t::reset_stats()
|
||||||
{
|
{
|
||||||
c_cli.stats = { 0 };
|
msgr.stats = { 0 };
|
||||||
prev_stats = { 0 };
|
prev_stats = { 0 };
|
||||||
memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
|
memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
|
||||||
memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
|
memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
|
||||||
@@ -286,11 +303,11 @@ void osd_t::print_stats()
|
|||||||
{
|
{
|
||||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||||
{
|
{
|
||||||
if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
|
if (msgr.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
|
||||||
{
|
{
|
||||||
uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
|
uint64_t avg = (msgr.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(msgr.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
|
||||||
uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
|
uint64_t bw = (msgr.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
|
||||||
if (c_cli.stats.op_stat_bytes[i] != 0)
|
if (msgr.stats.op_stat_bytes[i] != 0)
|
||||||
{
|
{
|
||||||
printf(
|
printf(
|
||||||
"[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
|
"[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
|
||||||
@@ -302,19 +319,19 @@ void osd_t::print_stats()
|
|||||||
{
|
{
|
||||||
printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
|
printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
|
||||||
}
|
}
|
||||||
prev_stats.op_stat_count[i] = c_cli.stats.op_stat_count[i];
|
prev_stats.op_stat_count[i] = msgr.stats.op_stat_count[i];
|
||||||
prev_stats.op_stat_sum[i] = c_cli.stats.op_stat_sum[i];
|
prev_stats.op_stat_sum[i] = msgr.stats.op_stat_sum[i];
|
||||||
prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
|
prev_stats.op_stat_bytes[i] = msgr.stats.op_stat_bytes[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||||
{
|
{
|
||||||
if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
|
if (msgr.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
|
||||||
{
|
{
|
||||||
uint64_t avg = (c_cli.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(c_cli.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
|
uint64_t avg = (msgr.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(msgr.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
|
||||||
printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
|
printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
|
||||||
prev_stats.subop_stat_count[i] = c_cli.stats.subop_stat_count[i];
|
prev_stats.subop_stat_count[i] = msgr.stats.subop_stat_count[i];
|
||||||
prev_stats.subop_stat_sum[i] = c_cli.stats.subop_stat_sum[i];
|
prev_stats.subop_stat_sum[i] = msgr.stats.subop_stat_sum[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int i = 0; i < 2; i++)
|
for (int i = 0; i < 2; i++)
|
||||||
@@ -351,7 +368,7 @@ void osd_t::print_slow()
|
|||||||
char alloc[1024];
|
char alloc[1024];
|
||||||
timespec now;
|
timespec now;
|
||||||
clock_gettime(CLOCK_REALTIME, &now);
|
clock_gettime(CLOCK_REALTIME, &now);
|
||||||
for (auto & kv: c_cli.clients)
|
for (auto & kv: msgr.clients)
|
||||||
{
|
{
|
||||||
for (auto op: kv.second->received_ops)
|
for (auto op: kv.second->received_ops)
|
||||||
{
|
{
|
||||||
|
10
src/osd.h
10
src/osd.h
@@ -92,7 +92,7 @@ class osd_t
|
|||||||
{
|
{
|
||||||
// config
|
// config
|
||||||
|
|
||||||
blockstore_config_t config;
|
json11::Json::object config;
|
||||||
int etcd_report_interval = 30;
|
int etcd_report_interval = 30;
|
||||||
|
|
||||||
bool readonly = false;
|
bool readonly = false;
|
||||||
@@ -104,7 +104,7 @@ class osd_t
|
|||||||
int bind_port, listen_backlog;
|
int bind_port, listen_backlog;
|
||||||
// FIXME: Implement client queue depth limit
|
// FIXME: Implement client queue depth limit
|
||||||
int client_queue_depth = 128;
|
int client_queue_depth = 128;
|
||||||
bool allow_test_ops = true;
|
bool allow_test_ops = false;
|
||||||
int print_stats_interval = 3;
|
int print_stats_interval = 3;
|
||||||
int slow_log_interval = 10;
|
int slow_log_interval = 10;
|
||||||
int immediate_commit = IMMEDIATE_NONE;
|
int immediate_commit = IMMEDIATE_NONE;
|
||||||
@@ -116,7 +116,7 @@ class osd_t
|
|||||||
// cluster state
|
// cluster state
|
||||||
|
|
||||||
etcd_state_client_t st_cli;
|
etcd_state_client_t st_cli;
|
||||||
osd_messenger_t c_cli;
|
osd_messenger_t msgr;
|
||||||
int etcd_failed_attempts = 0;
|
int etcd_failed_attempts = 0;
|
||||||
std::string etcd_lease_id;
|
std::string etcd_lease_id;
|
||||||
json11::Json self_state;
|
json11::Json self_state;
|
||||||
@@ -167,7 +167,7 @@ class osd_t
|
|||||||
uint64_t recovery_stat_bytes[2][2] = { 0 };
|
uint64_t recovery_stat_bytes[2][2] = { 0 };
|
||||||
|
|
||||||
// cluster connection
|
// cluster connection
|
||||||
void parse_config(blockstore_config_t & config);
|
void parse_config(const json11::Json & config);
|
||||||
void init_cluster();
|
void init_cluster();
|
||||||
void on_change_osd_state_hook(osd_num_t peer_osd);
|
void on_change_osd_state_hook(osd_num_t peer_osd);
|
||||||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||||
@@ -268,7 +268,7 @@ class osd_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
osd_t(blockstore_config_t & config, ring_loop_t *ringloop);
|
osd_t(const json11::Json & config, ring_loop_t *ringloop);
|
||||||
~osd_t();
|
~osd_t();
|
||||||
void force_stop(int exitcode);
|
void force_stop(int exitcode);
|
||||||
bool shutdown();
|
bool shutdown();
|
||||||
|
@@ -21,7 +21,7 @@ void osd_t::init_cluster()
|
|||||||
{
|
{
|
||||||
// Test version of clustering code with 1 pool, 1 PG and 2 peers
|
// Test version of clustering code with 1 pool, 1 PG and 2 peers
|
||||||
// Example: peers = 2:127.0.0.1:11204,3:127.0.0.1:11205
|
// Example: peers = 2:127.0.0.1:11204,3:127.0.0.1:11205
|
||||||
std::string peerstr = config["peers"];
|
std::string peerstr = config["peers"].string_value();
|
||||||
while (peerstr.size())
|
while (peerstr.size())
|
||||||
{
|
{
|
||||||
int pos = peerstr.find(',');
|
int pos = peerstr.find(',');
|
||||||
@@ -104,7 +104,7 @@ void osd_t::parse_test_peer(std::string peer)
|
|||||||
{ "addresses", json11::Json::array { addr } },
|
{ "addresses", json11::Json::array { addr } },
|
||||||
{ "port", port },
|
{ "port", port },
|
||||||
};
|
};
|
||||||
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||||
}
|
}
|
||||||
|
|
||||||
json11::Json osd_t::get_osd_state()
|
json11::Json osd_t::get_osd_state()
|
||||||
@@ -146,16 +146,16 @@ json11::Json osd_t::get_statistics()
|
|||||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||||
{
|
{
|
||||||
op_stats[osd_op_names[i]] = json11::Json::object {
|
op_stats[osd_op_names[i]] = json11::Json::object {
|
||||||
{ "count", c_cli.stats.op_stat_count[i] },
|
{ "count", msgr.stats.op_stat_count[i] },
|
||||||
{ "usec", c_cli.stats.op_stat_sum[i] },
|
{ "usec", msgr.stats.op_stat_sum[i] },
|
||||||
{ "bytes", c_cli.stats.op_stat_bytes[i] },
|
{ "bytes", msgr.stats.op_stat_bytes[i] },
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||||
{
|
{
|
||||||
subop_stats[osd_op_names[i]] = json11::Json::object {
|
subop_stats[osd_op_names[i]] = json11::Json::object {
|
||||||
{ "count", c_cli.stats.subop_stat_count[i] },
|
{ "count", msgr.stats.subop_stat_count[i] },
|
||||||
{ "usec", c_cli.stats.subop_stat_sum[i] },
|
{ "usec", msgr.stats.subop_stat_sum[i] },
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
st["op_stats"] = op_stats;
|
st["op_stats"] = op_stats;
|
||||||
@@ -298,9 +298,9 @@ void osd_t::report_statistics()
|
|||||||
|
|
||||||
void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
||||||
{
|
{
|
||||||
if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end())
|
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
|
||||||
{
|
{
|
||||||
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -340,21 +340,10 @@ void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
|
|||||||
|
|
||||||
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
||||||
{
|
{
|
||||||
blockstore_config_t osd_config = this->config;
|
json11::Json::object osd_config = this->config;
|
||||||
for (auto & cfg_var: global_config)
|
for (auto & kv: global_config)
|
||||||
{
|
if (osd_config.find(kv.first) == osd_config.end())
|
||||||
if (this->config.find(cfg_var.first) == this->config.end())
|
osd_config[kv.first] = kv.second;
|
||||||
{
|
|
||||||
if (cfg_var.second.is_string())
|
|
||||||
{
|
|
||||||
osd_config[cfg_var.first] = cfg_var.second.string_value();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
osd_config[cfg_var.first] = cfg_var.second.dump();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
parse_config(osd_config);
|
parse_config(osd_config);
|
||||||
bind_socket();
|
bind_socket();
|
||||||
acquire_lease();
|
acquire_lease();
|
||||||
@@ -380,7 +369,7 @@ void osd_t::acquire_lease()
|
|||||||
etcd_lease_id = data["ID"].string_value();
|
etcd_lease_id = data["ID"].string_value();
|
||||||
create_osd_state();
|
create_osd_state();
|
||||||
});
|
});
|
||||||
printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].c_str(), etcd_report_interval);
|
printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].string_value().c_str(), etcd_report_interval);
|
||||||
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
|
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
|
||||||
{
|
{
|
||||||
renew_lease();
|
renew_lease();
|
||||||
@@ -695,9 +684,9 @@ void osd_t::apply_pg_config()
|
|||||||
// Add peers
|
// Add peers
|
||||||
for (auto pg_osd: all_peers)
|
for (auto pg_osd: all_peers)
|
||||||
{
|
{
|
||||||
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
|
if (pg_osd != this->osd_num && msgr.osd_peer_fds.find(pg_osd) == msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
start_pg_peering(pg);
|
start_pg_peering(pg);
|
||||||
|
@@ -82,10 +82,10 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
|
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
|
||||||
auto fd_it = c_cli.osd_peer_fds.find(peer_osd);
|
auto fd_it = msgr.osd_peer_fds.find(peer_osd);
|
||||||
if (fd_it != c_cli.osd_peer_fds.end())
|
if (fd_it != msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
c_cli.stop_client(fd_it->second);
|
msgr.stop_client(fd_it->second);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -188,7 +188,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Peer
|
// Peer
|
||||||
int peer_fd = c_cli.osd_peer_fds[peer_osd];
|
int peer_fd = msgr.osd_peer_fds[peer_osd];
|
||||||
op->op_type = OSD_OP_OUT;
|
op->op_type = OSD_OP_OUT;
|
||||||
op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
|
op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
|
||||||
op->peer_fd = peer_fd;
|
op->peer_fd = peer_fd;
|
||||||
@@ -196,7 +196,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||||||
.sec_stab = {
|
.sec_stab = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE),
|
.opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE),
|
||||||
},
|
},
|
||||||
.len = count * sizeof(obj_ver_id),
|
.len = count * sizeof(obj_ver_id),
|
||||||
@@ -207,7 +207,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||||||
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
|
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(op);
|
msgr.outbox_push(op);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -29,13 +29,13 @@ int main(int narg, char *args[])
|
|||||||
perror("BUG: too small packet size");
|
perror("BUG: too small packet size");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
blockstore_config_t config;
|
json11::Json::object config;
|
||||||
for (int i = 1; i < narg; i++)
|
for (int i = 1; i < narg; i++)
|
||||||
{
|
{
|
||||||
if (args[i][0] == '-' && args[i][1] == '-' && i < narg-1)
|
if (args[i][0] == '-' && args[i][1] == '-' && i < narg-1)
|
||||||
{
|
{
|
||||||
char *opt = args[i]+2;
|
char *opt = args[i]+2;
|
||||||
config[opt] = args[++i];
|
config[std::string(opt)] = std::string(args[++i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
signal(SIGINT, handle_sigint);
|
signal(SIGINT, handle_sigint);
|
||||||
|
@@ -148,6 +148,8 @@ struct __attribute__((__packed__)) osd_reply_sec_read_bmp_t
|
|||||||
struct __attribute__((__packed__)) osd_op_show_config_t
|
struct __attribute__((__packed__)) osd_op_show_config_t
|
||||||
{
|
{
|
||||||
osd_op_header_t header;
|
osd_op_header_t header;
|
||||||
|
// JSON request length
|
||||||
|
uint64_t json_len;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct __attribute__((__packed__)) osd_reply_show_config_t
|
struct __attribute__((__packed__)) osd_reply_show_config_t
|
||||||
|
@@ -156,7 +156,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
if (immediate_commit != IMMEDIATE_ALL)
|
if (immediate_commit != IMMEDIATE_ALL)
|
||||||
{
|
{
|
||||||
std::vector<int> to_stop;
|
std::vector<int> to_stop;
|
||||||
for (auto & cp: c_cli.clients)
|
for (auto & cp: msgr.clients)
|
||||||
{
|
{
|
||||||
if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
|
if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
|
||||||
{
|
{
|
||||||
@@ -165,7 +165,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
}
|
}
|
||||||
for (auto peer_fd: to_stop)
|
for (auto peer_fd: to_stop)
|
||||||
{
|
{
|
||||||
c_cli.stop_client(peer_fd);
|
msgr.stop_client(peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Calculate current write OSD set
|
// Calculate current write OSD set
|
||||||
@@ -175,7 +175,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
for (int role = 0; role < pg.target_set.size(); role++)
|
for (int role = 0; role < pg.target_set.size(); role++)
|
||||||
{
|
{
|
||||||
pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
|
pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
|
||||||
c_cli.osd_peer_fds.find(pg.target_set[role]) != c_cli.osd_peer_fds.end() ? pg.target_set[role] : 0;
|
msgr.osd_peer_fds.find(pg.target_set[role]) != msgr.osd_peer_fds.end() ? pg.target_set[role] : 0;
|
||||||
if (pg.cur_set[role] != 0)
|
if (pg.cur_set[role] != 0)
|
||||||
{
|
{
|
||||||
pg.pg_cursize++;
|
pg.pg_cursize++;
|
||||||
@@ -199,7 +199,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
{
|
{
|
||||||
found = false;
|
found = false;
|
||||||
if (history_osd == this->osd_num ||
|
if (history_osd == this->osd_num ||
|
||||||
c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
|
msgr.osd_peer_fds.find(history_osd) != msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
found = true;
|
found = true;
|
||||||
break;
|
break;
|
||||||
@@ -223,13 +223,13 @@ void osd_t::start_pg_peering(pg_t & pg)
|
|||||||
std::set<osd_num_t> cur_peers;
|
std::set<osd_num_t> cur_peers;
|
||||||
for (auto pg_osd: pg.all_peers)
|
for (auto pg_osd: pg.all_peers)
|
||||||
{
|
{
|
||||||
if (pg_osd == this->osd_num || c_cli.osd_peer_fds.find(pg_osd) != c_cli.osd_peer_fds.end())
|
if (pg_osd == this->osd_num || msgr.osd_peer_fds.find(pg_osd) != msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
cur_peers.insert(pg_osd);
|
cur_peers.insert(pg_osd);
|
||||||
}
|
}
|
||||||
else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
|
else if (msgr.wanted_peers.find(pg_osd) == msgr.wanted_peers.end())
|
||||||
{
|
{
|
||||||
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
|
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
|
||||||
@@ -325,7 +325,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Peer
|
// Peer
|
||||||
auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]);
|
auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
|
||||||
osd_op_t *op = new osd_op_t();
|
osd_op_t *op = new osd_op_t();
|
||||||
op->op_type = OSD_OP_OUT;
|
op->op_type = OSD_OP_OUT;
|
||||||
op->peer_fd = cl->peer_fd;
|
op->peer_fd = cl->peer_fd;
|
||||||
@@ -333,7 +333,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
|||||||
.sec_sync = {
|
.sec_sync = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_SEC_SYNC,
|
.opcode = OSD_OP_SEC_SYNC,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@@ -347,14 +347,14 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
|||||||
int fail_fd = op->peer_fd;
|
int fail_fd = op->peer_fd;
|
||||||
ps->list_ops.erase(role_osd);
|
ps->list_ops.erase(role_osd);
|
||||||
delete op;
|
delete op;
|
||||||
c_cli.stop_client(fail_fd);
|
msgr.stop_client(fail_fd);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
delete op;
|
delete op;
|
||||||
ps->list_ops.erase(role_osd);
|
ps->list_ops.erase(role_osd);
|
||||||
submit_list_subop(role_osd, ps);
|
submit_list_subop(role_osd, ps);
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(op);
|
msgr.outbox_push(op);
|
||||||
ps->list_ops[role_osd] = op;
|
ps->list_ops[role_osd] = op;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -404,12 +404,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||||||
// Peer
|
// Peer
|
||||||
osd_op_t *op = new osd_op_t();
|
osd_op_t *op = new osd_op_t();
|
||||||
op->op_type = OSD_OP_OUT;
|
op->op_type = OSD_OP_OUT;
|
||||||
op->peer_fd = c_cli.osd_peer_fds[role_osd];
|
op->peer_fd = msgr.osd_peer_fds[role_osd];
|
||||||
op->req = (osd_any_op_t){
|
op->req = (osd_any_op_t){
|
||||||
.sec_list = {
|
.sec_list = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_SEC_LIST,
|
.opcode = OSD_OP_SEC_LIST,
|
||||||
},
|
},
|
||||||
.list_pg = ps->pg_num,
|
.list_pg = ps->pg_num,
|
||||||
@@ -427,7 +427,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||||||
int fail_fd = op->peer_fd;
|
int fail_fd = op->peer_fd;
|
||||||
ps->list_ops.erase(role_osd);
|
ps->list_ops.erase(role_osd);
|
||||||
delete op;
|
delete op;
|
||||||
c_cli.stop_client(fail_fd);
|
msgr.stop_client(fail_fd);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
printf(
|
printf(
|
||||||
@@ -444,7 +444,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
|||||||
ps->list_ops.erase(role_osd);
|
ps->list_ops.erase(role_osd);
|
||||||
delete op;
|
delete op;
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(op);
|
msgr.outbox_push(op);
|
||||||
ps->list_ops[role_osd] = op;
|
ps->list_ops[role_osd] = op;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -236,14 +236,14 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||||||
// Send to a remote OSD
|
// Send to a remote OSD
|
||||||
osd_op_t *subop = op_data->subops+subop_idx;
|
osd_op_t *subop = op_data->subops+subop_idx;
|
||||||
subop->op_type = OSD_OP_OUT;
|
subop->op_type = OSD_OP_OUT;
|
||||||
subop->peer_fd = c_cli.osd_peer_fds.at(subop_osd_num);
|
subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
|
||||||
// FIXME: Use the pre-allocated buffer
|
// FIXME: Use the pre-allocated buffer
|
||||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||||
subop->req = (osd_any_op_t){
|
subop->req = (osd_any_op_t){
|
||||||
.sec_read_bmp = {
|
.sec_read_bmp = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_SEC_READ_BMP,
|
.opcode = OSD_OP_SEC_READ_BMP,
|
||||||
},
|
},
|
||||||
.len = sizeof(obj_ver_id)*(i+1-prev),
|
.len = sizeof(obj_ver_id)*(i+1-prev),
|
||||||
@@ -273,7 +273,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
|||||||
}
|
}
|
||||||
handle_primary_subop(subop, cur_op);
|
handle_primary_subop(subop, cur_op);
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(subop);
|
msgr.outbox_push(subop);
|
||||||
subop_idx++;
|
subop_idx++;
|
||||||
}
|
}
|
||||||
prev = i+1;
|
prev = i+1;
|
||||||
|
@@ -87,14 +87,14 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// FIXME add separate magic number for primary ops
|
// FIXME add separate magic number for primary ops
|
||||||
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
|
auto cl_it = msgr.clients.find(cur_op->peer_fd);
|
||||||
if (cl_it != c_cli.clients.end())
|
if (cl_it != msgr.clients.end())
|
||||||
{
|
{
|
||||||
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
||||||
cur_op->reply.hdr.id = cur_op->req.hdr.id;
|
cur_op->reply.hdr.id = cur_op->req.hdr.id;
|
||||||
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
|
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
|
||||||
cur_op->reply.hdr.retval = retval;
|
cur_op->reply.hdr.retval = retval;
|
||||||
c_cli.outbox_push(cur_op);
|
msgr.outbox_push(cur_op);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -184,13 +184,13 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
subop->op_type = OSD_OP_OUT;
|
subop->op_type = OSD_OP_OUT;
|
||||||
subop->peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
|
subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
|
||||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||||
subop->bitmap_len = clean_entry_bitmap_size;
|
subop->bitmap_len = clean_entry_bitmap_size;
|
||||||
subop->req.sec_rw = {
|
subop->req.sec_rw = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = (uint64_t)(wr ? (rep ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
|
.opcode = (uint64_t)(wr ? (rep ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
|
||||||
},
|
},
|
||||||
.oid = {
|
.oid = {
|
||||||
@@ -227,7 +227,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
|||||||
{
|
{
|
||||||
handle_primary_subop(subop, cur_op);
|
handle_primary_subop(subop, cur_op);
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(subop);
|
msgr.outbox_push(subop);
|
||||||
}
|
}
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
@@ -282,20 +282,20 @@ void osd_t::add_bs_subop_stats(osd_op_t *subop)
|
|||||||
uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
|
uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
|
||||||
timespec tv_end;
|
timespec tv_end;
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||||
c_cli.stats.op_stat_count[opcode]++;
|
msgr.stats.op_stat_count[opcode]++;
|
||||||
if (!c_cli.stats.op_stat_count[opcode])
|
if (!msgr.stats.op_stat_count[opcode])
|
||||||
{
|
{
|
||||||
c_cli.stats.op_stat_count[opcode] = 1;
|
msgr.stats.op_stat_count[opcode] = 1;
|
||||||
c_cli.stats.op_stat_sum[opcode] = 0;
|
msgr.stats.op_stat_sum[opcode] = 0;
|
||||||
c_cli.stats.op_stat_bytes[opcode] = 0;
|
msgr.stats.op_stat_bytes[opcode] = 0;
|
||||||
}
|
}
|
||||||
c_cli.stats.op_stat_sum[opcode] += (
|
msgr.stats.op_stat_sum[opcode] += (
|
||||||
(tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
|
(tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
|
||||||
(tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
|
(tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
|
||||||
);
|
);
|
||||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE)
|
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE)
|
||||||
{
|
{
|
||||||
c_cli.stats.op_stat_bytes[opcode] += subop->bs_op->len;
|
msgr.stats.op_stat_bytes[opcode] += subop->bs_op->len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -322,7 +322,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||||||
if (subop->peer_fd >= 0)
|
if (subop->peer_fd >= 0)
|
||||||
{
|
{
|
||||||
// Drop connection on any error
|
// Drop connection on any error
|
||||||
c_cli.stop_client(subop->peer_fd);
|
msgr.stop_client(subop->peer_fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -332,8 +332,8 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
uint64_t version = subop->reply.sec_rw.version;
|
uint64_t version = subop->reply.sec_rw.version;
|
||||||
#ifdef OSD_DEBUG
|
#ifdef OSD_DEBUG
|
||||||
uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
|
uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
|
||||||
? c_cli.clients[subop->peer_fd]->osd_num : osd_num;
|
? msgr.clients[subop->peer_fd]->osd_num : osd_num;
|
||||||
printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
|
printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
|
||||||
#endif
|
#endif
|
||||||
if (op_data->fact_ver != UINT64_MAX)
|
if (op_data->fact_ver != UINT64_MAX)
|
||||||
@@ -465,11 +465,11 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
subops[i].op_type = OSD_OP_OUT;
|
subops[i].op_type = OSD_OP_OUT;
|
||||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
|
subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
|
||||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_SEC_DELETE,
|
.opcode = OSD_OP_SEC_DELETE,
|
||||||
},
|
},
|
||||||
.oid = chunk.oid,
|
.oid = chunk.oid,
|
||||||
@@ -479,7 +479,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
|||||||
{
|
{
|
||||||
handle_primary_subop(subop, cur_op);
|
handle_primary_subop(subop, cur_op);
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(&subops[i]);
|
msgr.outbox_push(&subops[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -509,14 +509,14 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
|||||||
});
|
});
|
||||||
bs->enqueue_op(subops[i].bs_op);
|
bs->enqueue_op(subops[i].bs_op);
|
||||||
}
|
}
|
||||||
else if ((peer_it = c_cli.osd_peer_fds.find(sync_osd)) != c_cli.osd_peer_fds.end())
|
else if ((peer_it = msgr.osd_peer_fds.find(sync_osd)) != msgr.osd_peer_fds.end())
|
||||||
{
|
{
|
||||||
subops[i].op_type = OSD_OP_OUT;
|
subops[i].op_type = OSD_OP_OUT;
|
||||||
subops[i].peer_fd = peer_it->second;
|
subops[i].peer_fd = peer_it->second;
|
||||||
subops[i].req = (osd_any_op_t){ .sec_sync = {
|
subops[i].req = (osd_any_op_t){ .sec_sync = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_SEC_SYNC,
|
.opcode = OSD_OP_SEC_SYNC,
|
||||||
},
|
},
|
||||||
} };
|
} };
|
||||||
@@ -524,7 +524,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
handle_primary_subop(subop, cur_op);
|
handle_primary_subop(subop, cur_op);
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(&subops[i]);
|
msgr.outbox_push(&subops[i]);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -569,11 +569,11 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
subops[i].op_type = OSD_OP_OUT;
|
subops[i].op_type = OSD_OP_OUT;
|
||||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
|
subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
|
||||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = c_cli.next_subop_id++,
|
.id = msgr.next_subop_id++,
|
||||||
.opcode = OSD_OP_SEC_STABILIZE,
|
.opcode = OSD_OP_SEC_STABILIZE,
|
||||||
},
|
},
|
||||||
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
|
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
|
||||||
@@ -583,7 +583,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||||||
{
|
{
|
||||||
handle_primary_subop(subop, cur_op);
|
handle_primary_subop(subop, cur_op);
|
||||||
};
|
};
|
||||||
c_cli.outbox_push(&subops[i]);
|
msgr.outbox_push(&subops[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -247,8 +247,8 @@ resume_8:
|
|||||||
finish:
|
finish:
|
||||||
if (cur_op->peer_fd)
|
if (cur_op->peer_fd)
|
||||||
{
|
{
|
||||||
auto it = c_cli.clients.find(cur_op->peer_fd);
|
auto it = msgr.clients.find(cur_op->peer_fd);
|
||||||
if (it != c_cli.clients.end())
|
if (it != msgr.clients.end())
|
||||||
it->second->dirty_pgs.clear();
|
it->second->dirty_pgs.clear();
|
||||||
}
|
}
|
||||||
finish_op(cur_op, 0);
|
finish_op(cur_op, 0);
|
||||||
|
@@ -370,8 +370,8 @@ lazy:
|
|||||||
}
|
}
|
||||||
// Remember PG as dirty to drop the connection when PG goes offline
|
// Remember PG as dirty to drop the connection when PG goes offline
|
||||||
// (this is required because of the "lazy sync")
|
// (this is required because of the "lazy sync")
|
||||||
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
|
auto cl_it = msgr.clients.find(cur_op->peer_fd);
|
||||||
if (cl_it != c_cli.clients.end())
|
if (cl_it != msgr.clients.end())
|
||||||
{
|
{
|
||||||
cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||||
}
|
}
|
||||||
|
@@ -144,10 +144,44 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
|
|||||||
|
|
||||||
void osd_t::exec_show_config(osd_op_t *cur_op)
|
void osd_t::exec_show_config(osd_op_t *cur_op)
|
||||||
{
|
{
|
||||||
// FIXME: Send the real config, not its source
|
std::string json_err;
|
||||||
auto cfg_copy = config;
|
json11::Json req_json = cur_op->req.show_conf.json_len > 0
|
||||||
cfg_copy["protocol_version"] = std::to_string(OSD_PROTOCOL_VERSION);
|
? json11::Json::parse(std::string((char *)cur_op->buf), json_err)
|
||||||
std::string cfg_str = json11::Json(cfg_copy).dump();
|
: json11::Json();
|
||||||
|
// Expose sensitive configuration values so peers can check them
|
||||||
|
json11::Json::object wire_config = json11::Json::object {
|
||||||
|
{ "osd_num", osd_num },
|
||||||
|
{ "protocol_version", OSD_PROTOCOL_VERSION },
|
||||||
|
{ "block_size", (uint64_t)bs_block_size },
|
||||||
|
{ "bitmap_granularity", (uint64_t)bs_bitmap_granularity },
|
||||||
|
{ "primary_enabled", run_primary },
|
||||||
|
{ "blockstore_enabled", bs ? true : false },
|
||||||
|
{ "readonly", readonly },
|
||||||
|
{ "immediate_commit", (immediate_commit == IMMEDIATE_ALL ? "all" :
|
||||||
|
(immediate_commit == IMMEDIATE_SMALL ? "small" : "none")) },
|
||||||
|
{ "lease_timeout", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 },
|
||||||
|
};
|
||||||
|
#ifdef WITH_RDMA
|
||||||
|
if (msgr.is_rdma_enabled())
|
||||||
|
{
|
||||||
|
// Indicate that RDMA is enabled
|
||||||
|
wire_config["rdma_enabled"] = true;
|
||||||
|
if (req_json["connect_rdma"].is_string())
|
||||||
|
{
|
||||||
|
// Peer is trying to connect using RDMA, try to satisfy him
|
||||||
|
bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(), req_json["rdma_max_msg"].uint64_value());
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
auto rc = msgr.clients.at(cur_op->peer_fd)->rdma_conn;
|
||||||
|
wire_config["rdma_address"] = rc->addr.to_string();
|
||||||
|
wire_config["rdma_max_msg"] = rc->max_msg;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (cur_op->buf)
|
||||||
|
free(cur_op->buf);
|
||||||
|
std::string cfg_str = json11::Json(wire_config).dump();
|
||||||
cur_op->buf = malloc_or_die(cfg_str.size()+1);
|
cur_op->buf = malloc_or_die(cfg_str.size()+1);
|
||||||
memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
|
memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
|
||||||
cur_op->iov.push_back(cur_op->buf, cfg_str.size()+1);
|
cur_op->iov.push_back(cur_op->buf, cfg_str.size()+1);
|
||||||
|
@@ -40,6 +40,7 @@ typedef struct VitastorClient
|
|||||||
{
|
{
|
||||||
void *proxy;
|
void *proxy;
|
||||||
void *watch;
|
void *watch;
|
||||||
|
char *config_path;
|
||||||
char *etcd_host;
|
char *etcd_host;
|
||||||
char *etcd_prefix;
|
char *etcd_prefix;
|
||||||
char *image;
|
char *image;
|
||||||
@@ -47,6 +48,10 @@ typedef struct VitastorClient
|
|||||||
uint64_t pool;
|
uint64_t pool;
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
long readonly;
|
long readonly;
|
||||||
|
char *rdma_device;
|
||||||
|
int rdma_port_num;
|
||||||
|
int rdma_gid_index;
|
||||||
|
int rdma_mtu;
|
||||||
QemuMutex mutex;
|
QemuMutex mutex;
|
||||||
} VitastorClient;
|
} VitastorClient;
|
||||||
|
|
||||||
@@ -95,7 +100,8 @@ static void qemu_rbd_unescape(char *src)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// vitastor[:key=value]*
|
// vitastor[:key=value]*
|
||||||
// vitastor:etcd_host=127.0.0.1:inode=1:pool=1
|
// vitastor[:etcd_host=127.0.0.1]:inode=1:pool=1[:rdma_gid_index=3]
|
||||||
|
// vitastor:config_path=/etc/vitastor/vitastor.conf:image=testimg
|
||||||
static void vitastor_parse_filename(const char *filename, QDict *options, Error **errp)
|
static void vitastor_parse_filename(const char *filename, QDict *options, Error **errp)
|
||||||
{
|
{
|
||||||
const char *start;
|
const char *start;
|
||||||
@@ -123,7 +129,12 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
|
|||||||
qemu_rbd_unescape(name);
|
qemu_rbd_unescape(name);
|
||||||
value = qemu_rbd_next_tok(p, ':', &p);
|
value = qemu_rbd_next_tok(p, ':', &p);
|
||||||
qemu_rbd_unescape(value);
|
qemu_rbd_unescape(value);
|
||||||
if (!strcmp(name, "inode") || !strcmp(name, "pool") || !strcmp(name, "size"))
|
if (!strcmp(name, "inode") ||
|
||||||
|
!strcmp(name, "pool") ||
|
||||||
|
!strcmp(name, "size") ||
|
||||||
|
!strcmp(name, "rdma_port_num") ||
|
||||||
|
!strcmp(name, "rdma_gid_index") ||
|
||||||
|
!strcmp(name, "rdma_mtu"))
|
||||||
{
|
{
|
||||||
unsigned long long num_val;
|
unsigned long long num_val;
|
||||||
if (parse_uint_full(value, &num_val, 0))
|
if (parse_uint_full(value, &num_val, 0))
|
||||||
@@ -157,11 +168,6 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!qdict_get_str(options, "etcd_host"))
|
|
||||||
{
|
|
||||||
error_setg(errp, "etcd_host is missing");
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
out:
|
out:
|
||||||
g_free(buf);
|
g_free(buf);
|
||||||
@@ -189,9 +195,17 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
VitastorClient *client = bs->opaque;
|
VitastorClient *client = bs->opaque;
|
||||||
int64_t ret = 0;
|
int64_t ret = 0;
|
||||||
qemu_mutex_init(&client->mutex);
|
qemu_mutex_init(&client->mutex);
|
||||||
|
client->config_path = g_strdup(qdict_get_try_str(options, "config_path"));
|
||||||
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd_host"));
|
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd_host"));
|
||||||
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd_prefix"));
|
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd_prefix"));
|
||||||
client->proxy = vitastor_proxy_create(bdrv_get_aio_context(bs), client->etcd_host, client->etcd_prefix);
|
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma_device"));
|
||||||
|
client->rdma_port_num = qdict_get_try_int(options, "rdma_port_num", 0);
|
||||||
|
client->rdma_gid_index = qdict_get_try_int(options, "rdma_gid_index", 0);
|
||||||
|
client->rdma_mtu = qdict_get_try_int(options, "rdma_mtu", 0);
|
||||||
|
client->proxy = vitastor_proxy_create(
|
||||||
|
bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
|
||||||
|
client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu
|
||||||
|
);
|
||||||
client->image = g_strdup(qdict_get_try_str(options, "image"));
|
client->image = g_strdup(qdict_get_try_str(options, "image"));
|
||||||
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
|
||||||
if (client->image)
|
if (client->image)
|
||||||
@@ -241,6 +255,11 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
|||||||
}
|
}
|
||||||
bs->total_sectors = client->size / BDRV_SECTOR_SIZE;
|
bs->total_sectors = client->size / BDRV_SECTOR_SIZE;
|
||||||
//client->aio_context = bdrv_get_aio_context(bs);
|
//client->aio_context = bdrv_get_aio_context(bs);
|
||||||
|
qdict_del(options, "rdma_mtu");
|
||||||
|
qdict_del(options, "rdma_gid_index");
|
||||||
|
qdict_del(options, "rdma_port_num");
|
||||||
|
qdict_del(options, "rdma_device");
|
||||||
|
qdict_del(options, "config_path");
|
||||||
qdict_del(options, "etcd_host");
|
qdict_del(options, "etcd_host");
|
||||||
qdict_del(options, "etcd_prefix");
|
qdict_del(options, "etcd_prefix");
|
||||||
qdict_del(options, "image");
|
qdict_del(options, "image");
|
||||||
@@ -255,7 +274,10 @@ static void vitastor_close(BlockDriverState *bs)
|
|||||||
VitastorClient *client = bs->opaque;
|
VitastorClient *client = bs->opaque;
|
||||||
vitastor_proxy_destroy(client->proxy);
|
vitastor_proxy_destroy(client->proxy);
|
||||||
qemu_mutex_destroy(&client->mutex);
|
qemu_mutex_destroy(&client->mutex);
|
||||||
g_free(client->etcd_host);
|
if (client->config_path)
|
||||||
|
g_free(client->config_path);
|
||||||
|
if (client->etcd_host)
|
||||||
|
g_free(client->etcd_host);
|
||||||
if (client->etcd_prefix)
|
if (client->etcd_prefix)
|
||||||
g_free(client->etcd_prefix);
|
g_free(client->etcd_prefix);
|
||||||
if (client->image)
|
if (client->image)
|
||||||
@@ -478,6 +500,7 @@ static QEMUOptionParameter vitastor_create_opts[] = {
|
|||||||
static const char *vitastor_strong_runtime_opts[] = {
|
static const char *vitastor_strong_runtime_opts[] = {
|
||||||
"inode",
|
"inode",
|
||||||
"pool",
|
"pool",
|
||||||
|
"config_path",
|
||||||
"etcd_host",
|
"etcd_host",
|
||||||
"etcd_prefix",
|
"etcd_prefix",
|
||||||
|
|
||||||
|
@@ -34,15 +34,28 @@ public:
|
|||||||
cluster_client_t *cli;
|
cluster_client_t *cli;
|
||||||
AioContext *ctx;
|
AioContext *ctx;
|
||||||
|
|
||||||
QemuProxy(AioContext *ctx, const char *etcd_host, const char *etcd_prefix)
|
QemuProxy(AioContext *ctx, const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||||
|
const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu)
|
||||||
{
|
{
|
||||||
this->ctx = ctx;
|
this->ctx = ctx;
|
||||||
json11::Json cfg = json11::Json::object {
|
json11::Json::object cfg;
|
||||||
{ "etcd_address", std::string(etcd_host) },
|
if (config_path)
|
||||||
{ "etcd_prefix", std::string(etcd_prefix ? etcd_prefix : "/vitastor") },
|
cfg["config_path"] = std::string(config_path);
|
||||||
};
|
if (etcd_host)
|
||||||
|
cfg["etcd_address"] = std::string(etcd_host);
|
||||||
|
if (etcd_prefix)
|
||||||
|
cfg["etcd_prefix"] = std::string(etcd_prefix);
|
||||||
|
if (rdma_device)
|
||||||
|
cfg["rdma_device"] = std::string(rdma_device);
|
||||||
|
if (rdma_port_num)
|
||||||
|
cfg["rdma_port_num"] = rdma_port_num;
|
||||||
|
if (rdma_gid_index)
|
||||||
|
cfg["rdma_gid_index"] = rdma_gid_index;
|
||||||
|
if (rdma_mtu)
|
||||||
|
cfg["rdma_mtu"] = rdma_mtu;
|
||||||
|
json11::Json cfg_json(cfg);
|
||||||
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> callback) { set_fd_handler(fd, wr, callback); });
|
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> callback) { set_fd_handler(fd, wr, callback); });
|
||||||
cli = new cluster_client_t(NULL, tfd, cfg);
|
cli = new cluster_client_t(NULL, tfd, cfg_json);
|
||||||
}
|
}
|
||||||
|
|
||||||
~QemuProxy()
|
~QemuProxy()
|
||||||
@@ -80,9 +93,10 @@ public:
|
|||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
void* vitastor_proxy_create(AioContext *ctx, const char *etcd_host, const char *etcd_prefix)
|
void* vitastor_proxy_create(AioContext *ctx, const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||||
|
const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu)
|
||||||
{
|
{
|
||||||
QemuProxy *p = new QemuProxy(ctx, etcd_host, etcd_prefix);
|
QemuProxy *p = new QemuProxy(ctx, config_path, etcd_host, etcd_prefix, rdma_device, rdma_port_num, rdma_gid_index, rdma_mtu);
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -16,7 +16,8 @@ extern "C" {
|
|||||||
|
|
||||||
// Our exports
|
// Our exports
|
||||||
typedef void VitastorIOHandler(long retval, void *opaque);
|
typedef void VitastorIOHandler(long retval, void *opaque);
|
||||||
void* vitastor_proxy_create(AioContext *ctx, const char *etcd_host, const char *etcd_prefix);
|
void* vitastor_proxy_create(AioContext *ctx, const char *config_path, const char *etcd_host, const char *etcd_prefix,
|
||||||
|
const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu);
|
||||||
void vitastor_proxy_destroy(void *client);
|
void vitastor_proxy_destroy(void *client);
|
||||||
void vitastor_proxy_rw(int write, void *client, uint64_t inode, uint64_t offset, uint64_t len,
|
void vitastor_proxy_rw(int write, void *client, uint64_t inode, uint64_t offset, uint64_t len,
|
||||||
struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
|
struct iovec *iov, int iovcnt, VitastorIOHandler cb, void *opaque);
|
||||||
|
@@ -87,7 +87,7 @@ public:
|
|||||||
"Vitastor inode removal tool\n"
|
"Vitastor inode removal tool\n"
|
||||||
"(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
|
"(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
|
||||||
"USAGE:\n"
|
"USAGE:\n"
|
||||||
" %s --etcd_address <etcd_address> --pool <pool> --inode <inode> [--wait-list]\n",
|
" %s [--etcd_address <etcd_address>] --pool <pool> --inode <inode> [--wait-list]\n",
|
||||||
exe_name
|
exe_name
|
||||||
);
|
);
|
||||||
exit(0);
|
exit(0);
|
||||||
@@ -95,11 +95,6 @@ public:
|
|||||||
|
|
||||||
void run(json11::Json cfg)
|
void run(json11::Json cfg)
|
||||||
{
|
{
|
||||||
if (cfg["etcd_address"].string_value() == "")
|
|
||||||
{
|
|
||||||
fprintf(stderr, "etcd_address is missing\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
inode = cfg["inode"].uint64_value();
|
inode = cfg["inode"].uint64_value();
|
||||||
pool_id = cfg["pool"].uint64_value();
|
pool_id = cfg["pool"].uint64_value();
|
||||||
if (pool_id)
|
if (pool_id)
|
||||||
|
@@ -46,7 +46,7 @@ $ETCDCTL put /vitastor/config/inode/1/1 '{"name":"debian9@0","size":'$((2048*102
|
|||||||
$ETCDCTL put /vitastor/config/inode/1/2 '{"parent_id":1,"name":"debian9","size":'$((2048*1024*1024))'}'
|
$ETCDCTL put /vitastor/config/inode/1/2 '{"parent_id":1,"name":"debian9","size":'$((2048*1024*1024))'}'
|
||||||
|
|
||||||
qemu-system-x86_64 -enable-kvm -m 1024 \
|
qemu-system-x86_64 -enable-kvm -m 1024 \
|
||||||
-drive 'file=vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=debian9',format=raw,if=none,id=drive-virtio-disk0,cache=none \
|
-drive 'file=vitastor:etcd_host=127.0.0.1\:'$ETCD_PORT'/v3:image=debian9',format=raw,if=none,id=drive-virtio-disk0,cache=none \
|
||||||
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512 \
|
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512 \
|
||||||
-vnc 0.0.0.0:0
|
-vnc 0.0.0.0:0
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user