Compare commits
17 Commits
v0.6.1
...
rdma-zeroc
Author | SHA1 | Date | |
---|---|---|---|
eb9fc274e8 | |||
9681b62204 | |||
8faf8f7b58 | |||
ce777319c3 | |||
f8ff39b0ab | |||
d749159585 | |||
9703773a63 | |||
5d8d486f7c | |||
2b546cdd55 | |||
bd7b177707 | |||
33f9d03d22 | |||
82e6aff17b | |||
57e2c503f7 | |||
715bc8d53d | |||
0af077701c | |||
cac976ce25 | |||
acf0646542 |
@@ -2,4 +2,6 @@ cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.6.2")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
110
README-ru.md
110
README-ru.md
@@ -314,14 +314,15 @@ Ceph:
|
||||
|
||||
### NBD
|
||||
|
||||
NBD - на данный момент единственный способ монтировать Vitastor ядром Linux, но он
|
||||
приводит к дополнительным копированиям данных, поэтому немного ухудшает производительность,
|
||||
правда, в основном - линейную, а случайная затрагивается слабо.
|
||||
|
||||
NBD расшифровывается как "сетевое блочное устройство", но на самом деле оно также
|
||||
работает просто как аналог FUSE для блочных устройств, то есть, представляет собой
|
||||
"блочное устройство в пространстве пользователя".
|
||||
|
||||
NBD - на данный момент единственный способ монтировать Vitastor ядром Linux.
|
||||
NBD немного снижает производительность, так как приводит к дополнительным копированиям
|
||||
данных между ядром и пространством пользователя. Тем не менее, способ достаточно оптимален,
|
||||
а производительность случайного доступа вообще затрагивается слабо.
|
||||
|
||||
Vitastor с однопоточной NBD прокси на том же стенде:
|
||||
- T1Q1 запись: 6000 iops (задержка 0.166ms)
|
||||
- T1Q1 чтение: 5518 iops (задержка 0.18ms)
|
||||
@@ -424,23 +425,90 @@ Vitastor с однопоточной NBD прокси на том же стен
|
||||
- Запустите все OSD: `systemctl start vitastor.target`
|
||||
- Ваш кластер должен быть готов - один из мониторов должен уже сконфигурировать PG, а OSD должны запустить их.
|
||||
- Вы можете проверить состояние PG прямо в etcd: `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. Все PG должны быть 'active'.
|
||||
- Пример команды для запуска тестов: `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
|
||||
- Пример команды для заливки образа ВМ в vitastor через qemu-img:
|
||||
```
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
|
||||
```
|
||||
Если вы используете немодифицированный QEMU, данной команде потребуется переменная окружения `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so`.
|
||||
- Пример команды запуска QEMU:
|
||||
```
|
||||
qemu-system-x86_64 -enable-kvm -m 1024
|
||||
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
||||
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
||||
-vnc 0.0.0.0:0
|
||||
```
|
||||
- Пример команды удаления образа (инода) из Vitastor:
|
||||
```
|
||||
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
||||
```
|
||||
|
||||
### Задать имя образу
|
||||
|
||||
```
|
||||
etcdctl --endpoints=<etcd> put /vitastor/config/inode/<pool>/<inode> '{"name":"<name>","size":<size>[,"parent_id":<parent_inode_number>][,"readonly":true]}'
|
||||
```
|
||||
|
||||
Например:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/1 '{"name":"testimg","size":2147483648}'
|
||||
```
|
||||
|
||||
Если вы зададите parent_id, то образ станет CoW-клоном, т.е. все новые запросы записи пойдут в новый инод, а запросы
|
||||
чтения будут проверять сначала его, а потом родительские слои по цепочке вверх. Чтобы случайно не перезаписать данные
|
||||
в родительском слое, вы можете переключить его в режим "только чтение", добавив флаг `"readonly":true` в его запись
|
||||
метаданных. В таком случае родительский образ становится просто снапшотом.
|
||||
|
||||
Таким образом, для создания снапшота вам нужно просто переименовать предыдущий inode (например, из testimg в testimg@0),
|
||||
сделать его readonly и создать новый слой с исходным именем образа (testimg), ссылающийся на только что переименованный
|
||||
в качестве родительского.
|
||||
|
||||
### Запуск тестов с fio
|
||||
|
||||
Пример команды для запуска тестов:
|
||||
|
||||
```
|
||||
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
|
||||
```
|
||||
|
||||
Если вы не хотите обращаться к образу по имени, вместо `-image=testimg` можно указать номер пула, номер инода и размер:
|
||||
`-pool=1 -inode=1 -size=400G`.
|
||||
|
||||
### Загрузить образ диска ВМ в/из Vitastor
|
||||
|
||||
Используйте qemu-img и строку `vitastor:etcd_host=<HOST>:image=<IMAGE>` в качестве имени файла диска. Например:
|
||||
|
||||
```
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg'
|
||||
```
|
||||
|
||||
Обратите внимание, что если вы используете немодифицированный QEMU, потребуется установить переменную окружения
|
||||
`LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so`.
|
||||
|
||||
Если вы не хотите обращаться к образу по имени, вместо `:image=<IMAGE>` можно указать номер пула, номер инода и размер:
|
||||
`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
|
||||
|
||||
### Запустить ВМ
|
||||
|
||||
Для запуска QEMU используйте опцию `-drive file=vitastor:etcd_host=<HOST>:image=<IMAGE>` (аналогично qemu-img)
|
||||
и физический размер блока 4 KB.
|
||||
|
||||
Например:
|
||||
|
||||
```
|
||||
qemu-system-x86_64 -enable-kvm -m 1024
|
||||
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
||||
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
||||
-vnc 0.0.0.0:0
|
||||
```
|
||||
|
||||
Обращение по номерам (`:pool=<POOL>:inode=<INODE>:size=<SIZE>` вместо `:image=<IMAGE>`) работает аналогично qemu-img.
|
||||
|
||||
### Удалить образ
|
||||
|
||||
Используйте утилиту vitastor-rm. Например:
|
||||
|
||||
```
|
||||
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
||||
```
|
||||
|
||||
### NBD
|
||||
|
||||
Чтобы создать локальное блочное устройство, используйте NBD. Например:
|
||||
|
||||
```
|
||||
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||
```
|
||||
|
||||
Команда напечатает название устройства вида /dev/nbd0, которое потом можно будет форматировать
|
||||
и использовать как обычное блочное устройство.
|
||||
|
||||
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
|
||||
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
|
||||
|
||||
## Известные проблемы
|
||||
|
||||
|
98
README.md
98
README.md
@@ -379,24 +379,86 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
|
||||
For jerasure pools the configuration should look like the following: `2:{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
|
||||
- At this point, one of the monitors will configure PGs and OSDs will start them.
|
||||
- You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
|
||||
- Run tests with (for example): `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
|
||||
- Upload VM disk image with qemu-img (for example):
|
||||
```
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
|
||||
```
|
||||
Note that the command requires to be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
|
||||
if you use unmodified QEMU.
|
||||
- Run QEMU with (for example):
|
||||
```
|
||||
qemu-system-x86_64 -enable-kvm -m 1024
|
||||
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
||||
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
||||
-vnc 0.0.0.0:0
|
||||
```
|
||||
- Remove inode with (for example):
|
||||
```
|
||||
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
||||
```
|
||||
|
||||
### Name an image
|
||||
|
||||
```
|
||||
etcdctl --endpoints=<etcd> put /vitastor/config/inode/<pool>/<inode> '{"name":"<name>","size":<size>[,"parent_id":<parent_inode_number>][,"readonly":true]}'
|
||||
```
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/1 '{"name":"testimg","size":2147483648}'
|
||||
```
|
||||
|
||||
If you specify parent_id the image becomes a CoW clone. I.e. all writes go to the new inode and reads first check it
|
||||
and then the parent layers up the chain. You can then make the parent readonly by updating its entry with `"readonly":true` for safety and
|
||||
basically treat it as a snapshot.
|
||||
|
||||
So to create a snapshot you basically rename the previous upper layer (for example from testimg to testimg@0), make it readonly
|
||||
and create a new top layer with the original name (testimg) and the previous one as a parent.
|
||||
|
||||
### Run fio benchmarks
|
||||
|
||||
fio command example:
|
||||
|
||||
```
|
||||
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
|
||||
```
|
||||
|
||||
If you don't want to access your image by name, you can specify pool number, inode number and size
|
||||
(`-pool=1 -inode=1 -size=400G`) instead of the image name (`-image=testimg`).
|
||||
|
||||
### Upload VM image
|
||||
|
||||
Use qemu-img with `vitastor:etcd_host=<HOST>:image=<IMAGE>` as the disk filename. For example:
|
||||
|
||||
```
|
||||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg'
|
||||
```
|
||||
|
||||
Note that the command must be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
|
||||
if you use unmodified QEMU.
|
||||
|
||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
|
||||
if you don't want to use inode metadata.
|
||||
|
||||
### Start a VM
|
||||
|
||||
Run QEMU with `-drive file=vitastor:etcd_host=<HOST>:image=<IMAGE>` and use 4 KB physical block size.
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
qemu-system-x86_64 -enable-kvm -m 1024
|
||||
-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg',format=raw,if=none,id=drive-virtio-disk0,cache=none
|
||||
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
|
||||
-vnc 0.0.0.0:0
|
||||
```
|
||||
|
||||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`,
|
||||
just like in qemu-img.
|
||||
|
||||
### Remove inode
|
||||
|
||||
Use vitastor-rm. For example:
|
||||
|
||||
```
|
||||
vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
|
||||
```
|
||||
|
||||
### NBD
|
||||
|
||||
To create a local block device for a Vitastor image, use NBD. For example:
|
||||
|
||||
```
|
||||
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
|
||||
```
|
||||
|
||||
It will output the device name, like /dev/nbd0, which you can then format and mount as a normal block device.
|
||||
|
||||
Again, you can use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
|
||||
|
||||
## Known Problems
|
||||
|
||||
|
2
debian/changelog
vendored
2
debian/changelog
vendored
@@ -1,4 +1,4 @@
|
||||
vitastor (0.6.1-1) unstable; urgency=medium
|
||||
vitastor (0.6.2-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
|
12
debian/vitastor.Dockerfile
vendored
12
debian/vitastor.Dockerfile
vendored
@@ -40,10 +40,10 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.6.1; \
|
||||
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.1/qemu; \
|
||||
ln -s /root/fio-build/fio-*/ vitastor-0.6.1/fio; \
|
||||
cd vitastor-0.6.1; \
|
||||
cp -r /root/vitastor vitastor-0.6.2; \
|
||||
ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.2/qemu; \
|
||||
ln -s /root/fio-build/fio-*/ vitastor-0.6.2/fio; \
|
||||
cd vitastor-0.6.2; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
sh copy-qemu-includes.sh; \
|
||||
@@ -59,8 +59,8 @@ RUN set -e -x; \
|
||||
echo "dep:fio=$FIO" > debian/substvars; \
|
||||
echo "dep:qemu=$QEMU" >> debian/substvars; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.1.orig.tar.xz vitastor-0.6.1; \
|
||||
cd vitastor-0.6.1; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.2.orig.tar.xz vitastor-0.6.2; \
|
||||
cd vitastor-0.6.2; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
|
||||
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.6.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.1$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.6.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.2$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -37,7 +37,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.6.1.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.6.2.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.6.1
|
||||
Version: 0.6.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.6.1.el7.tar.gz
|
||||
Source0: vitastor-0.6.2.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.6.1.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.6.2.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.6.1
|
||||
Version: 0.6.2
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.6.1.el8.tar.gz
|
||||
Source0: vitastor-0.6.2.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -13,7 +13,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.6.1")
|
||||
add_definitions(-DVERSION="0.6.2")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -37,11 +37,16 @@ string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CM
|
||||
find_package(PkgConfig)
|
||||
pkg_check_modules(LIBURING REQUIRED liburing)
|
||||
pkg_check_modules(GLIB REQUIRED glib-2.0)
|
||||
pkg_check_modules(IBVERBS libibverbs)
|
||||
if (IBVERBS_LIBRARIES)
|
||||
add_definitions(-DWITH_RDMA)
|
||||
endif (IBVERBS_LIBRARIES)
|
||||
|
||||
include_directories(
|
||||
../
|
||||
/usr/include/jerasure
|
||||
${LIBURING_INCLUDE_DIRS}
|
||||
${IBVERBS_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
# libvitastor_blk.so
|
||||
@@ -52,7 +57,10 @@ add_library(vitastor_blk SHARED
|
||||
target_link_libraries(vitastor_blk
|
||||
${LIBURING_LIBRARIES}
|
||||
tcmalloc_minimal
|
||||
# for timerfd_manager
|
||||
vitastor_common
|
||||
)
|
||||
set_target_properties(vitastor_blk PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||
|
||||
# libfio_vitastor_blk.so
|
||||
add_library(fio_vitastor_blk SHARED
|
||||
@@ -63,16 +71,28 @@ target_link_libraries(fio_vitastor_blk
|
||||
vitastor_blk
|
||||
)
|
||||
|
||||
# libvitastor_common.a
|
||||
add_library(vitastor_common STATIC
|
||||
epoll_manager.cpp etcd_state_client.cpp
|
||||
messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
||||
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
|
||||
)
|
||||
if (IBVERBS_LIBRARIES)
|
||||
target_sources(vitastor_common PRIVATE msgr_rdma.cpp)
|
||||
endif (IBVERBS_LIBRARIES)
|
||||
target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||
|
||||
# vitastor-osd
|
||||
add_executable(vitastor-osd
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp msgr_receive.cpp msgr_send.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||
etcd_state_client.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
|
||||
osd_rmw.cpp base64.cpp timerfd_manager.cpp epoll_manager.cpp ../json11/json11.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-osd
|
||||
vitastor_common
|
||||
vitastor_blk
|
||||
Jerasure
|
||||
${IBVERBS_LIBRARIES}
|
||||
)
|
||||
|
||||
# libfio_vitastor_sec.so
|
||||
@@ -86,14 +106,15 @@ target_link_libraries(fio_vitastor_sec
|
||||
|
||||
# libvitastor_client.so
|
||||
add_library(vitastor_client SHARED
|
||||
cluster_client.cpp epoll_manager.cpp etcd_state_client.cpp
|
||||
messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
|
||||
http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
|
||||
cluster_client.cpp
|
||||
)
|
||||
target_link_libraries(vitastor_client
|
||||
vitastor_common
|
||||
tcmalloc_minimal
|
||||
${LIBURING_LIBRARIES}
|
||||
${IBVERBS_LIBRARIES}
|
||||
)
|
||||
set_target_properties(vitastor_client PROPERTIES VERSION ${VERSION} SOVERSION 0)
|
||||
|
||||
# libfio_vitastor.so
|
||||
add_library(fio_vitastor SHARED
|
||||
@@ -162,11 +183,12 @@ target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal)
|
||||
|
||||
# stub_uring_osd
|
||||
add_executable(stub_uring_osd
|
||||
stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp
|
||||
msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
stub_uring_osd.cpp
|
||||
)
|
||||
target_link_libraries(stub_uring_osd
|
||||
vitastor_common
|
||||
${LIBURING_LIBRARIES}
|
||||
${IBVERBS_LIBRARIES}
|
||||
tcmalloc_minimal
|
||||
)
|
||||
|
||||
|
@@ -146,6 +146,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
|
||||
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
|
||||
data->iov = { 0 };
|
||||
data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
|
||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
|
||||
PRIV(op)->pending_ops = 1;
|
||||
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
|
||||
return 1;
|
||||
|
@@ -53,6 +53,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
msgr.stop_client(op->peer_fd);
|
||||
delete op;
|
||||
};
|
||||
msgr.parse_config(this->config);
|
||||
msgr.init();
|
||||
|
||||
st_cli.tfd = tfd;
|
||||
@@ -108,6 +109,115 @@ cluster_op_t::~cluster_op_t()
|
||||
}
|
||||
}
|
||||
|
||||
// Counts how many operations queued before <op> it has to wait for, stores
// the result in op->prev_wait and, when the count is zero and PGs are loaded,
// starts the operation immediately.
//
// Blocking rules, as implemented below:
// - WRITE waits for every earlier SYNC, and a non-flush WRITE additionally
//   waits for every earlier buffer-flush WRITE;
// - SYNC waits for every earlier SYNC and WRITE;
// - any other opcode waits only for the buffer-flush WRITEs found before
//   the first regular WRITE or READ when walking backwards.
void cluster_client_t::calc_wait(cluster_op_t *op)
{
    int blockers = 0;
    const bool is_sync = (op->opcode == OSD_OP_SYNC);
    if (op->opcode == OSD_OP_WRITE)
    {
        const bool own_flush = (op->flags & OP_FLUSH_BUFFER) != 0;
        for (cluster_op_t *earlier = op->prev; earlier; earlier = earlier->prev)
        {
            const bool by_sync = (earlier->opcode == OSD_OP_SYNC);
            const bool by_flush = (earlier->opcode == OSD_OP_WRITE) && !own_flush &&
                (earlier->flags & OP_FLUSH_BUFFER);
            if (by_sync || by_flush)
            {
                blockers++;
            }
        }
    }
    else if (is_sync)
    {
        for (cluster_op_t *earlier = op->prev; earlier; earlier = earlier->prev)
        {
            if (earlier->opcode == OSD_OP_WRITE || earlier->opcode == OSD_OP_SYNC)
            {
                blockers++;
            }
        }
    }
    else
    {
        for (cluster_op_t *earlier = op->prev; earlier; earlier = earlier->prev)
        {
            if (earlier->opcode == OSD_OP_WRITE)
            {
                if (!(earlier->flags & OP_FLUSH_BUFFER))
                {
                    // Flushes are always in the beginning
                    break;
                }
                blockers++;
            }
            else if (earlier->opcode == OSD_OP_READ)
            {
                // Flushes are always in the beginning
                break;
            }
        }
    }
    // Publish the counter before possibly starting the operation
    op->prev_wait = blockers;
    if (blockers || !pgs_loaded)
    {
        return;
    }
    if (is_sync)
        continue_sync(op);
    else
        continue_rw(op);
}
|
||||
|
||||
void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
|
||||
{
|
||||
if (opcode == OSD_OP_WRITE)
|
||||
{
|
||||
while (next)
|
||||
{
|
||||
auto n2 = next->next;
|
||||
if (next->opcode == OSD_OP_SYNC ||
|
||||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
|
||||
next->opcode == OSD_OP_READ && (flags & OP_FLUSH_BUFFER))
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
if (!next->prev_wait)
|
||||
{
|
||||
if (next->opcode == OSD_OP_SYNC)
|
||||
continue_sync(next);
|
||||
else
|
||||
continue_rw(next);
|
||||
}
|
||||
}
|
||||
next = n2;
|
||||
}
|
||||
}
|
||||
else if (opcode == OSD_OP_SYNC)
|
||||
{
|
||||
while (next)
|
||||
{
|
||||
auto n2 = next->next;
|
||||
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
|
||||
{
|
||||
next->prev_wait += inc;
|
||||
if (!next->prev_wait)
|
||||
{
|
||||
if (next->opcode == OSD_OP_SYNC)
|
||||
continue_sync(next);
|
||||
else
|
||||
continue_rw(next);
|
||||
}
|
||||
}
|
||||
next = n2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Unlinks <op> from the doubly-linked operation queue, invokes its callback
// and then, unless immediate_commit is set, decrements the wait counters of
// the operations that were queued after it.
void cluster_client_t::erase_op(cluster_op_t *op)
{
    // Everything needed after the callback is copied up front, because
    // <op> is not touched again once the callback has run
    const uint64_t opcode = op->opcode, flags = op->flags;
    cluster_op_t *before = op->prev;
    cluster_op_t *after = op->next;
    // Detach from the neighbours and fix the queue ends
    if (before)
        before->next = after;
    if (after)
        after->prev = before;
    if (op_queue_head == op)
        op_queue_head = after;
    if (op_queue_tail == op)
        op_queue_tail = before;
    op->prev = NULL;
    op->next = NULL;
    {
        // Call through a copy of the callback rather than the member itself
        std::function<void(cluster_op_t*)> cb(op->callback);
        cb(op);
    }
    if (!immediate_commit)
        inc_wait(opcode, flags, after, -1);
}
|
||||
|
||||
void cluster_client_t::continue_ops(bool up_retry)
|
||||
{
|
||||
if (!pgs_loaded)
|
||||
@@ -118,60 +228,25 @@ void cluster_client_t::continue_ops(bool up_retry)
|
||||
if (continuing_ops)
|
||||
{
|
||||
// Attempt to reenter the function
|
||||
continuing_ops = 2;
|
||||
return;
|
||||
}
|
||||
restart:
|
||||
continuing_ops = 1;
|
||||
op_queue_pos = 0;
|
||||
bool has_flushes = false, has_writes = false;
|
||||
while (op_queue_pos < op_queue.size())
|
||||
for (auto op = op_queue_head; op; )
|
||||
{
|
||||
auto op = op_queue[op_queue_pos];
|
||||
bool rm = false, is_flush = op->flags & OP_FLUSH_BUFFER;
|
||||
auto opcode = op->opcode;
|
||||
cluster_op_t *next_op = op->next;
|
||||
if (!op->up_wait || up_retry)
|
||||
{
|
||||
op->up_wait = false;
|
||||
if (opcode == OSD_OP_READ || opcode == OSD_OP_WRITE)
|
||||
if (!op->prev_wait)
|
||||
{
|
||||
if (is_flush || !has_flushes)
|
||||
{
|
||||
// Regular writes can't proceed before buffer flushes
|
||||
rm = continue_rw(op);
|
||||
}
|
||||
}
|
||||
else if (opcode == OSD_OP_SYNC)
|
||||
{
|
||||
if (!has_writes)
|
||||
{
|
||||
// SYNC can't proceed before previous writes
|
||||
rm = continue_sync(op);
|
||||
}
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
}
|
||||
}
|
||||
if (opcode == OSD_OP_WRITE)
|
||||
{
|
||||
has_writes = has_writes || !rm;
|
||||
if (is_flush)
|
||||
{
|
||||
has_flushes = has_writes || !rm;
|
||||
}
|
||||
}
|
||||
else if (opcode == OSD_OP_SYNC)
|
||||
{
|
||||
// Postpone writes until previous SYNC completes
|
||||
// ...so dirty_writes can't contain anything newer than SYNC
|
||||
has_flushes = has_writes || !rm;
|
||||
}
|
||||
if (rm)
|
||||
{
|
||||
op_queue.erase(op_queue.begin()+op_queue_pos, op_queue.begin()+op_queue_pos+1);
|
||||
}
|
||||
else
|
||||
{
|
||||
op_queue_pos++;
|
||||
}
|
||||
op = next_op;
|
||||
if (continuing_ops == 2)
|
||||
{
|
||||
goto restart;
|
||||
@@ -213,11 +288,8 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
||||
{
|
||||
throw std::runtime_error("Bad block size");
|
||||
}
|
||||
if (config["immediate_commit"] == "all")
|
||||
{
|
||||
// Cluster-wide immediate_commit mode
|
||||
immediate_commit = true;
|
||||
}
|
||||
// Cluster-wide immediate_commit mode
|
||||
immediate_commit = (config["immediate_commit"] == "all");
|
||||
if (config.find("client_max_dirty_bytes") != config.end())
|
||||
{
|
||||
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
||||
@@ -281,7 +353,7 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
{
|
||||
// At this point, all pool operations should have been suspended
|
||||
// And now they have to be resliced!
|
||||
for (auto op: op_queue)
|
||||
for (auto op = op_queue_head; op; op = op->next)
|
||||
{
|
||||
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
|
||||
INODE_POOL(op->cur_inode) == pool_item.first)
|
||||
@@ -362,9 +434,17 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||
{
|
||||
delete sync_op;
|
||||
};
|
||||
op_queue.push_back(sync_op);
|
||||
sync_op->prev = op_queue_tail;
|
||||
if (op_queue_tail)
|
||||
{
|
||||
op_queue_tail->next = sync_op;
|
||||
op_queue_tail = sync_op;
|
||||
}
|
||||
else
|
||||
op_queue_tail = op_queue_head = sync_op;
|
||||
dirty_bytes = 0;
|
||||
dirty_ops = 0;
|
||||
calc_wait(sync_op);
|
||||
}
|
||||
dirty_bytes += op->len;
|
||||
dirty_ops++;
|
||||
@@ -374,8 +454,23 @@ void cluster_client_t::execute(cluster_op_t *op)
|
||||
dirty_bytes = 0;
|
||||
dirty_ops = 0;
|
||||
}
|
||||
op_queue.push_back(op);
|
||||
continue_ops();
|
||||
op->prev = op_queue_tail;
|
||||
if (op_queue_tail)
|
||||
{
|
||||
op_queue_tail->next = op;
|
||||
op_queue_tail = op;
|
||||
}
|
||||
else
|
||||
op_queue_tail = op_queue_head = op;
|
||||
if (!immediate_commit)
|
||||
calc_wait(op);
|
||||
else if (pgs_loaded)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
}
|
||||
}
|
||||
|
||||
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
|
||||
@@ -474,12 +569,16 @@ void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
|
||||
}
|
||||
delete op;
|
||||
};
|
||||
op_queue.insert(op_queue.begin(), op);
|
||||
if (continuing_ops)
|
||||
op->next = op_queue_head;
|
||||
if (op_queue_head)
|
||||
{
|
||||
continuing_ops = 2;
|
||||
op_queue_pos++;
|
||||
op_queue_head->prev = op;
|
||||
op_queue_head = op;
|
||||
}
|
||||
else
|
||||
op_queue_tail = op_queue_head = op;
|
||||
inc_wait(op->opcode, op->flags, op->next, 1);
|
||||
continue_rw(op);
|
||||
}
|
||||
|
||||
int cluster_client_t::continue_rw(cluster_op_t *op)
|
||||
@@ -496,7 +595,7 @@ resume_0:
|
||||
if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
{
|
||||
@@ -504,7 +603,7 @@ resume_0:
|
||||
if (!pool_id)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
|
||||
@@ -520,7 +619,7 @@ resume_0:
|
||||
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
|
||||
{
|
||||
op->retval = -EINVAL;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
|
||||
@@ -603,13 +702,13 @@ resume_3:
|
||||
}
|
||||
}
|
||||
op->retval = op->len;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
else if (op->retval != 0 && op->retval != -EPIPE)
|
||||
{
|
||||
// Fatal error (not -EPIPE)
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
@@ -849,7 +948,7 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
|
||||
{
|
||||
// Sync is not required in the immediate_commit mode or if there are no dirty_osds
|
||||
op->retval = 0;
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
// Check that all OSD connections are still alive
|
||||
@@ -924,7 +1023,7 @@ resume_1:
|
||||
uw_it++;
|
||||
}
|
||||
}
|
||||
std::function<void(cluster_op_t*)>(op->callback)(op);
|
||||
erase_op(op);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1008,7 +1107,10 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
}
|
||||
if (op->inflight_count == 0)
|
||||
{
|
||||
continue_ops();
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -36,7 +36,7 @@ struct cluster_op_t
|
||||
std::function<void(cluster_op_t*)> callback;
|
||||
~cluster_op_t();
|
||||
protected:
|
||||
int flags = 0;
|
||||
uint64_t flags = 0;
|
||||
int state = 0;
|
||||
uint64_t cur_inode; // for snapshot reads
|
||||
void *buf = NULL;
|
||||
@@ -47,6 +47,8 @@ protected:
|
||||
std::vector<cluster_op_part_t> parts;
|
||||
void *bitmap_buf = NULL, *part_bitmaps = NULL;
|
||||
unsigned bitmap_buf_size = 0;
|
||||
cluster_op_t *prev = NULL, *next = NULL;
|
||||
int prev_wait = 0;
|
||||
friend class cluster_client_t;
|
||||
};
|
||||
|
||||
@@ -66,7 +68,8 @@ class cluster_client_t
|
||||
uint64_t bs_block_size = 0;
|
||||
uint32_t bs_bitmap_granularity = 0, bs_bitmap_size = 0;
|
||||
std::map<pool_id_t, uint64_t> pg_counts;
|
||||
bool immediate_commit = false;
|
||||
// WARNING: initially true so execute() doesn't create fake sync
|
||||
bool immediate_commit = true;
|
||||
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
|
||||
uint64_t client_max_dirty_bytes = 0;
|
||||
uint64_t client_max_dirty_ops = 0;
|
||||
@@ -76,7 +79,7 @@ class cluster_client_t
|
||||
int retry_timeout_id = 0;
|
||||
uint64_t op_id = 1;
|
||||
std::vector<cluster_op_t*> offline_ops;
|
||||
std::vector<cluster_op_t*> op_queue;
|
||||
cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
|
||||
std::map<object_id, cluster_buffer_t> dirty_buffers;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
uint64_t dirty_bytes = 0, dirty_ops = 0;
|
||||
@@ -88,7 +91,6 @@ class cluster_client_t
|
||||
ring_consumer_t consumer;
|
||||
std::vector<std::function<void(void)>> on_ready_hooks;
|
||||
int continuing_ops = 0;
|
||||
int op_queue_pos = 0;
|
||||
|
||||
public:
|
||||
etcd_state_client_t st_cli;
|
||||
@@ -117,4 +119,7 @@ protected:
|
||||
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
|
||||
void handle_op_part(cluster_op_part_t *part);
|
||||
void copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part);
|
||||
void erase_op(cluster_op_t *op);
|
||||
void calc_wait(cluster_op_t *op);
|
||||
void inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc);
|
||||
};
|
||||
|
@@ -53,6 +53,10 @@ struct sec_options
|
||||
uint64_t inode = 0;
|
||||
int cluster_log = 0;
|
||||
int trace = 0;
|
||||
int use_rdma = 0;
|
||||
int rdma_port_num = 0;
|
||||
int rdma_gid_index = 0;
|
||||
int rdma_mtu = 0;
|
||||
};
|
||||
|
||||
static struct fio_option options[] = {
|
||||
@@ -121,6 +125,26 @@ static struct fio_option options[] = {
|
||||
.category = FIO_OPT_C_ENGINE,
|
||||
.group = FIO_OPT_G_FILENAME,
|
||||
},
|
||||
{
|
||||
.name = "use_rdma",
|
||||
.lname = "Use RDMA",
|
||||
.type = FIO_OPT_BOOL,
|
||||
.off1 = offsetof(struct sec_options, use_rdma),
|
||||
.help = "Use RDMA",
|
||||
.def = "0",
|
||||
.category = FIO_OPT_C_ENGINE,
|
||||
.group = FIO_OPT_G_FILENAME,
|
||||
},
|
||||
{
|
||||
.name = "rdma_gid_index",
|
||||
.lname = "RDMA gid index",
|
||||
.type = FIO_OPT_INT,
|
||||
.off1 = offsetof(struct sec_options, rdma_gid_index),
|
||||
.help = "RDMA gid index",
|
||||
.def = "0",
|
||||
.category = FIO_OPT_C_ENGINE,
|
||||
.group = FIO_OPT_G_FILENAME,
|
||||
},
|
||||
{
|
||||
.name = NULL,
|
||||
},
|
||||
@@ -156,6 +180,8 @@ static int sec_setup(struct thread_data *td)
|
||||
{ "etcd_address", std::string(o->etcd_host) },
|
||||
{ "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/vitastor") },
|
||||
{ "log_level", o->cluster_log },
|
||||
{ "use_rdma", o->use_rdma },
|
||||
{ "rdma_gid_index", o->rdma_gid_index },
|
||||
};
|
||||
|
||||
if (!o->image)
|
||||
|
@@ -12,6 +12,31 @@
|
||||
|
||||
void osd_messenger_t::init()
|
||||
{
|
||||
#ifdef WITH_RDMA
|
||||
if (use_rdma)
|
||||
{
|
||||
rdma_context = msgr_rdma_context_t::create(
|
||||
rdma_device != "" ? rdma_device.c_str() : NULL,
|
||||
rdma_port_num, rdma_gid_index, rdma_mtu
|
||||
);
|
||||
if (!rdma_context)
|
||||
{
|
||||
printf("[OSD %lu] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
|
||||
}
|
||||
else
|
||||
{
|
||||
rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
|
||||
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
||||
printf("[OSD %lu] RDMA initialized successfully\n", osd_num);
|
||||
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
||||
{
|
||||
handle_rdma_events();
|
||||
});
|
||||
handle_rdma_events();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||
{
|
||||
std::vector<int> to_stop;
|
||||
@@ -19,7 +44,7 @@ void osd_messenger_t::init()
|
||||
for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
|
||||
{
|
||||
auto cl = cl_it->second;
|
||||
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED)
|
||||
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
|
||||
{
|
||||
// Do not run keepalive on regular clients
|
||||
continue;
|
||||
@@ -94,10 +119,29 @@ osd_messenger_t::~osd_messenger_t()
|
||||
{
|
||||
stop_client(clients.begin()->first, true);
|
||||
}
|
||||
#ifdef WITH_RDMA
|
||||
if (rdma_context)
|
||||
{
|
||||
delete rdma_context;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
{
|
||||
#ifdef WITH_RDMA
|
||||
if (!config["use_rdma"].is_null())
|
||||
this->use_rdma = config["use_rdma"].bool_value() || config["use_rdma"].uint64_value() != 0;
|
||||
this->rdma_device = config["rdma_device"].string_value();
|
||||
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
|
||||
if (!this->rdma_port_num)
|
||||
this->rdma_port_num = 1;
|
||||
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
|
||||
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
|
||||
#endif
|
||||
this->bs_bitmap_granularity = strtoull(config["bitmap_granularity"].string_value().c_str(), NULL, 10);
|
||||
if (!this->bs_bitmap_granularity)
|
||||
this->bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
||||
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
||||
config["use_sync_send_recv"].uint64_value();
|
||||
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
||||
@@ -326,6 +370,37 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||
},
|
||||
},
|
||||
};
|
||||
#ifdef WITH_RDMA
|
||||
if (rdma_context)
|
||||
{
|
||||
for (int i = 0; i < rdma_queues_per_connection; i++)
|
||||
{
|
||||
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge);
|
||||
if (!rdma_conn)
|
||||
{
|
||||
break;
|
||||
}
|
||||
cl->rdma_queues.push_back(rdma_conn);
|
||||
}
|
||||
if (cl->rdma_queues.size())
|
||||
{
|
||||
json11::Json::array addresses;
|
||||
for (auto rdma_conn: cl->rdma_queues)
|
||||
{
|
||||
addresses.push_back(rdma_conn->addr.to_string());
|
||||
}
|
||||
json11::Json payload = json11::Json::object {
|
||||
{ "rdma_queues", addresses },
|
||||
{ "rdma_max_sge", rdma_max_sge },
|
||||
};
|
||||
std::string payload_str = payload.dump();
|
||||
op->req.show_conf.json_len = payload_str.size();
|
||||
op->buf = malloc_or_die(payload_str.size());
|
||||
op->iov.push_back(op->buf, payload_str.size());
|
||||
memcpy(op->buf, payload_str.c_str(), payload_str.size());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
op->callback = [this, cl](osd_op_t *op)
|
||||
{
|
||||
std::string json_err;
|
||||
@@ -361,12 +436,20 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
||||
}
|
||||
if (err)
|
||||
{
|
||||
osd_num_t osd_num = cl->osd_num;
|
||||
osd_num_t peer_osd = cl->osd_num;
|
||||
stop_client(op->peer_fd);
|
||||
on_connect_peer(osd_num, -1);
|
||||
on_connect_peer(peer_osd, -1);
|
||||
delete op;
|
||||
return;
|
||||
}
|
||||
#ifdef WITH_RDMA
|
||||
if (!connect_rdma_server(cl, config["rdma_queues"], config["rdma_max_sge"].uint64_value()))
|
||||
{
|
||||
// FIXME: Keep TCP connection in this case
|
||||
delete op;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
osd_peer_fds[cl->osd_num] = cl->peer_fd;
|
||||
on_connect_peer(cl->osd_num, cl->peer_fd);
|
||||
delete op;
|
||||
@@ -408,3 +491,8 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
||||
throw std::runtime_error(std::string("accept: ") + strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WITH_RDMA
// Tells whether this messenger currently holds an RDMA context.
// rdma_context stays NULL when RDMA is disabled or when
// msgr_rdma_context_t::create() failed during init().
//
// The definition is guarded by WITH_RDMA because both the declaration of this
// method and the rdma_context member only exist in RDMA-enabled builds;
// without the guard a build without RDMA support fails to compile.
bool osd_messenger_t::is_rdma_enabled()
{
    return rdma_context != NULL;
}
#endif
|
||||
|
@@ -18,21 +18,36 @@
|
||||
#include "timerfd_manager.h"
|
||||
#include <ringloop.h>
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
#include "msgr_rdma.h"
|
||||
#endif
|
||||
|
||||
#define CL_READ_HDR 1
|
||||
#define CL_READ_DATA 2
|
||||
#define CL_READ_REPLY_DATA 3
|
||||
#define CL_WRITE_READY 1
|
||||
#define CL_WRITE_REPLY 2
|
||||
|
||||
#define PEER_CONNECTING 1
|
||||
#define PEER_CONNECTED 2
|
||||
#define PEER_STOPPED 3
|
||||
#define PEER_RDMA_CONNECTING 3
|
||||
#define PEER_RDMA 4
|
||||
#define PEER_STOPPED 5
|
||||
|
||||
#define DEFAULT_PEER_CONNECT_INTERVAL 5
|
||||
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
|
||||
#define DEFAULT_OSD_PING_TIMEOUT 5
|
||||
#define DEFAULT_BITMAP_GRANULARITY 4096
|
||||
|
||||
#define MSGR_SENDP_HDR 1
|
||||
#define MSGR_SENDP_FREE 2
|
||||
#define MSGR_SENDP_BMP 4
|
||||
|
||||
// Element type of osd_client_t::outbox / next_outbox.
// NOTE(review): entries appear to be kept in step with send_list (both are
// erased together with the same count in try_send_rdma) - confirm.
struct msgr_sendp_t
|
||||
{
|
||||
// Operation this outbox entry belongs to
osd_op_t *op;
|
||||
// Bitmask of MSGR_SENDP_* flags (MSGR_SENDP_HDR, MSGR_SENDP_FREE, MSGR_SENDP_BMP)
int flags;
|
||||
};
|
||||
|
||||
struct osd_client_t
|
||||
{
|
||||
int refs = 0;
|
||||
@@ -48,6 +63,10 @@ struct osd_client_t
|
||||
|
||||
void *in_buf = NULL;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
std::vector<msgr_rdma_connection_t*> rdma_queues;
|
||||
#endif
|
||||
|
||||
// Read state
|
||||
int read_ready = 0;
|
||||
osd_op_t *read_op = NULL;
|
||||
@@ -70,7 +89,7 @@ struct osd_client_t
|
||||
msghdr write_msg = { 0 };
|
||||
int write_state = 0;
|
||||
std::vector<iovec> send_list, next_send_list;
|
||||
std::vector<osd_op_t*> outbox, next_outbox;
|
||||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||
|
||||
~osd_client_t()
|
||||
{
|
||||
@@ -110,9 +129,19 @@ protected:
|
||||
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
|
||||
int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
||||
int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
||||
uint32_t bs_bitmap_granularity = 0;
|
||||
int log_level = 0;
|
||||
bool use_sync_send_recv = false;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
bool use_rdma = true;
|
||||
std::string rdma_device;
|
||||
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
|
||||
msgr_rdma_context_t *rdma_context = NULL;
|
||||
int rdma_queues_per_connection = 128;
|
||||
int rdma_max_sge = 128, rdma_max_send = 32, rdma_max_recv = 32;
|
||||
#endif
|
||||
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
@@ -141,6 +170,12 @@ public:
|
||||
void accept_connections(int listen_fd);
|
||||
~osd_messenger_t();
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
bool is_rdma_enabled();
|
||||
bool connect_rdma_client(osd_client_t *cl, json11::Json rdma_addresses, uint64_t client_max_sge);
|
||||
int get_rdma_max_sge();
|
||||
#endif
|
||||
|
||||
protected:
|
||||
void try_connect_peer(uint64_t osd_num);
|
||||
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
|
||||
@@ -160,4 +195,11 @@ protected:
|
||||
void handle_op_hdr(osd_client_t *cl);
|
||||
bool handle_reply_hdr(osd_client_t *cl);
|
||||
void handle_reply_ready(osd_op_t *op);
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
void try_send_rdma(osd_client_t *cl);
|
||||
void try_recv_rdma(osd_client_t *cl, msgr_rdma_connection_t *rc);
|
||||
void handle_rdma_events();
|
||||
bool connect_rdma_server(osd_client_t *cl, json11::Json rdma_addresses, uint64_t server_max_sge);
|
||||
#endif
|
||||
};
|
||||
|
962
src/msgr_rdma.cpp
Normal file
962
src/msgr_rdma.cpp
Normal file
@@ -0,0 +1,962 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "msgr_rdma.h"
|
||||
#include "messenger.h"
|
||||
|
||||
std::string msgr_rdma_address_t::to_string()
|
||||
{
|
||||
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
|
||||
sprintf(
|
||||
msg, "%04x:%06x:%06x:%016lx%016lx", lid, qpn, psn,
|
||||
htobe64(((uint64_t*)&gid)[0]), htobe64(((uint64_t*)&gid)[1])
|
||||
);
|
||||
return std::string(msg);
|
||||
}
|
||||
|
||||
bool msgr_rdma_address_t::from_string(const char *str, msgr_rdma_address_t *dest)
|
||||
{
|
||||
uint64_t* gid = (uint64_t*)&dest->gid;
|
||||
int n = sscanf(
|
||||
str, "%hx:%x:%x:%16lx%16lx", &dest->lid, &dest->qpn, &dest->psn, gid, gid+1
|
||||
);
|
||||
gid[0] = be64toh(gid[0]);
|
||||
gid[1] = be64toh(gid[1]);
|
||||
return n == 5;
|
||||
}
|
||||
|
||||
// Releases every verbs object owned by the context that was actually created.
// Order: completion queue, completion channel, memory region,
// protection domain and finally the device context itself.
msgr_rdma_context_t::~msgr_rdma_context_t()
{
    if (cq != NULL)
    {
        ibv_destroy_cq(cq);
    }
    if (channel != NULL)
    {
        ibv_destroy_comp_channel(channel);
    }
    if (mr != NULL)
    {
        ibv_dereg_mr(mr);
    }
    if (pd != NULL)
    {
        ibv_dealloc_pd(pd);
    }
    if (context != NULL)
    {
        ibv_close_device(context);
    }
}
|
||||
|
||||
// Returns the completion queue entries reserved for this connection
// to the shared context and destroys the queue pair if it was created.
msgr_rdma_connection_t::~msgr_rdma_connection_t()
{
    // Undo the reservation made in msgr_rdma_connection_t::create()
    ctx->used_max_cqe -= (max_send + max_recv);
    if (qp != NULL)
    {
        ibv_destroy_qp(qp);
    }
}
|
||||
|
||||
// Opens an RDMA device and prepares everything shared by all connections:
// port/GID information, protection domain, an implicit on-demand-paging memory
// region covering the whole address space, a completion channel and a
// completion queue.
//
// ib_devname - device name to use, or NULL to take the first device in the list
// ib_port    - port number of the device
// gid_index  - index of the GID to read from that port
// mtu        - MTU stored in the context for later use by connections
//
// Returns the new context, or NULL after printing the reason to stderr.
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu)
{
    ibv_device **devices = NULL;
    msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
    ctx->mtu = mtu;

    // Shared failure path: destroy the partially built context
    // (its destructor releases whatever was created) and free the device list
    auto fail = [&]() -> msgr_rdma_context_t*
    {
        delete ctx;
        if (devices)
            ibv_free_device_list(devices);
        return NULL;
    };

    devices = ibv_get_device_list(NULL);
    if (!devices)
    {
        fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
        return fail();
    }

    if (ib_devname)
    {
        // Look the requested device up by name; the list is NULL-terminated
        ibv_device **cur = devices;
        while (*cur && strcmp(ibv_get_device_name(*cur), ib_devname) != 0)
        {
            cur++;
        }
        ctx->dev = *cur;
        if (!ctx->dev)
        {
            fprintf(stderr, "RDMA device %s not found\n", ib_devname);
            return fail();
        }
    }
    else
    {
        // No name given - take the first device
        ctx->dev = devices[0];
        if (!ctx->dev)
        {
            fprintf(stderr, "No RDMA devices found\n");
            return fail();
        }
    }

    ctx->context = ibv_open_device(ctx->dev);
    if (!ctx->context)
    {
        fprintf(stderr, "Couldn't get RDMA context for %s\n", ibv_get_device_name(ctx->dev));
        return fail();
    }

    ctx->ib_port = ib_port;
    ctx->gid_index = gid_index;
    int res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo);
    if (res != 0)
    {
        fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
        return fail();
    }
    ctx->my_lid = ctx->portinfo.lid;
    // A zero LID is only acceptable on Ethernet link layer
    if (!ctx->my_lid && ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET)
    {
        fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
        return fail();
    }
    if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
    {
        fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
        return fail();
    }

    ctx->pd = ibv_alloc_pd(ctx->context);
    if (!ctx->pd)
    {
        fprintf(stderr, "Couldn't allocate RDMA protection domain\n");
        return fail();
    }

    if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
    {
        fprintf(stderr, "Couldn't query RDMA device for its features\n");
        return fail();
    }
    // Implicit ODP with RC send+receive is mandatory for the memory region below
    const bool odp_supported =
        (ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) &&
        (ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) &&
        (ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) &&
        (ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV);
    if (!odp_supported)
    {
        fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
        return fail();
    }

    // Implicit ODP region: NULL address with SIZE_MAX length
    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
    if (!ctx->mr)
    {
        fprintf(stderr, "Couldn't register RDMA memory region\n");
        return fail();
    }

    ctx->channel = ibv_create_comp_channel(ctx->context);
    if (!ctx->channel)
    {
        fprintf(stderr, "Couldn't create RDMA completion channel\n");
        return fail();
    }

    // Initial CQ size; msgr_rdma_connection_t::create() resizes it when needed
    ctx->max_cqe = 4096;
    ctx->cq = ibv_create_cq(ctx->context, ctx->max_cqe, NULL, ctx->channel, 0);
    if (!ctx->cq)
    {
        fprintf(stderr, "Couldn't create RDMA completion queue\n");
        return fail();
    }

    if (devices)
        ibv_free_device_list(devices);
    return ctx;
}
|
||||
|
||||
// Creates a reliable-connected (RC) queue pair on the shared context and
// switches it to the INIT state. The local address (LID, GID, QP number and a
// random 24-bit PSN) is stored in conn->addr so it can be sent to the peer.
//
// max_send / max_recv - work request limits of the queue pair
// max_sge             - scatter/gather limit, clamped to the device maximum
//
// Returns the new connection, or NULL after printing the reason to stderr.
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge)
{
    msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;

    if (max_sge > ctx->attrx.orig_attr.max_sge)
    {
        max_sge = ctx->attrx.orig_attr.max_sge;
    }

    conn->ctx = ctx;
    conn->max_send = max_send;
    conn->max_recv = max_recv;
    conn->max_sge = max_sge;

    // Reserve CQ entries for this connection; the destructor gives them back
    ctx->used_max_cqe += max_send+max_recv;
    if (ctx->used_max_cqe > ctx->max_cqe)
    {
        // Resize CQ
        // Mellanox ConnectX-4 supports up to 4194303 CQEs, so it's fine to put everything into a single CQ
        int grown_cqe = ctx->max_cqe;
        do
        {
            grown_cqe *= 2;
        } while (ctx->used_max_cqe > grown_cqe);
        if (ibv_resize_cq(ctx->cq, grown_cqe) != 0)
        {
            fprintf(stderr, "Couldn't resize RDMA completion queue to %d entries\n", grown_cqe);
            delete conn;
            return NULL;
        }
        ctx->max_cqe = grown_cqe;
    }

    // Send and receive completions both go to the shared CQ
    ibv_qp_init_attr init_attr = {};
    init_attr.send_cq = ctx->cq;
    init_attr.recv_cq = ctx->cq;
    init_attr.cap.max_send_wr = max_send;
    init_attr.cap.max_recv_wr = max_recv;
    init_attr.cap.max_send_sge = max_sge;
    init_attr.cap.max_recv_sge = max_sge;
    init_attr.qp_type = IBV_QPT_RC;
    conn->qp = ibv_create_qp(ctx->pd, &init_attr);
    if (!conn->qp)
    {
        fprintf(stderr, "Couldn't create RDMA queue pair\n");
        delete conn;
        return NULL;
    }

    conn->addr.lid = ctx->my_lid;
    conn->addr.gid = ctx->my_gid;
    conn->addr.qpn = conn->qp->qp_num;
    conn->addr.psn = lrand48() & 0xffffff;

    ibv_qp_attr attr = {};
    attr.qp_state = IBV_QPS_INIT;
    attr.qp_access_flags = 0;
    attr.pkey_index = 0;
    attr.port_num = ctx->ib_port;

    const int init_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
    if (ibv_modify_qp(conn->qp, &attr, init_mask))
    {
        fprintf(stderr, "Failed to switch RDMA queue pair to INIT state\n");
        delete conn;
        return NULL;
    }

    return conn;
}
|
||||
|
||||
// Maps a byte MTU (256, 512, 1024, 2048 or 4096) to the matching ibv_mtu value.
// Any other value, including 0, yields IBV_MTU_4096.
static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
{
    if (mtu == 256)
        return IBV_MTU_256;
    if (mtu == 512)
        return IBV_MTU_512;
    if (mtu == 1024)
        return IBV_MTU_1024;
    if (mtu == 2048)
        return IBV_MTU_2048;
    // 4096 and every unrecognized value
    return IBV_MTU_4096;
}
|
||||
|
||||
// Connects the local queue pair to the remote queue pair described by *dest:
// first moves the QP to RTR (ready-to-receive), then to RTS (ready-to-send).
// Returns 0 on success and 1 if either transition fails (reason goes to stderr).
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
{
    ibv_qp_attr attr = {};
    attr.qp_state = IBV_QPS_RTR;
    attr.path_mtu = mtu_to_ibv_mtu(ctx->mtu);
    attr.rq_psn = dest->psn;
    attr.sq_psn = addr.psn;
    attr.dest_qp_num = dest->qpn;
    // Address handle: where the peer lives
    attr.ah_attr.grh.dgid = dest->gid;
    attr.ah_attr.grh.sgid_index = ctx->gid_index;
    attr.ah_attr.grh.hop_limit = 1; // FIXME can it vary?
    attr.ah_attr.dlid = dest->lid;
    attr.ah_attr.sl = 0; // service level
    attr.ah_attr.src_path_bits = 0;
    // The global routing header is used only when the peer has a non-zero interface id
    attr.ah_attr.is_global = (uint8_t)(dest->gid.global.interface_id ? 1 : 0);
    attr.ah_attr.port_num = ctx->ib_port;
    attr.max_rd_atomic = 1;
    attr.max_dest_rd_atomic = 1;
    // Timeout and min_rnr_timer actual values seem to be 4.096us*2^(timeout+1)
    attr.min_rnr_timer = 1;
    attr.timeout = 14;
    attr.retry_cnt = 7;
    attr.rnr_retry = 7;

    // FIXME No idea if ibv_modify_qp is a blocking operation or not. No idea if it has a timeout and what it is.
    const int rtr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
        IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
    if (ibv_modify_qp(qp, &attr, rtr_mask))
    {
        fprintf(stderr, "Failed to switch RDMA queue pair to RTR (ready-to-receive) state\n");
        return 1;
    }

    attr.qp_state = IBV_QPS_RTS;
    const int rts_mask = IBV_QP_STATE | IBV_QP_TIMEOUT |
        IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC;
    if (ibv_modify_qp(qp, &attr, rts_mask))
    {
        fprintf(stderr, "Failed to switch RDMA queue pair to RTS (ready-to-send) state\n");
        return 1;
    }
    return 0;
}
|
||||
|
||||
// Being the client, connect all server's RDMA queues to our local (client) queues
|
||||
// Client side of the RDMA handshake: connects our pre-created local queues to
// the queue addresses announced by the server.
//
// rdma_addresses - JSON array of address strings (msgr_rdma_address_t::to_string() format)
// server_max_sge - SGE limit announced by the server (0 or too large => use ours)
//
// If the server announced no queues, all local queues are destroyed and the
// connection is left as is. If any queue fails to connect, the client is
// stopped, on_connect_peer(osd, -1) is called and false is returned.
// Otherwise the peer is switched to PEER_RDMA and true is returned.
bool osd_messenger_t::connect_rdma_server(osd_client_t *cl, json11::Json rdma_addresses, uint64_t server_max_sge)
{
    if (rdma_addresses.array_items().size() == 0)
    {
        // The server offered no RDMA queues - drop ours
        for (auto unused_queue: cl->rdma_queues)
        {
            delete unused_queue;
        }
        cl->rdma_queues.resize(0);
        return true;
    }

    if (!server_max_sge || server_max_sge > rdma_max_sge)
    {
        server_max_sge = rdma_max_sge;
    }

    // Use as many queues as both sides have
    int n_conn = rdma_addresses.array_items().size();
    if (n_conn < cl->rdma_queues.size())
    {
        for (int extra = n_conn; extra < cl->rdma_queues.size(); extra++)
        {
            delete cl->rdma_queues[extra];
        }
        cl->rdma_queues.resize(n_conn);
    }
    else if (n_conn > cl->rdma_queues.size())
    {
        n_conn = cl->rdma_queues.size();
    }

    for (int qi = 0; qi < n_conn; qi++)
    {
        msgr_rdma_address_t addr;
        const bool connected =
            msgr_rdma_address_t::from_string(rdma_addresses[qi].string_value().c_str(), &addr) &&
            cl->rdma_queues[qi]->connect(&addr) == 0;
        if (!connected)
        {
            printf(
                "Failed to connect to OSD %lu (address %s) using RDMA\n",
                cl->osd_num, rdma_addresses[qi].string_value().c_str()
            );
            // FIXME: Keep TCP connection in this case
            osd_num_t peer_osd = cl->osd_num;
            stop_client(cl->peer_fd);
            on_connect_peer(peer_osd, -1);
            return false;
        }
        printf("Connected local queue %d to OSD %lu queue %d using RDMA\n", cl->rdma_queues[qi]->qp->qp_num, cl->osd_num, addr.qpn);
        // Never use more SGEs than the server can handle
        if (cl->rdma_queues[qi]->max_sge > server_max_sge)
        {
            cl->rdma_queues[qi]->max_sge = server_max_sge;
        }
    }

    cl->peer_state = PEER_RDMA;
    // The socket is not polled anymore once the peer is in RDMA mode
    tfd->set_fd_handler(cl->peer_fd, false, NULL);
    return true;
}
|
||||
|
||||
// Being the server, try to connect all client's RDMA queues to our local (server) queues
|
||||
// Server side of the RDMA handshake: for every queue address announced by the
// client, creates a local queue and connects it to the client's queue.
//
// rdma_addresses - JSON array of address strings (msgr_rdma_address_t::to_string() format)
// client_max_sge - SGE limit announced by the client (0 or too large => use ours)
//
// Addresses that cannot be parsed are skipped. If a queue cannot be created or
// connected, all queues of this client are destroyed and false is returned.
// Otherwise the peer is moved to PEER_RDMA_CONNECTING, receives are posted on
// every queue and true is returned. An empty address list returns true as well.
bool osd_messenger_t::connect_rdma_client(osd_client_t *cl, json11::Json rdma_addresses, uint64_t client_max_sge)
{
    if (rdma_addresses.array_items().size() == 0)
    {
        return true;
    }

    if (!client_max_sge || client_max_sge > rdma_max_sge)
    {
        client_max_sge = rdma_max_sge;
    }

    int n_conn = rdma_addresses.array_items().size();
    if (n_conn > rdma_queues_per_connection)
    {
        n_conn = rdma_queues_per_connection;
    }

    for (int i = 0; i < n_conn; i++)
    {
        msgr_rdma_address_t addr;
        if (!msgr_rdma_address_t::from_string(rdma_addresses[i].string_value().c_str(), &addr))
        {
            continue;
        }
        auto new_queue = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, client_max_sge);
        if (new_queue && new_queue->connect(&addr) == 0)
        {
            printf("Connected local queue %d to client %d queue %d using RDMA\n", new_queue->qp->qp_num, cl->peer_fd, addr.qpn);
            cl->rdma_queues.push_back(new_queue);
            continue;
        }
        if (new_queue)
        {
            delete new_queue;
        }
        printf(
            "Failed to connect RDMA queue pair to %s (client %d queue %d)\n",
            addr.to_string().c_str(), cl->peer_fd, i+1
        );
        // Delete all RDMA queues to keep the TCP connection
        for (int j = 0; j < cl->rdma_queues.size(); j++)
        {
            delete cl->rdma_queues[j];
        }
        cl->rdma_queues.resize(0);
        return false;
    }

    // Switch to RDMA state only after sending the configuration response
    cl->peer_state = PEER_RDMA_CONNECTING;
    for (int qi = 0; qi < cl->rdma_queues.size(); qi++)
    {
        try_recv_rdma(cl, cl->rdma_queues[qi]);
    }
    return true;
}
|
||||
|
||||
// Posts one signaled IBV_WR_SEND work request made of the first op_sge entries
// of sge[] to the queue pair of rc, and counts it in rc->cur_send.
// Prints a timestamped trace line with the total byte count before posting.
// Terminates the process with exit(1) if the request cannot be posted.
static void try_send_rdma_wr(msgr_rdma_connection_t *rc, uint64_t wr_id, ibv_sge *sge, int op_sge)
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    uint64_t byte_count = 0;
    int idx = 0;
    while (idx < op_sge)
    {
        byte_count += sge[idx].length;
        idx++;
    }
    printf("%lu.%09lu RDMA send to queue %d: %lu bytes\n", now.tv_sec, now.tv_nsec, rc->qp->qp_num, byte_count);

    ibv_send_wr request = {};
    request.wr_id = wr_id;
    request.sg_list = sge;
    request.num_sge = op_sge;
    request.opcode = IBV_WR_SEND;
    request.send_flags = IBV_SEND_SIGNALED;

    ibv_send_wr *rejected = NULL;
    int err = ibv_post_send(rc->qp, &request, &rejected);
    if (err || rejected)
    {
        printf("RDMA send failed: %s\n", strerror(err));
        exit(1);
    }
    rc->cur_send++;
}
|
||||
|
||||
// Posts one receive work request made of the first op_sge entries of sge[]
// to the queue pair of rc, and counts it in rc->cur_recv.
// Prints a timestamped trace line with the total buffer size before posting.
// Terminates the process with exit(1) if the request cannot be posted.
static void try_recv_rdma_wr(msgr_rdma_connection_t *rc, uint64_t wr_id, ibv_sge *sge, int op_sge)
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    uint64_t byte_count = 0;
    int idx = 0;
    while (idx < op_sge)
    {
        byte_count += sge[idx].length;
        idx++;
    }
    printf("%lu.%09lu RDMA receive from queue %d: %lu bytes\n", now.tv_sec, now.tv_nsec, rc->qp->qp_num, byte_count);

    ibv_recv_wr request = {};
    request.wr_id = wr_id;
    request.sg_list = sge;
    request.num_sge = op_sge;

    ibv_recv_wr *rejected = NULL;
    int err = ibv_post_recv(rc->qp, &request, &rejected);
    if (err || rejected)
    {
        printf("RDMA receive failed: %s\n", strerror(err));
        exit(1);
    }
    rc->cur_recv++;
}
|
||||
|
||||
// Posts the receive work requests for the reply of a read operation.
// The first scatter/gather entry is the reply header buffer (cur_op->reply.buf,
// OSD_PACKET_SIZE bytes); the following entries cover the buffers of cur_op->iov.
// The data is split into several work requests so that no request has more
// than rc->max_sge entries or more than max_sge*bs_bitmap_granularity bytes.
// Every work request uses cl->peer_fd as its wr_id.
// Always returns true; calls exit(1) if the receive queue limit is reached.
static bool try_recv_rdma_read(osd_client_t *cl, msgr_rdma_connection_t *rc, osd_op_t *cur_op, uint32_t bs_bitmap_granularity)
|
||||
{
|
||||
// The header entry is accounted as bs_bitmap_granularity bytes against op_max
int op_size = bs_bitmap_granularity, op_sge = 1, op_max = rc->max_sge*bs_bitmap_granularity;
|
||||
iovec *segments = cur_op->iov.get_iovec();
|
||||
// Variable-length array (compiler extension in C++)
ibv_sge sge[rc->max_sge];
|
||||
sge[0] = {
|
||||
.addr = (uintptr_t)cur_op->reply.buf,
|
||||
.length = (uint32_t)OSD_PACKET_SIZE,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
// NOTE(review): recv_pos is used as an index into segments, so get_size()
// is presumably the number of iovec entries - confirm
while (rc->recv_pos < cur_op->iov.get_size())
|
||||
{
|
||||
iovec & iov = segments[rc->recv_pos];
|
||||
// Current work request is full (by bytes or by entries): post it and start a new one
if (op_size >= op_max || op_sge >= rc->max_sge)
|
||||
{
|
||||
try_recv_rdma_wr(rc, cl->peer_fd, sge, op_sge);
|
||||
op_sge = 0;
|
||||
op_size = 0;
|
||||
if (rc->cur_recv >= rc->max_recv)
|
||||
{
|
||||
// FIXME
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
// Receive in (max_sge*4k) fragments
|
||||
// Take the rest of the current buffer, or only as much as still fits into this work request
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->recv_buf_pos < op_max
|
||||
? iov.iov_len-rc->recv_buf_pos : op_max-op_size);
|
||||
sge[op_sge++] = {
|
||||
.addr = (uintptr_t)(iov.iov_base+rc->recv_buf_pos),
|
||||
.length = len,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
op_size += len;
|
||||
rc->recv_buf_pos += len;
|
||||
// Buffer fully covered: move on to the next one
if (rc->recv_buf_pos >= iov.iov_len)
|
||||
{
|
||||
rc->recv_pos++;
|
||||
rc->recv_buf_pos = 0;
|
||||
}
|
||||
}
|
||||
// Post the last, possibly partially filled work request
if (op_sge > 0)
|
||||
{
|
||||
try_recv_rdma_wr(rc, cl->peer_fd, sge, op_sge);
|
||||
}
|
||||
// Reset the position for the next operation on this queue
rc->recv_pos = 0;
|
||||
rc->recv_buf_pos = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Posts send work requests for the first op_list_size entries of cl->send_list
// (with their flags taken from cl->outbox) to the queue pair of rc.
// Entries flagged MSGR_SENDP_HDR are sent as separate single-entry work
// requests; other entries are packed together and fragmented so that no
// request has more than rc->max_sge entries or more than
// max_sge*bs_bitmap_granularity bytes. Sending stops early when
// rc->cur_send reaches rc->max_send.
// When op_list_size is 1 and cur_op is a read (OSD_OP_READ, OSD_OP_SEC_READ or
// OSD_OP_SEC_READ_BMP), additional zero-length sends are posted after the header.
// Every work request uses cl->peer_fd as its wr_id. Always returns true.
// NOTE(review): rc->send_pos is restarted from 1 on every call while
// rc->send_buf_pos is not reset here - confirm that callers rely on this.
static bool try_send_rdma_read(osd_client_t *cl, msgr_rdma_connection_t *rc, osd_op_t *cur_op, int op_list_size, uint32_t bs_bitmap_granularity)
|
||||
{
|
||||
// Variable-length array (compiler extension in C++)
ibv_sge sge[rc->max_sge];
|
||||
// The first entry is accounted as bs_bitmap_granularity bytes against op_max
int op_size = bs_bitmap_granularity, op_sge = 1, op_max = rc->max_sge*bs_bitmap_granularity;
|
||||
// The first send_list entry always starts the first work request
sge[0] = {
|
||||
.addr = (uintptr_t)cl->send_list[0].iov_base,
|
||||
.length = (uint32_t)cl->send_list[0].iov_len,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
rc->send_pos = 1;
|
||||
while (rc->send_pos < op_list_size)
|
||||
{
|
||||
iovec & iov = cl->send_list[rc->send_pos];
|
||||
if (cl->outbox[rc->send_pos].flags & MSGR_SENDP_HDR)
|
||||
{
|
||||
// Flush whatever is pending so the header goes into its own work request
if (op_sge > 0)
|
||||
{
|
||||
try_send_rdma_wr(rc, cl->peer_fd, sge, op_sge);
|
||||
op_sge = 0;
|
||||
op_size = 0;
|
||||
if (rc->cur_send >= rc->max_send)
|
||||
break;
|
||||
}
|
||||
assert(rc->send_buf_pos == 0);
|
||||
sge[0] = {
|
||||
.addr = (uintptr_t)iov.iov_base,
|
||||
.length = (uint32_t)iov.iov_len,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
|
||||
rc->send_pos++;
|
||||
if (rc->cur_send >= rc->max_send)
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Current work request is full (by bytes or by entries): post it and start a new one
if (op_size >= op_max || op_sge >= rc->max_sge)
|
||||
{
|
||||
try_send_rdma_wr(rc, cl->peer_fd, sge, op_sge);
|
||||
op_sge = 0;
|
||||
op_size = 0;
|
||||
if (rc->cur_send >= rc->max_send)
|
||||
break;
|
||||
}
|
||||
// Fragment all messages into parts no longer than (max_sge*4k) = 120k on ConnectX-4
|
||||
// Otherwise the client may not be able to receive them in small parts
|
||||
// Take the rest of the current buffer, or only as much as still fits into this work request
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < op_max ? iov.iov_len-rc->send_buf_pos : op_max-op_size);
|
||||
sge[op_sge++] = {
|
||||
.addr = (uintptr_t)(iov.iov_base+rc->send_buf_pos),
|
||||
.length = len,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
op_size += len;
|
||||
rc->send_buf_pos += len;
|
||||
// Buffer fully consumed: move on to the next send_list entry
if (rc->send_buf_pos >= iov.iov_len)
|
||||
{
|
||||
rc->send_pos++;
|
||||
rc->send_buf_pos = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Post the last, possibly partially filled work request
if (op_sge > 0)
|
||||
{
|
||||
try_send_rdma_wr(rc, cl->peer_fd, sge, op_sge);
|
||||
}
|
||||
// Only a single entry was queued for this operation
if (op_list_size == 1)
|
||||
{
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_READ)
|
||||
{
|
||||
// Zero-length send, posted once per op_max bytes of the requested length
// NOTE(review): presumably matches the receives posted by try_recv_rdma_read() - confirm
sge[0] = {
|
||||
.addr = 0,
|
||||
.length = 0,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
uint64_t data_size = cur_op->req.hdr.opcode == OSD_OP_SEC_READ
|
||||
? cur_op->req.sec_rw.len
|
||||
: cur_op->req.rw.len;
|
||||
while (data_size >= op_max)
|
||||
{
|
||||
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
|
||||
data_size -= op_max;
|
||||
}
|
||||
if (data_size > 0)
|
||||
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||
{
|
||||
// A single zero-length send for the bitmap read
sge[0] = {
|
||||
.addr = 0,
|
||||
.length = 0,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
|
||||
}
|
||||
else
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Flush the client's outbox into its RDMA queues.
//
// Two different algorithms are used:
//  - Outgoing operations (OSD_OP_OUT) are posted in one work request to any
//    queue that has no current operation, and a receive for the reply is
//    posted right after. If every queue is busy the function returns and
//    leaves the remaining entries queued for a later call.
//  - Replies to incoming operations are sent to the queue the operation
//    is currently attached to.
//
// An operation occupies 1 or 2 consecutive outbox/send_list entries here;
// anything longer is treated as a fatal error (except for read replies which
// are passed to try_send_rdma_read() as is).
void osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
    // Builds one scatter/gather element over a buffer
    auto make_sge = [](void *base, size_t len, uint32_t lkey)
    {
        ibv_sge elem;
        elem.addr = (uintptr_t)base;
        elem.length = (uint32_t)len;
        elem.lkey = lkey;
        return elem;
    };
    while (cl->outbox.size() > 0)
    {
        osd_op_t *cur_op = cl->outbox[0].op;
        const bool outgoing = (cur_op->op_type == OSD_OP_OUT);
        // Outgoing operations need a free queue (cur_op == NULL),
        // replies need the queue that holds the operation itself
        osd_op_t *wanted = outgoing ? NULL : cur_op;
        int qi = 0;
        while (qi < cl->rdma_queues.size() && cl->rdma_queues[qi]->cur_op != wanted)
        {
            qi++;
        }
        if (qi >= cl->rdma_queues.size())
        {
            if (outgoing)
            {
                // No free queues, retry later.
                // We only post 1 operation per queue to use the queue pair number as a 'tag'.
                return;
            }
            printf("Unknown incoming operation for client %d\n", cl->peer_fd);
            exit(1);
        }
        // Pick all entries for the operation from the queue
        int op_list_size = 0;
        while (op_list_size < cl->outbox.size() && cl->outbox[op_list_size].op == cur_op)
        {
            op_list_size++;
        }
        auto rq = cl->rdma_queues[qi];
        const uint32_t lkey = rq->ctx->mr->lkey;
        // At most 2 elements are ever used below, so a fixed-size array is enough
        // (and, unlike an array sized by max_sge, can't be overrun when max_sge < 2)
        ibv_sge sge[2];
        if (outgoing)
        {
            rq->cur_op = cur_op;
            if (op_list_size > 2)
            {
                printf("unexpected long send_list for opcode %lu: %d entries\n", cur_op->req.hdr.opcode, op_list_size);
                exit(1);
            }
            // FIXME: This won't work with long bitmaps
            // header or header+data
            for (int i = 0; i < op_list_size; i++)
            {
                sge[i] = make_sge(cl->send_list[i].iov_base, cl->send_list[i].iov_len, lkey);
            }
            try_send_rdma_wr(rq, cl->peer_fd, sge, op_list_size);
            cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+op_list_size);
            cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+op_list_size);
            // Post a receive request for the reply at the same time
            if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
                cur_op->req.hdr.opcode == OSD_OP_READ)
            {
                try_recv_rdma_read(cl, rq, cur_op, bs_bitmap_granularity);
            }
            else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
            {
                assert(!cur_op->iov.count);
                // FIXME: hardcode
#define clean_entry_bitmap_size 4
                // Reply size is known
                uint64_t data_size = cur_op->req.sec_read_bmp.len / sizeof(obj_ver_id) * (8 + clean_entry_bitmap_size);
                cur_op->rmw_buf = malloc_or_die(data_size);
                sge[0] = make_sge(cur_op->reply.buf, OSD_PACKET_SIZE, lkey);
                sge[1] = make_sge(cur_op->rmw_buf, data_size, lkey);
                try_recv_rdma_wr(rq, cl->peer_fd, sge, 2);
            }
            else
            {
                // No reply or reply size is unknown
                sge[0] = make_sge(cur_op->reply.buf, OSD_PACKET_SIZE, lkey);
                try_recv_rdma_wr(rq, cl->peer_fd, sge, 1);
            }
        }
        else
        {
            // Send reply to the same queue the operation came from.
            // Fragment it into parts no longer than (max_sge*4k) to always
            // be able to send and receive them correctly.
            if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
                cur_op->req.hdr.opcode == OSD_OP_READ)
            {
                try_send_rdma_read(cl, rq, cur_op, op_list_size, bs_bitmap_granularity);
                rq->send_pos = 0;
                rq->send_buf_pos = 0;
            }
            else if (op_list_size > 2)
            {
                printf("Unexpected long send_list for opcode %lu: %d entries\n", cur_op->req.hdr.opcode, op_list_size);
                exit(1);
            }
            else
            {
                for (int i = 0; i < op_list_size; i++)
                {
                    sge[i] = make_sge(cl->send_list[i].iov_base, cl->send_list[i].iov_len, lkey);
                }
                if (op_list_size == 1 || cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
                {
                    // Header only, or header+data in a single work request
                    try_send_rdma_wr(rq, cl->peer_fd, sge, op_list_size);
                }
                else
                {
                    // Header and data go as two separate work requests
                    try_send_rdma_wr(rq, cl->peer_fd, sge, 1);
                    try_send_rdma_wr(rq, cl->peer_fd, sge+1, 1);
                }
            }
            cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+op_list_size);
            cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+op_list_size);
        }
    }
}
|
||||
|
||||
// Try to receive an incoming operation via RDMA
|
||||
void osd_messenger_t::try_recv_rdma(osd_client_t *cl, msgr_rdma_connection_t *rc)
|
||||
{
|
||||
rc->cur_op = new osd_op_t;
|
||||
rc->cur_op->peer_fd = cl->peer_fd;
|
||||
rc->cur_op->op_type = OSD_OP_IN;
|
||||
rc->cur_op->buf = memalign_or_die(MEM_ALIGNMENT, 128*1024); // FIXME hardcode for tests
|
||||
ibv_sge sge[2];
|
||||
sge[0] = {
|
||||
.addr = (uintptr_t)rc->cur_op->req.buf,
|
||||
.length = (uint32_t)OSD_PACKET_SIZE,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
sge[1] = {
|
||||
.addr = (uintptr_t)rc->cur_op->buf,
|
||||
.length = (uint32_t)128*1024,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
try_recv_rdma_wr(rc, cl->peer_fd, sge, 2);
|
||||
}
|
||||
|
||||
#define RDMA_EVENTS_AT_ONCE 32
|
||||
|
||||
void osd_messenger_t::handle_rdma_events()
|
||||
{
|
||||
// Request next notification
|
||||
ibv_cq *ev_cq;
|
||||
void *ev_ctx;
|
||||
// FIXME: This is inefficient as it calls read()...
|
||||
timespec tv;
|
||||
if (ibv_get_cq_event(rdma_context->channel, &ev_cq, &ev_ctx) == 0)
|
||||
{
|
||||
ibv_ack_cq_events(rdma_context->cq, 1);
|
||||
}
|
||||
if (ibv_req_notify_cq(rdma_context->cq, 0) != 0)
|
||||
{
|
||||
printf("Failed to request RDMA completion notification, exiting\n");
|
||||
exit(1);
|
||||
}
|
||||
ibv_wc wc[RDMA_EVENTS_AT_ONCE];
|
||||
int event_count;
|
||||
do
|
||||
{
|
||||
event_count = ibv_poll_cq(rdma_context->cq, RDMA_EVENTS_AT_ONCE, wc);
|
||||
for (int i = 0; i < event_count; i++)
|
||||
{
|
||||
int client_id = wc[i].wr_id;
|
||||
bool is_send = wc[i].opcode == IBV_WC_SEND;
|
||||
auto cl_it = clients.find(client_id);
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
osd_client_t *cl = cl_it->second;
|
||||
if (wc[i].status != IBV_WC_SUCCESS)
|
||||
{
|
||||
printf("RDMA work request failed for client %d", client_id);
|
||||
if (cl->osd_num)
|
||||
{
|
||||
printf(" (OSD %lu)", cl->osd_num);
|
||||
}
|
||||
printf(" with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
|
||||
stop_client(client_id);
|
||||
continue;
|
||||
}
|
||||
int q;
|
||||
for (q = 0; q < cl->rdma_queues.size() && cl->rdma_queues[q]->qp->qp_num != wc[i].qp_num; q++) {}
|
||||
if (q >= cl->rdma_queues.size())
|
||||
{
|
||||
printf("Unknown queue %d for client %d\n", wc[i].qp_num, cl->peer_fd);
|
||||
exit(1);
|
||||
}
|
||||
auto rc = cl->rdma_queues[q];
|
||||
if (is_send)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &tv);
|
||||
printf("%lu.%09lu Done RDMA send on queue %d\n", tv.tv_sec, tv.tv_nsec, wc[i].qp_num);
|
||||
}
|
||||
else
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &tv);
|
||||
printf("%lu.%09lu Done RDMA recv on queue %d, %d bytes\n", tv.tv_sec, tv.tv_nsec, wc[i].qp_num, wc[i].byte_len);
|
||||
}
|
||||
if (!is_send)
|
||||
{
|
||||
rc->cur_recv--;
|
||||
if (!rc->cur_recv)
|
||||
{
|
||||
// Fucking shit...
|
||||
if (rc->cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
if (wc[i].byte_len <= OSD_PACKET_SIZE)
|
||||
{
|
||||
free(rc->cur_op->buf);
|
||||
rc->cur_op->buf = NULL;
|
||||
}
|
||||
cl->received_ops.push_back(rc->cur_op);
|
||||
set_immediate.push_back([this, op = rc->cur_op]() { exec_op(op); });
|
||||
}
|
||||
else /* if (rc->cur_op->op_type == OSD_OP_OUT) */
|
||||
{
|
||||
if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_READ ||
|
||||
rc->cur_op->reply.hdr.opcode == OSD_OP_READ)
|
||||
{
|
||||
// Data is already received
|
||||
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
|
||||
handle_reply_ready(rc->cur_op);
|
||||
rc->cur_op = NULL;
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
else if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||
{
|
||||
// Data is already received, but we need to switch buffers
|
||||
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
|
||||
free(rc->cur_op->buf);
|
||||
rc->cur_op->buf = rc->cur_op->rmw_buf;
|
||||
handle_reply_ready(rc->cur_op);
|
||||
rc->cur_op = NULL;
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
else if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_LIST && rc->cur_op->reply.hdr.retval > 0 ||
|
||||
rc->cur_op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && rc->cur_op->reply.hdr.retval > 0)
|
||||
{
|
||||
if (rc->recv_pos != 1)
|
||||
{
|
||||
// Data is not received yet (RNR)
|
||||
uint32_t len;
|
||||
if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_LIST)
|
||||
len = sizeof(obj_ver_id) * rc->cur_op->reply.hdr.retval;
|
||||
else
|
||||
len = rc->cur_op->reply.hdr.retval;
|
||||
rc->cur_op->buf = malloc_or_die(len);
|
||||
ibv_sge sge[1];
|
||||
sge[0] = {
|
||||
.addr = (uintptr_t)rc->cur_op->buf,
|
||||
.length = len,
|
||||
.lkey = rc->ctx->mr->lkey,
|
||||
};
|
||||
try_recv_rdma_wr(rc, cl->peer_fd, sge, 1);
|
||||
rc->recv_pos = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Done
|
||||
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
|
||||
handle_reply_ready(rc->cur_op);
|
||||
rc->cur_op = NULL;
|
||||
rc->recv_pos = 0;
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// No data
|
||||
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
|
||||
handle_reply_ready(rc->cur_op);
|
||||
rc->cur_op = NULL;
|
||||
try_send_rdma(cl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
rc->cur_send--;
|
||||
if (!rc->cur_send)
|
||||
{
|
||||
if (rc->cur_op->op_type == OSD_OP_OUT)
|
||||
{
|
||||
// Nothing
|
||||
}
|
||||
else /* if (rc->cur_op->op_type == OSD_OP_IN) */
|
||||
{
|
||||
// Reply fully sent
|
||||
delete rc->cur_op;
|
||||
rc->cur_op = NULL;
|
||||
// Post receive for the next incoming op
|
||||
try_recv_rdma(cl, rc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (event_count > 0);
|
||||
for (auto cb: set_immediate)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
set_immediate.clear();
|
||||
}
|
||||
|
||||
int osd_messenger_t::get_rdma_max_sge()
|
||||
{
|
||||
return rdma_max_sge;
|
||||
}
|
56
src/msgr_rdma.h
Normal file
56
src/msgr_rdma.h
Normal file
@@ -0,0 +1,56 @@
|
||||
#pragma once
|
||||
#include <infiniband/verbs.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct osd_op_t;
|
||||
|
||||
struct msgr_rdma_address_t
|
||||
{
|
||||
ibv_gid gid;
|
||||
uint16_t lid;
|
||||
uint32_t qpn;
|
||||
uint32_t psn;
|
||||
|
||||
std::string to_string();
|
||||
static bool from_string(const char *str, msgr_rdma_address_t *dest);
|
||||
};
|
||||
|
||||
struct msgr_rdma_context_t
|
||||
{
|
||||
ibv_context *context = NULL;
|
||||
ibv_device *dev = NULL;
|
||||
ibv_device_attr_ex attrx;
|
||||
ibv_pd *pd = NULL;
|
||||
ibv_mr *mr = NULL;
|
||||
ibv_comp_channel *channel = NULL;
|
||||
ibv_cq *cq = NULL;
|
||||
ibv_port_attr portinfo;
|
||||
uint8_t ib_port;
|
||||
uint8_t gid_index;
|
||||
uint16_t my_lid;
|
||||
ibv_gid my_gid;
|
||||
uint32_t mtu;
|
||||
int max_cqe = 0;
|
||||
int used_max_cqe = 0;
|
||||
|
||||
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu);
|
||||
~msgr_rdma_context_t();
|
||||
};
|
||||
|
||||
struct msgr_rdma_connection_t
|
||||
{
|
||||
msgr_rdma_context_t *ctx = NULL;
|
||||
ibv_qp *qp = NULL;
|
||||
msgr_rdma_address_t addr;
|
||||
int max_send = 0, max_recv = 0, max_sge = 0;
|
||||
int cur_send = 0, cur_recv = 0;
|
||||
|
||||
osd_op_t *cur_op = NULL;
|
||||
int send_pos = 0, send_buf_pos = 0;
|
||||
int recv_pos = 0, recv_buf_pos = 0;
|
||||
|
||||
~msgr_rdma_connection_t();
|
||||
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge);
|
||||
int connect(msgr_rdma_address_t *dest);
|
||||
};
|
@@ -207,20 +207,26 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||
{
|
||||
if (cur_op->req.sec_rw.attr_len > 0)
|
||||
if (cur_op->req.sec_rw.bitmap_len > 0)
|
||||
{
|
||||
if (cur_op->req.sec_rw.attr_len > sizeof(unsigned))
|
||||
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len);
|
||||
if (cur_op->req.sec_rw.bitmap_len > sizeof(void*))
|
||||
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.bitmap_len);
|
||||
else
|
||||
cur_op->bitmap = &cur_op->bmp_data;
|
||||
cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
|
||||
if (cur_op->req.sec_rw.bitmap_len <= 8)
|
||||
memcpy(cur_op->bitmap, &cur_op->req.sec_rw.bitmap, cur_op->req.sec_rw.bitmap_len);
|
||||
else
|
||||
{
|
||||
cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.bitmap_len);
|
||||
cl->read_remaining += cur_op->req.sec_rw.bitmap_len;
|
||||
}
|
||||
}
|
||||
if (cur_op->req.sec_rw.len > 0)
|
||||
{
|
||||
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
|
||||
cl->recv_list.push_back(cur_op->buf, cur_op->req.sec_rw.len);
|
||||
cl->read_remaining += cur_op->req.sec_rw.len;
|
||||
}
|
||||
cl->read_remaining = cur_op->req.sec_rw.len + cur_op->req.sec_rw.attr_len;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
|
||||
@@ -254,6 +260,16 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
}
|
||||
cl->read_remaining = cur_op->req.rw.len;
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
|
||||
{
|
||||
if (cur_op->req.show_conf.json_len > 0)
|
||||
{
|
||||
cur_op->buf = malloc_or_die(cur_op->req.show_conf.json_len+1);
|
||||
((uint8_t*)cur_op->buf)[cur_op->req.show_conf.json_len] = 0;
|
||||
cl->recv_list.push_back(cur_op->buf, cur_op->req.show_conf.json_len);
|
||||
}
|
||||
cl->read_remaining = cur_op->req.show_conf.json_len;
|
||||
}
|
||||
if (cl->read_remaining > 0)
|
||||
{
|
||||
// Read data
|
||||
@@ -285,7 +301,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
|
||||
{
|
||||
// Read data. In this case we assume that the buffer is preallocated by the caller (!)
|
||||
unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len);
|
||||
unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.bitmap_len : op->reply.rw.bitmap_len);
|
||||
unsigned expected_size = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len);
|
||||
if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
|
||||
{
|
||||
@@ -299,14 +315,24 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
if (op->reply.hdr.retval >= 0 && bmp_len > 0)
|
||||
{
|
||||
assert(op->bitmap);
|
||||
cl->recv_list.push_back(op->bitmap, bmp_len);
|
||||
if (bmp_len <= 8)
|
||||
{
|
||||
memcpy(op->bitmap, (op->reply.hdr.opcode == OSD_OP_SEC_READ
|
||||
? &op->reply.sec_rw.bitmap
|
||||
: &op->reply.rw.bitmap), bmp_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->recv_list.push_back(op->bitmap, bmp_len);
|
||||
cl->read_remaining += bmp_len;
|
||||
}
|
||||
}
|
||||
if (op->reply.hdr.retval > 0)
|
||||
{
|
||||
assert(op->iov.count > 0);
|
||||
cl->recv_list.append(op->iov);
|
||||
cl->read_remaining += op->reply.hdr.retval;
|
||||
}
|
||||
cl->read_remaining = op->reply.hdr.retval + bmp_len;
|
||||
if (cl->read_remaining == 0)
|
||||
{
|
||||
goto reuse;
|
||||
@@ -332,16 +358,17 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
cl->read_op = op;
|
||||
cl->read_state = CL_READ_REPLY_DATA;
|
||||
cl->read_remaining = op->reply.hdr.retval;
|
||||
free(op->buf);
|
||||
op->buf = memalign_or_die(MEM_ALIGNMENT, cl->read_remaining);
|
||||
cl->recv_list.push_back(op->buf, cl->read_remaining);
|
||||
}
|
||||
else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
|
||||
{
|
||||
assert(!op->iov.count);
|
||||
delete cl->read_op;
|
||||
cl->read_op = op;
|
||||
cl->read_state = CL_READ_REPLY_DATA;
|
||||
cl->read_remaining = op->reply.hdr.retval;
|
||||
free(op->buf);
|
||||
op->buf = malloc_or_die(op->reply.hdr.retval);
|
||||
cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
|
||||
}
|
||||
|
@@ -46,27 +46,41 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
|
||||
cl->sent_ops[cur_op->req.hdr.id] = cur_op;
|
||||
}
|
||||
to_outbox.push_back(NULL);
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
|
||||
// Bitmap
|
||||
if (cur_op->op_type == OSD_OP_IN &&
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_READ &&
|
||||
cur_op->reply.sec_rw.attr_len > 0)
|
||||
cur_op->reply.sec_rw.bitmap_len > 0)
|
||||
{
|
||||
to_send_list.push_back((iovec){
|
||||
.iov_base = cur_op->bitmap,
|
||||
.iov_len = cur_op->reply.sec_rw.attr_len,
|
||||
});
|
||||
to_outbox.push_back(NULL);
|
||||
if (cur_op->reply.sec_rw.bitmap_len <= 8)
|
||||
{
|
||||
memcpy(&cur_op->reply.sec_rw.bitmap, cur_op->bitmap, cur_op->reply.sec_rw.bitmap_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
to_send_list.push_back((iovec){
|
||||
.iov_base = cur_op->bitmap,
|
||||
.iov_len = cur_op->reply.sec_rw.bitmap_len,
|
||||
});
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_BMP });
|
||||
}
|
||||
}
|
||||
else if (cur_op->op_type == OSD_OP_OUT &&
|
||||
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
|
||||
cur_op->req.sec_rw.attr_len > 0)
|
||||
cur_op->req.sec_rw.bitmap_len > 0)
|
||||
{
|
||||
to_send_list.push_back((iovec){
|
||||
.iov_base = cur_op->bitmap,
|
||||
.iov_len = cur_op->req.sec_rw.attr_len,
|
||||
});
|
||||
to_outbox.push_back(NULL);
|
||||
if (cur_op->req.sec_rw.bitmap_len <= 8)
|
||||
{
|
||||
memcpy(&cur_op->req.sec_rw.bitmap, cur_op->bitmap, cur_op->req.sec_rw.bitmap_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
to_send_list.push_back((iovec){
|
||||
.iov_base = cur_op->bitmap,
|
||||
.iov_len = cur_op->req.sec_rw.attr_len,
|
||||
});
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_BMP });
|
||||
}
|
||||
}
|
||||
// Operation data
|
||||
if ((cur_op->op_type == OSD_OP_IN
|
||||
@@ -78,13 +92,14 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
|
||||
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)) && cur_op->iov.count > 0)
|
||||
{
|
||||
for (int i = 0; i < cur_op->iov.count; i++)
|
||||
{
|
||||
assert(cur_op->iov.buf[i].iov_base);
|
||||
to_send_list.push_back(cur_op->iov.buf[i]);
|
||||
to_outbox.push_back(NULL);
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
}
|
||||
}
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||
@@ -93,13 +108,19 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
|
||||
else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
|
||||
to_outbox.push_back(NULL);
|
||||
to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
|
||||
}
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
// To free it later
|
||||
to_outbox[to_outbox.size()-1] = cur_op;
|
||||
to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_FREE;
|
||||
}
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->peer_state == PEER_RDMA)
|
||||
{
|
||||
try_send_rdma(cl);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (!ringloop)
|
||||
{
|
||||
// FIXME: It's worse because it doesn't allow batching
|
||||
@@ -232,10 +253,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
iovec & iov = cl->send_list[done];
|
||||
if (iov.iov_len <= result)
|
||||
{
|
||||
if (cl->outbox[done])
|
||||
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
|
||||
{
|
||||
// Reply fully sent
|
||||
delete cl->outbox[done];
|
||||
delete cl->outbox[done].op;
|
||||
}
|
||||
result -= iov.iov_len;
|
||||
done++;
|
||||
@@ -260,6 +281,16 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
cl->next_outbox.clear();
|
||||
}
|
||||
cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->peer_state == PEER_RDMA_CONNECTING && cl->rdma_queues.size() > 0 && !cl->outbox.size())
|
||||
{
|
||||
// FIXME: Do something better than just forgetting the FD
|
||||
// FIXME: Ignore pings during RDMA state transition
|
||||
printf("Successfully connected with client %d using RDMA\n", cl->peer_fd);
|
||||
cl->peer_state = PEER_RDMA;
|
||||
tfd->set_fd_handler(cl->peer_fd, false, NULL);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if (cl->write_state != 0)
|
||||
{
|
||||
|
@@ -122,6 +122,13 @@ void osd_messenger_t::stop_client(int peer_fd, bool force)
|
||||
// And close the FD only when everything is done
|
||||
// ...because peer_fd number can get reused after close()
|
||||
close(peer_fd);
|
||||
#ifdef WITH_RDMA
|
||||
for (auto rdma_conn: cl->rdma_queues)
|
||||
{
|
||||
delete rdma_conn;
|
||||
}
|
||||
cl->rdma_queues.resize(0);
|
||||
#endif
|
||||
#endif
|
||||
// Find the item again because it can be invalidated at this point
|
||||
it = clients.find(peer_fd);
|
||||
|
@@ -26,7 +26,10 @@ const char *exe_name = NULL;
|
||||
class nbd_proxy
|
||||
{
|
||||
protected:
|
||||
std::string image_name;
|
||||
uint64_t inode = 0;
|
||||
uint64_t device_size = 0;
|
||||
inode_watch_t *watch = NULL;
|
||||
|
||||
ring_loop_t *ringloop = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
@@ -111,9 +114,9 @@ public:
|
||||
{
|
||||
printf(
|
||||
"Vitastor NBD proxy\n"
|
||||
"(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
|
||||
"(c) Vitaliy Filippov, 2020-2021 (VNPL-1.1)\n\n"
|
||||
"USAGE:\n"
|
||||
" %s map --etcd_address <etcd_address> --pool <pool> --inode <inode> --size <size in bytes>\n"
|
||||
" %s map --etcd_address <etcd_address> (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
|
||||
" %s unmap /dev/nbd0\n"
|
||||
" %s list [--json]\n",
|
||||
exe_name, exe_name, exe_name
|
||||
@@ -148,21 +151,49 @@ public:
|
||||
fprintf(stderr, "etcd_address is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
if (!cfg["size"].uint64_value())
|
||||
if (cfg["image"].string_value() != "")
|
||||
{
|
||||
fprintf(stderr, "device size is missing\n");
|
||||
exit(1);
|
||||
// Use image name
|
||||
image_name = cfg["image"].string_value();
|
||||
inode = 0;
|
||||
}
|
||||
inode = cfg["inode"].uint64_value();
|
||||
uint64_t pool = cfg["pool"].uint64_value();
|
||||
if (pool)
|
||||
else
|
||||
{
|
||||
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
|
||||
// Use pool, inode number and size
|
||||
if (!cfg["size"].uint64_value())
|
||||
{
|
||||
fprintf(stderr, "device size is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
device_size = cfg["size"].uint64_value();
|
||||
inode = cfg["inode"].uint64_value();
|
||||
uint64_t pool = cfg["pool"].uint64_value();
|
||||
if (pool)
|
||||
{
|
||||
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
|
||||
}
|
||||
if (!(inode >> (64-POOL_ID_BITS)))
|
||||
{
|
||||
fprintf(stderr, "pool is missing\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (!(inode >> (64-POOL_ID_BITS)))
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(512);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
||||
if (!inode)
|
||||
{
|
||||
fprintf(stderr, "pool is missing\n");
|
||||
exit(1);
|
||||
// Load image metadata
|
||||
while (!cli->is_ready())
|
||||
{
|
||||
ringloop->loop();
|
||||
if (cli->is_ready())
|
||||
break;
|
||||
ringloop->wait();
|
||||
}
|
||||
watch = cli->st_cli.watch_inode(image_name);
|
||||
device_size = watch->cfg.size;
|
||||
}
|
||||
// Initialize NBD
|
||||
int sockfd[2];
|
||||
@@ -176,7 +207,7 @@ public:
|
||||
load_module();
|
||||
if (!cfg["dev_num"].is_null())
|
||||
{
|
||||
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), cfg["size"].uint64_value(), NBD_FLAG_SEND_FLUSH, 30) < 0)
|
||||
if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, 30) < 0)
|
||||
{
|
||||
perror("run_nbd");
|
||||
exit(1);
|
||||
@@ -188,7 +219,7 @@ public:
|
||||
int i = 0;
|
||||
while (true)
|
||||
{
|
||||
int r = run_nbd(sockfd, i, cfg["size"].uint64_value(), NBD_FLAG_SEND_FLUSH, 30);
|
||||
int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, 30);
|
||||
if (r == 0)
|
||||
{
|
||||
printf("/dev/nbd%d\n", i);
|
||||
@@ -215,10 +246,6 @@ public:
|
||||
{
|
||||
daemonize();
|
||||
}
|
||||
// Create client
|
||||
ringloop = new ring_loop_t(512);
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
||||
// Initialize read state
|
||||
read_state = CL_READ_HDR;
|
||||
recv_buf = malloc_or_die(receive_buffer_size);
|
||||
@@ -242,6 +269,7 @@ public:
|
||||
ringloop->loop();
|
||||
ringloop->wait();
|
||||
}
|
||||
// FIXME: Cleanup when exiting
|
||||
}
|
||||
|
||||
void load_module()
|
||||
@@ -610,7 +638,7 @@ protected:
|
||||
if (req_type == NBD_CMD_READ || req_type == NBD_CMD_WRITE)
|
||||
{
|
||||
op->opcode = req_type == NBD_CMD_READ ? OSD_OP_READ : OSD_OP_WRITE;
|
||||
op->inode = inode;
|
||||
op->inode = inode ? inode : watch->cfg.num;
|
||||
op->offset = be64toh(cur_req.from);
|
||||
op->len = be32toh(cur_req.len);
|
||||
buf = malloc_or_die(sizeof(nbd_reply) + op->len);
|
||||
@@ -657,7 +685,15 @@ protected:
|
||||
}
|
||||
else
|
||||
{
|
||||
cli->execute(cur_op);
|
||||
if (cur_op->opcode == OSD_OP_WRITE && watch->cfg.readonly)
|
||||
{
|
||||
cur_op->retval = -EROFS;
|
||||
std::function<void(cluster_op_t*)>(cur_op->callback)(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
cli->execute(cur_op);
|
||||
}
|
||||
cur_op = NULL;
|
||||
cur_buf = &cur_req;
|
||||
cur_left = sizeof(nbd_request);
|
||||
|
47
src/osd.cpp
47
src/osd.cpp
@@ -45,11 +45,11 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
|
||||
print_slow();
|
||||
});
|
||||
|
||||
c_cli.tfd = this->tfd;
|
||||
c_cli.ringloop = this->ringloop;
|
||||
c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); };
|
||||
c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
|
||||
c_cli.init();
|
||||
msgr.tfd = this->tfd;
|
||||
msgr.ringloop = this->ringloop;
|
||||
msgr.exec_op = [this](osd_op_t *op) { exec_op(op); };
|
||||
msgr.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
|
||||
msgr.init();
|
||||
|
||||
init_cluster();
|
||||
|
||||
@@ -80,10 +80,11 @@ void osd_t::parse_config(blockstore_config_t & config)
|
||||
osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
|
||||
if (!osd_num)
|
||||
throw std::runtime_error("osd_num is required in the configuration");
|
||||
c_cli.osd_num = osd_num;
|
||||
msgr.osd_num = osd_num;
|
||||
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
|
||||
no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
|
||||
no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
|
||||
allow_test_ops = config["allow_test_ops"] == "true" || config["allow_test_ops"] == "1" || config["allow_test_ops"] == "yes";
|
||||
// Cluster configuration
|
||||
bind_address = config["bind_address"];
|
||||
if (bind_address == "")
|
||||
@@ -121,7 +122,7 @@ void osd_t::parse_config(blockstore_config_t & config)
|
||||
slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10);
|
||||
if (!slow_log_interval)
|
||||
slow_log_interval = 10;
|
||||
c_cli.parse_config(json_config);
|
||||
msgr.parse_config(json_config);
|
||||
}
|
||||
|
||||
void osd_t::bind_socket()
|
||||
@@ -174,7 +175,7 @@ void osd_t::bind_socket()
|
||||
|
||||
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
|
||||
{
|
||||
c_cli.accept_connections(listen_fd);
|
||||
msgr.accept_connections(listen_fd);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -191,8 +192,8 @@ bool osd_t::shutdown()
|
||||
void osd_t::loop()
|
||||
{
|
||||
handle_peers();
|
||||
c_cli.read_requests();
|
||||
c_cli.send_replies();
|
||||
msgr.read_requests();
|
||||
msgr.send_replies();
|
||||
ringloop->submit();
|
||||
}
|
||||
|
||||
@@ -276,7 +277,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
|
||||
void osd_t::reset_stats()
|
||||
{
|
||||
c_cli.stats = { 0 };
|
||||
msgr.stats = { 0 };
|
||||
prev_stats = { 0 };
|
||||
memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
|
||||
memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
|
||||
@@ -286,11 +287,11 @@ void osd_t::print_stats()
|
||||
{
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
|
||||
if (msgr.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
|
||||
{
|
||||
uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
|
||||
uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
|
||||
if (c_cli.stats.op_stat_bytes[i] != 0)
|
||||
uint64_t avg = (msgr.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(msgr.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
|
||||
uint64_t bw = (msgr.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
|
||||
if (msgr.stats.op_stat_bytes[i] != 0)
|
||||
{
|
||||
printf(
|
||||
"[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
|
||||
@@ -302,19 +303,19 @@ void osd_t::print_stats()
|
||||
{
|
||||
printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
|
||||
}
|
||||
prev_stats.op_stat_count[i] = c_cli.stats.op_stat_count[i];
|
||||
prev_stats.op_stat_sum[i] = c_cli.stats.op_stat_sum[i];
|
||||
prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i];
|
||||
prev_stats.op_stat_count[i] = msgr.stats.op_stat_count[i];
|
||||
prev_stats.op_stat_sum[i] = msgr.stats.op_stat_sum[i];
|
||||
prev_stats.op_stat_bytes[i] = msgr.stats.op_stat_bytes[i];
|
||||
}
|
||||
}
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
|
||||
if (msgr.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
|
||||
{
|
||||
uint64_t avg = (c_cli.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(c_cli.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
|
||||
uint64_t avg = (msgr.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(msgr.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
|
||||
printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
|
||||
prev_stats.subop_stat_count[i] = c_cli.stats.subop_stat_count[i];
|
||||
prev_stats.subop_stat_sum[i] = c_cli.stats.subop_stat_sum[i];
|
||||
prev_stats.subop_stat_count[i] = msgr.stats.subop_stat_count[i];
|
||||
prev_stats.subop_stat_sum[i] = msgr.stats.subop_stat_sum[i];
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 2; i++)
|
||||
@@ -351,7 +352,7 @@ void osd_t::print_slow()
|
||||
char alloc[1024];
|
||||
timespec now;
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
for (auto & kv: c_cli.clients)
|
||||
for (auto & kv: msgr.clients)
|
||||
{
|
||||
for (auto op: kv.second->received_ops)
|
||||
{
|
||||
|
@@ -104,7 +104,7 @@ class osd_t
|
||||
int bind_port, listen_backlog;
|
||||
// FIXME: Implement client queue depth limit
|
||||
int client_queue_depth = 128;
|
||||
bool allow_test_ops = true;
|
||||
bool allow_test_ops = false;
|
||||
int print_stats_interval = 3;
|
||||
int slow_log_interval = 10;
|
||||
int immediate_commit = IMMEDIATE_NONE;
|
||||
@@ -116,7 +116,7 @@ class osd_t
|
||||
// cluster state
|
||||
|
||||
etcd_state_client_t st_cli;
|
||||
osd_messenger_t c_cli;
|
||||
osd_messenger_t msgr;
|
||||
int etcd_failed_attempts = 0;
|
||||
std::string etcd_lease_id;
|
||||
json11::Json self_state;
|
||||
|
@@ -104,7 +104,7 @@ void osd_t::parse_test_peer(std::string peer)
|
||||
{ "addresses", json11::Json::array { addr } },
|
||||
{ "port", port },
|
||||
};
|
||||
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||
}
|
||||
|
||||
json11::Json osd_t::get_osd_state()
|
||||
@@ -146,16 +146,16 @@ json11::Json osd_t::get_statistics()
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
op_stats[osd_op_names[i]] = json11::Json::object {
|
||||
{ "count", c_cli.stats.op_stat_count[i] },
|
||||
{ "usec", c_cli.stats.op_stat_sum[i] },
|
||||
{ "bytes", c_cli.stats.op_stat_bytes[i] },
|
||||
{ "count", msgr.stats.op_stat_count[i] },
|
||||
{ "usec", msgr.stats.op_stat_sum[i] },
|
||||
{ "bytes", msgr.stats.op_stat_bytes[i] },
|
||||
};
|
||||
}
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
subop_stats[osd_op_names[i]] = json11::Json::object {
|
||||
{ "count", c_cli.stats.subop_stat_count[i] },
|
||||
{ "usec", c_cli.stats.subop_stat_sum[i] },
|
||||
{ "count", msgr.stats.subop_stat_count[i] },
|
||||
{ "usec", msgr.stats.subop_stat_sum[i] },
|
||||
};
|
||||
}
|
||||
st["op_stats"] = op_stats;
|
||||
@@ -298,9 +298,9 @@ void osd_t::report_statistics()
|
||||
|
||||
void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
||||
{
|
||||
if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end())
|
||||
if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
|
||||
{
|
||||
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||
msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -695,9 +695,9 @@ void osd_t::apply_pg_config()
|
||||
// Add peers
|
||||
for (auto pg_osd: all_peers)
|
||||
{
|
||||
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end())
|
||||
if (pg_osd != this->osd_num && msgr.osd_peer_fds.find(pg_osd) == msgr.osd_peer_fds.end())
|
||||
{
|
||||
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
||||
msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
||||
}
|
||||
}
|
||||
start_pg_peering(pg);
|
||||
|
@@ -82,10 +82,10 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||
else
|
||||
{
|
||||
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
|
||||
auto fd_it = c_cli.osd_peer_fds.find(peer_osd);
|
||||
if (fd_it != c_cli.osd_peer_fds.end())
|
||||
auto fd_it = msgr.osd_peer_fds.find(peer_osd);
|
||||
if (fd_it != msgr.osd_peer_fds.end())
|
||||
{
|
||||
c_cli.stop_client(fd_it->second);
|
||||
msgr.stop_client(fd_it->second);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -188,7 +188,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
int peer_fd = c_cli.osd_peer_fds[peer_osd];
|
||||
int peer_fd = msgr.osd_peer_fds[peer_osd];
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
|
||||
op->peer_fd = peer_fd;
|
||||
@@ -196,7 +196,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
.sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE),
|
||||
},
|
||||
.len = count * sizeof(obj_ver_id),
|
||||
@@ -207,7 +207,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
|
||||
delete op;
|
||||
};
|
||||
c_cli.outbox_push(op);
|
||||
msgr.outbox_push(op);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -35,7 +35,7 @@
|
||||
#define MEM_ALIGNMENT 512
|
||||
#endif
|
||||
#define OSD_RW_MAX 64*1024*1024
|
||||
#define OSD_PROTOCOL_VERSION 1
|
||||
#define OSD_PROTOCOL_VERSION 2
|
||||
|
||||
// common request and reply headers
|
||||
struct __attribute__((__packed__)) osd_op_header_t
|
||||
@@ -74,8 +74,10 @@ struct __attribute__((__packed__)) osd_op_sec_rw_t
|
||||
// length
|
||||
uint32_t len;
|
||||
// bitmap/attribute length - bitmap comes after header, but before data
|
||||
uint32_t attr_len;
|
||||
uint32_t bitmap_len;
|
||||
uint32_t pad0;
|
||||
// inline bitmap (when it's no longer than 8 bytes)
|
||||
uint64_t bitmap;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_sec_rw_t
|
||||
@@ -84,8 +86,10 @@ struct __attribute__((__packed__)) osd_reply_sec_rw_t
|
||||
// for reads and writes: assigned or read version number
|
||||
uint64_t version;
|
||||
// for reads: bitmap/attribute length (just to double-check)
|
||||
uint32_t attr_len;
|
||||
uint32_t bitmap_len;
|
||||
uint32_t pad0;
|
||||
// inline bitmap (when it's no longer than 8 bytes)
|
||||
uint64_t bitmap;
|
||||
};
|
||||
|
||||
// delete object on the secondary OSD
|
||||
@@ -148,6 +152,8 @@ struct __attribute__((__packed__)) osd_reply_sec_read_bmp_t
|
||||
struct __attribute__((__packed__)) osd_op_show_config_t
|
||||
{
|
||||
osd_op_header_t header;
|
||||
// JSON request length
|
||||
uint64_t json_len;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_show_config_t
|
||||
@@ -197,6 +203,8 @@ struct __attribute__((__packed__)) osd_reply_rw_t
|
||||
// for reads: bitmap length
|
||||
uint32_t bitmap_len;
|
||||
uint32_t pad0;
|
||||
// inline bitmap (when it's no longer than 8 bytes)
|
||||
uint64_t bitmap;
|
||||
};
|
||||
|
||||
// sync to the primary OSD
|
||||
|
@@ -156,7 +156,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
if (immediate_commit != IMMEDIATE_ALL)
|
||||
{
|
||||
std::vector<int> to_stop;
|
||||
for (auto & cp: c_cli.clients)
|
||||
for (auto & cp: msgr.clients)
|
||||
{
|
||||
if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
|
||||
{
|
||||
@@ -165,7 +165,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
}
|
||||
for (auto peer_fd: to_stop)
|
||||
{
|
||||
c_cli.stop_client(peer_fd);
|
||||
msgr.stop_client(peer_fd);
|
||||
}
|
||||
}
|
||||
// Calculate current write OSD set
|
||||
@@ -175,7 +175,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
for (int role = 0; role < pg.target_set.size(); role++)
|
||||
{
|
||||
pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
|
||||
c_cli.osd_peer_fds.find(pg.target_set[role]) != c_cli.osd_peer_fds.end() ? pg.target_set[role] : 0;
|
||||
msgr.osd_peer_fds.find(pg.target_set[role]) != msgr.osd_peer_fds.end() ? pg.target_set[role] : 0;
|
||||
if (pg.cur_set[role] != 0)
|
||||
{
|
||||
pg.pg_cursize++;
|
||||
@@ -199,7 +199,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
{
|
||||
found = false;
|
||||
if (history_osd == this->osd_num ||
|
||||
c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end())
|
||||
msgr.osd_peer_fds.find(history_osd) != msgr.osd_peer_fds.end())
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
@@ -223,13 +223,13 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
std::set<osd_num_t> cur_peers;
|
||||
for (auto pg_osd: pg.all_peers)
|
||||
{
|
||||
if (pg_osd == this->osd_num || c_cli.osd_peer_fds.find(pg_osd) != c_cli.osd_peer_fds.end())
|
||||
if (pg_osd == this->osd_num || msgr.osd_peer_fds.find(pg_osd) != msgr.osd_peer_fds.end())
|
||||
{
|
||||
cur_peers.insert(pg_osd);
|
||||
}
|
||||
else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end())
|
||||
else if (msgr.wanted_peers.find(pg_osd) == msgr.wanted_peers.end())
|
||||
{
|
||||
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
||||
msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
|
||||
}
|
||||
}
|
||||
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
|
||||
@@ -325,7 +325,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
||||
else
|
||||
{
|
||||
// Peer
|
||||
auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]);
|
||||
auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = cl->peer_fd;
|
||||
@@ -333,7 +333,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
||||
.sec_sync = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_SYNC,
|
||||
},
|
||||
},
|
||||
@@ -347,14 +347,14 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
|
||||
int fail_fd = op->peer_fd;
|
||||
ps->list_ops.erase(role_osd);
|
||||
delete op;
|
||||
c_cli.stop_client(fail_fd);
|
||||
msgr.stop_client(fail_fd);
|
||||
return;
|
||||
}
|
||||
delete op;
|
||||
ps->list_ops.erase(role_osd);
|
||||
submit_list_subop(role_osd, ps);
|
||||
};
|
||||
c_cli.outbox_push(op);
|
||||
msgr.outbox_push(op);
|
||||
ps->list_ops[role_osd] = op;
|
||||
}
|
||||
}
|
||||
@@ -404,12 +404,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
// Peer
|
||||
osd_op_t *op = new osd_op_t();
|
||||
op->op_type = OSD_OP_OUT;
|
||||
op->peer_fd = c_cli.osd_peer_fds[role_osd];
|
||||
op->peer_fd = msgr.osd_peer_fds[role_osd];
|
||||
op->req = (osd_any_op_t){
|
||||
.sec_list = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_LIST,
|
||||
},
|
||||
.list_pg = ps->pg_num,
|
||||
@@ -427,7 +427,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
int fail_fd = op->peer_fd;
|
||||
ps->list_ops.erase(role_osd);
|
||||
delete op;
|
||||
c_cli.stop_client(fail_fd);
|
||||
msgr.stop_client(fail_fd);
|
||||
return;
|
||||
}
|
||||
printf(
|
||||
@@ -444,7 +444,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
ps->list_ops.erase(role_osd);
|
||||
delete op;
|
||||
};
|
||||
c_cli.outbox_push(op);
|
||||
msgr.outbox_push(op);
|
||||
ps->list_ops[role_osd] = op;
|
||||
}
|
||||
}
|
||||
|
@@ -235,7 +235,10 @@ resume_2:
|
||||
{
|
||||
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
|
||||
}
|
||||
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||
if (cur_op->reply.rw.bitmap_len <= 8)
|
||||
memcpy(&cur_op->reply.rw.bitmap, op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||
else
|
||||
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (stripes[role].req_end != 0)
|
||||
@@ -250,7 +253,10 @@ resume_2:
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||
if (cur_op->reply.rw.bitmap_len <= 8)
|
||||
memcpy(&cur_op->reply.rw.bitmap, op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||
else
|
||||
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
|
||||
cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
|
||||
}
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
|
@@ -236,14 +236,14 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
// Send to a remote OSD
|
||||
osd_op_t *subop = op_data->subops+subop_idx;
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = c_cli.osd_peer_fds.at(subop_osd_num);
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
|
||||
// FIXME: Use the pre-allocated buffer
|
||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||
subop->req = (osd_any_op_t){
|
||||
.sec_read_bmp = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_READ_BMP,
|
||||
},
|
||||
.len = sizeof(obj_ver_id)*(i+1-prev),
|
||||
@@ -273,7 +273,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
}
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
c_cli.outbox_push(subop);
|
||||
msgr.outbox_push(subop);
|
||||
subop_idx++;
|
||||
}
|
||||
prev = i+1;
|
||||
|
@@ -87,14 +87,14 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
||||
else
|
||||
{
|
||||
// FIXME add separate magic number for primary ops
|
||||
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (cl_it != c_cli.clients.end())
|
||||
auto cl_it = msgr.clients.find(cur_op->peer_fd);
|
||||
if (cl_it != msgr.clients.end())
|
||||
{
|
||||
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
|
||||
cur_op->reply.hdr.id = cur_op->req.hdr.id;
|
||||
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
|
||||
cur_op->reply.hdr.retval = retval;
|
||||
c_cli.outbox_push(cur_op);
|
||||
msgr.outbox_push(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -184,13 +184,13 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
else
|
||||
{
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
|
||||
subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->req.sec_rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = (uint64_t)(wr ? (rep ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
|
||||
},
|
||||
.oid = {
|
||||
@@ -200,7 +200,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
.version = op_version,
|
||||
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
|
||||
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
|
||||
.attr_len = wr ? clean_entry_bitmap_size : 0,
|
||||
.bitmap_len = wr ? clean_entry_bitmap_size : 0,
|
||||
};
|
||||
#ifdef OSD_DEBUG
|
||||
printf(
|
||||
@@ -227,7 +227,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
c_cli.outbox_push(subop);
|
||||
msgr.outbox_push(subop);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
@@ -282,20 +282,20 @@ void osd_t::add_bs_subop_stats(osd_op_t *subop)
|
||||
uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
c_cli.stats.op_stat_count[opcode]++;
|
||||
if (!c_cli.stats.op_stat_count[opcode])
|
||||
msgr.stats.op_stat_count[opcode]++;
|
||||
if (!msgr.stats.op_stat_count[opcode])
|
||||
{
|
||||
c_cli.stats.op_stat_count[opcode] = 1;
|
||||
c_cli.stats.op_stat_sum[opcode] = 0;
|
||||
c_cli.stats.op_stat_bytes[opcode] = 0;
|
||||
msgr.stats.op_stat_count[opcode] = 1;
|
||||
msgr.stats.op_stat_sum[opcode] = 0;
|
||||
msgr.stats.op_stat_bytes[opcode] = 0;
|
||||
}
|
||||
c_cli.stats.op_stat_sum[opcode] += (
|
||||
msgr.stats.op_stat_sum[opcode] += (
|
||||
(tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE)
|
||||
{
|
||||
c_cli.stats.op_stat_bytes[opcode] += subop->bs_op->len;
|
||||
msgr.stats.op_stat_bytes[opcode] += subop->bs_op->len;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -322,7 +322,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
if (subop->peer_fd >= 0)
|
||||
{
|
||||
// Drop connection on any error
|
||||
c_cli.stop_client(subop->peer_fd);
|
||||
msgr.stop_client(subop->peer_fd);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -332,8 +332,8 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
{
|
||||
uint64_t version = subop->reply.sec_rw.version;
|
||||
#ifdef OSD_DEBUG
|
||||
uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
|
||||
? c_cli.clients[subop->peer_fd]->osd_num : osd_num;
|
||||
uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
|
||||
? msgr.clients[subop->peer_fd]->osd_num : osd_num;
|
||||
printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
|
||||
#endif
|
||||
if (op_data->fact_ver != UINT64_MAX)
|
||||
@@ -465,11 +465,11 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_del = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_DELETE,
|
||||
},
|
||||
.oid = chunk.oid,
|
||||
@@ -479,7 +479,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -509,14 +509,14 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
}
|
||||
else if ((peer_it = c_cli.osd_peer_fds.find(sync_osd)) != c_cli.osd_peer_fds.end())
|
||||
else if ((peer_it = msgr.osd_peer_fds.find(sync_osd)) != msgr.osd_peer_fds.end())
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = peer_it->second;
|
||||
subops[i].req = (osd_any_op_t){ .sec_sync = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_SYNC,
|
||||
},
|
||||
} };
|
||||
@@ -524,7 +524,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -569,11 +569,11 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
else
|
||||
{
|
||||
subops[i].op_type = OSD_OP_OUT;
|
||||
subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
|
||||
subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
|
||||
subops[i].req = (osd_any_op_t){ .sec_stab = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
.id = c_cli.next_subop_id++,
|
||||
.id = msgr.next_subop_id++,
|
||||
.opcode = OSD_OP_SEC_STABILIZE,
|
||||
},
|
||||
.len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
|
||||
@@ -583,7 +583,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
{
|
||||
handle_primary_subop(subop, cur_op);
|
||||
};
|
||||
c_cli.outbox_push(&subops[i]);
|
||||
msgr.outbox_push(&subops[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -247,8 +247,8 @@ resume_8:
|
||||
finish:
|
||||
if (cur_op->peer_fd)
|
||||
{
|
||||
auto it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (it != c_cli.clients.end())
|
||||
auto it = msgr.clients.find(cur_op->peer_fd);
|
||||
if (it != msgr.clients.end())
|
||||
it->second->dirty_pgs.clear();
|
||||
}
|
||||
finish_op(cur_op, 0);
|
||||
|
@@ -370,8 +370,8 @@ lazy:
|
||||
}
|
||||
// Remember PG as dirty to drop the connection when PG goes offline
|
||||
// (this is required because of the "lazy sync")
|
||||
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
|
||||
if (cl_it != c_cli.clients.end())
|
||||
auto cl_it = msgr.clients.find(cur_op->peer_fd);
|
||||
if (cl_it != msgr.clients.end())
|
||||
{
|
||||
cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
|
||||
}
|
||||
|
@@ -20,9 +20,9 @@ void osd_t::secondary_op_callback(osd_op_t *op)
|
||||
if (op->req.hdr.opcode == OSD_OP_SEC_READ)
|
||||
{
|
||||
if (op->bs_op->retval >= 0)
|
||||
op->reply.sec_rw.attr_len = clean_entry_bitmap_size;
|
||||
op->reply.sec_rw.bitmap_len = clean_entry_bitmap_size;
|
||||
else
|
||||
op->reply.sec_rw.attr_len = 0;
|
||||
op->reply.sec_rw.bitmap_len = 0;
|
||||
if (op->bs_op->retval > 0)
|
||||
op->iov.push_back(op->buf, op->bs_op->retval);
|
||||
}
|
||||
@@ -81,7 +81,7 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
|
||||
{
|
||||
// Allocate memory for the read operation
|
||||
if (clean_entry_bitmap_size > sizeof(unsigned))
|
||||
if (clean_entry_bitmap_size > sizeof(void*))
|
||||
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(clean_entry_bitmap_size);
|
||||
else
|
||||
cur_op->bitmap = &cur_op->bmp_data;
|
||||
@@ -144,10 +144,49 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
|
||||
|
||||
void osd_t::exec_show_config(osd_op_t *cur_op)
|
||||
{
|
||||
// FIXME: Send the real config, not its source
|
||||
auto cfg_copy = config;
|
||||
cfg_copy["protocol_version"] = std::to_string(OSD_PROTOCOL_VERSION);
|
||||
std::string cfg_str = json11::Json(cfg_copy).dump();
|
||||
std::string json_err;
|
||||
json11::Json req_json = cur_op->req.show_conf.json_len > 0
|
||||
? json11::Json::parse(std::string((char *)cur_op->buf), json_err)
|
||||
: json11::Json();
|
||||
// Expose sensitive configuration values so peers can check them
|
||||
json11::Json::object wire_config = json11::Json::object {
|
||||
{ "osd_num", osd_num },
|
||||
{ "protocol_version", OSD_PROTOCOL_VERSION },
|
||||
{ "block_size", (uint64_t)bs_block_size },
|
||||
{ "bitmap_granularity", (uint64_t)bs_bitmap_granularity },
|
||||
{ "primary_enabled", run_primary },
|
||||
{ "blockstore_enabled", bs ? true : false },
|
||||
{ "readonly", readonly },
|
||||
{ "immediate_commit", (immediate_commit == IMMEDIATE_ALL ? "all" :
|
||||
(immediate_commit == IMMEDIATE_SMALL ? "small" : "none")) },
|
||||
{ "lease_timeout", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 },
|
||||
};
|
||||
#ifdef WITH_RDMA
|
||||
if (msgr.is_rdma_enabled())
|
||||
{
|
||||
// Indicate that RDMA is enabled
|
||||
wire_config["rdma_enabled"] = true;
|
||||
if (req_json["rdma_queues"].array_items().size())
|
||||
{
|
||||
// Peer is trying to connect using RDMA, try to satisfy him
|
||||
auto cl = msgr.clients.at(cur_op->peer_fd);
|
||||
bool ok = msgr.connect_rdma_client(cl, req_json["rdma_queues"], req_json["rdma_max_sge"].uint64_value());
|
||||
if (ok)
|
||||
{
|
||||
json11::Json::array rdma_queues;
|
||||
for (auto rdma_conn: cl->rdma_queues)
|
||||
{
|
||||
rdma_queues.push_back(rdma_conn->addr.to_string());
|
||||
}
|
||||
wire_config["rdma_queues"] = rdma_queues;
|
||||
wire_config["rdma_max_sge"] = msgr.get_rdma_max_sge();
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (cur_op->buf)
|
||||
free(cur_op->buf);
|
||||
std::string cfg_str = json11::Json(wire_config).dump();
|
||||
cur_op->buf = malloc_or_die(cfg_str.size()+1);
|
||||
memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
|
||||
cur_op->iov.push_back(cur_op->buf, cfg_str.size()+1);
|
||||
|
@@ -3,6 +3,7 @@
|
||||
export KEEP_DATA=1
|
||||
. `dirname $0`/common.sh
|
||||
|
||||
etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/mon/master
|
||||
etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/pg/state
|
||||
etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/osd/state
|
||||
|
||||
|
@@ -39,6 +39,12 @@ if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitast
|
||||
sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
|
||||
fi
|
||||
|
||||
# A lot of parallel syncs was crashing the primary OSD at some point
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -numjobs=64 -iodepth=1 -fsync=1 \
|
||||
-rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -number_ios=100
|
||||
|
||||
LD_PRELOAD=libasan.so.5 \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
|
||||
|
||||
|
Reference in New Issue
Block a user