Compare commits


17 Commits

Author SHA1 Message Date
eb9fc274e8 Debug prints 2021-04-29 02:08:28 +03:00
9681b62204 WIP multi-queue RDMA 2021-04-29 02:08:28 +03:00
8faf8f7b58 Inline bitmaps
Handy for zero-copy RDMA tests (removes 4-byte s/g entries)
2021-04-29 02:07:46 +03:00
ce777319c3 WIP RDMA support
Basic naive implementation works, but it's highly non-optimal as
RNR retransmissions occur all the time. RDMA expects the receiver
to always have place for incoming WRs...
2021-04-29 02:03:54 +03:00
f8ff39b0ab Rework continue_ops() to remove a CPU hot spot
This rework increases fio -rw=randread -iodepth=128 result from ~120k to ~180k iops :)
2021-04-29 01:50:13 +03:00
d749159585 Linked list experiment
Rework client operation queue from a vector to a linked list.
This is required to rework continue_ops() as its current implementation
consumes ~25% of client process CPU.
2021-04-29 01:47:33 +03:00
9703773a63 Fix has_flushes setting 2021-04-28 23:40:44 +03:00
5d8d486f7c Add SOVERSION 2021-04-20 01:01:32 +03:00
2b546cdd55 Link vitastor_blk with vitastor_common for timerfd_manager_t
Not really required to operate, but fixes a verify-elf error
2021-04-20 00:51:53 +03:00
bd7b177707 Report sensitive configuration values instead of the configuration source 2021-04-17 23:11:16 +03:00
33f9d03d22 Update documentation regarding image names and vitastor-nbd 2021-04-17 17:40:12 +03:00
82e6aff17b Support mapping NBD by the image name 2021-04-17 17:39:55 +03:00
57e2c503f7 Rename osd_t::c_cli to msgr 2021-04-17 16:32:09 +03:00
715bc8d53d Release 0.6.2
- Fix a possible crash during SYNC when journal fsyncs are enabled
- Fix a memory leak in the chained read implementation
2021-04-15 23:40:06 +03:00
0af077701c Fix a possible crash during SYNC when journal fsyncs are enabled 2021-04-15 02:01:50 +03:00
cac976ce25 Fix a memory leak in the chained read implementation 2021-04-15 01:42:18 +03:00
acf0646542 Build common sources once 2021-04-15 01:13:34 +03:00
37 changed files with 1884 additions and 286 deletions

View File

@@ -2,4 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
+set(VERSION "0.6.2")
add_subdirectory(src)

View File

@@ -314,14 +314,15 @@ Ceph:
### NBD
-NBD is currently the only way to mount Vitastor with the Linux kernel, but it
-leads to extra data copies and therefore slightly reduces performance - mostly
-sequential performance, though; random access is affected only slightly.
NBD stands for "Network Block Device", but in fact it also works simply as a FUSE
analogue for block devices, i.e. it is a "block device in userspace".
+NBD is currently the only way to mount Vitastor with the Linux kernel.
+NBD slightly reduces performance because it leads to extra data copies
+between the kernel and userspace. Still, the approach is reasonably optimal,
+and random-access performance is barely affected at all.
Vitastor with a single-threaded NBD proxy on the same testbed:
- T1Q1 write: 6000 iops (0.166ms latency)
- T1Q1 read: 5518 iops (0.18ms latency)
@@ -424,23 +425,90 @@ Vitastor with a single-threaded NBD proxy on the same testbed
- Start all OSDs: `systemctl start vitastor.target`
- Your cluster should be ready - one of the monitors should already have configured PGs, and the OSDs should have started them.
- You can check PG states directly in etcd: `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should be 'active'.
-- Test command example: `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
-- Example command to upload a VM image into vitastor via qemu-img:
-```
-qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
-```
-If you use unmodified QEMU, this command will need the `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so` environment variable.
-- QEMU launch command example:
-```
-qemu-system-x86_64 -enable-kvm -m 1024
--drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
--device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
--vnc 0.0.0.0:0
-```
-- Example command to remove an image (inode) from Vitastor:
-```
-vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
-```
+### Name an image
+```
+etcdctl --endpoints=<etcd> put /vitastor/config/inode/<pool>/<inode> '{"name":"<name>","size":<size>[,"parent_id":<parent_inode_number>][,"readonly":true]}'
+```
+For example:
+```
+etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/1 '{"name":"testimg","size":2147483648}'
+```
+If you set parent_id, the image becomes a CoW clone, i.e. all new write requests go to the new inode, while read
+requests check it first and then the parent layers up the chain. To avoid accidentally overwriting data in the
+parent layer, you can switch it to read-only mode by adding the `"readonly":true` flag to its metadata entry.
+In that case the parent image simply becomes a snapshot.
+Thus, to create a snapshot you just need to rename the previous inode (for example, from testimg to testimg@0),
+make it readonly and create a new layer with the original image name (testimg) that references the just-renamed
+one as its parent.
+### Run fio benchmarks
+Test command example:
+```
+fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
+```
+If you don't want to address the image by name, instead of `-image=testimg` you can specify the pool number, inode number and size:
+`-pool=1 -inode=1 -size=400G`.
+### Upload a VM disk image to/from Vitastor
+Use qemu-img with the `vitastor:etcd_host=<HOST>:image=<IMAGE>` string as the disk file name. For example:
+```
+qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg'
+```
+Note that if you use unmodified QEMU, you will need to set the
+`LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so` environment variable.
+If you don't want to address the image by name, instead of `:image=<IMAGE>` you can specify the pool number, inode number and size:
+`:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
+### Start a VM
+To start QEMU, use the `-drive file=vitastor:etcd_host=<HOST>:image=<IMAGE>` option (same as for qemu-img)
+and a 4 KB physical block size.
+For example:
+```
+qemu-system-x86_64 -enable-kvm -m 1024
+-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg',format=raw,if=none,id=drive-virtio-disk0,cache=none
+-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
+-vnc 0.0.0.0:0
+```
+Addressing by numbers (`:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`) works the same way as with qemu-img.
+### Remove an image
+Use the vitastor-rm tool. For example:
+```
+vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
+```
+### NBD
+To create a local block device, use NBD. For example:
+```
+vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
+```
+The command will print a device name like /dev/nbd0, which you can then format
+and use as a regular block device.
+To address the image by inode number, as with the other commands, you can use the
+`--pool <POOL> --inode <INODE> --size <SIZE>` options instead of `--image testimg`.
## Known Problems

View File

@@ -379,24 +379,86 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
For jerasure pools the configuration should look like the following: `2:{"name":"ecpool","scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`.
- At this point, one of the monitors will configure PGs and OSDs will start them.
- You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'.
-- Run tests with (for example): `fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -pool=1 -inode=1 -size=400G`.
-- Upload VM disk image with qemu-img (for example):
-```
-qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
-```
-Note that the command requires to be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
-if you use unmodified QEMU.
-- Run QEMU with (for example):
-```
-qemu-system-x86_64 -enable-kvm -m 1024
--drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648',format=raw,if=none,id=drive-virtio-disk0,cache=none
--device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
--vnc 0.0.0.0:0
-```
-- Remove inode with (for example):
-```
-vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
-```
+### Name an image
+```
+etcdctl --endpoints=<etcd> put /vitastor/config/inode/<pool>/<inode> '{"name":"<name>","size":<size>[,"parent_id":<parent_inode_number>][,"readonly":true]}'
+```
+For example:
+```
+etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/1 '{"name":"testimg","size":2147483648}'
+```
+If you specify parent_id the image becomes a CoW clone. I.e. all writes go to the new inode and reads first check it
+and then upper layers. You can then make parent readonly by updating its entry with `"readonly":true` for safety and
+basically treat it as a snapshot.
+So to create a snapshot you basically rename the previous upper layer (for example from testimg to testimg@0), make it readonly
+and create a new top layer with the original name (testimg) and the previous one as a parent.
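For illustration, assuming pool 1 with the existing image as inode 1 and a new empty top layer as inode 2 (these numbers and the 2 GB size are just reused from the naming example above), the snapshot steps could look like this:
```
# Freeze the current layer: rename it to testimg@0 and mark it read-only
etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/1 \
  '{"name":"testimg@0","size":2147483648,"readonly":true}'
# Create a new top layer with the original name, referencing the renamed inode as its parent
etcdctl --endpoints=http://10.115.0.10:2379/v3 put /vitastor/config/inode/1/2 \
  '{"name":"testimg","size":2147483648,"parent_id":1}'
```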
+### Run fio benchmarks
+fio command example:
+```
+fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4M -direct=1 -iodepth=16 -rw=write -etcd=10.115.0.10:2379/v3 -image=testimg
+```
+If you don't want to access your image by name, you can specify pool number, inode number and size
+(`-pool=1 -inode=1 -size=400G`) instead of the image name (`-image=testimg`).
+### Upload VM image
+Use qemu-img and `vitastor:etcd_host=<HOST>:image=<IMAGE>` disk filename. For example:
+```
+qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg'
+```
+Note that the command requires to be run with `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so qemu-img ...`
+if you use unmodified QEMU.
+You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`
+if you don't want to use inode metadata.
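For example, addressing by numbers rather than by name could look like this (pool, inode and size values reused from the earlier examples):
```
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=10.115.0.10\:2379/v3:pool=1:inode=1:size=2147483648'
```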
+### Start a VM
+Run QEMU with `-drive file=vitastor:etcd_host=<HOST>:image=<IMAGE>` and use 4 KB physical block size.
+For example:
+```
+qemu-system-x86_64 -enable-kvm -m 1024
+-drive 'file=vitastor:etcd_host=10.115.0.10\:2379/v3:image=testimg',format=raw,if=none,id=drive-virtio-disk0,cache=none
+-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
+-vnc 0.0.0.0:0
+```
+You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>`,
+just like in qemu-img.
+### Remove inode
+Use vitastor-rm. For example:
+```
+vitastor-rm --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
+```
+### NBD
+To create a local block device for a Vitastor image, use NBD. For example:
+```
+vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg
+```
+It will output the device name, like /dev/nbd0 which you can then format and mount as a normal block device.
+Again, you can use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
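For example, mapping by numbers instead of the name might look like this (pool, inode and size values are assumptions reused from the examples above):
```
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --size 2147483648
```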
## Known Problems

debian/changelog
View File

@@ -1,4 +1,4 @@
-vitastor (0.6.1-1) unstable; urgency=medium
+vitastor (0.6.2-1) unstable; urgency=medium
* Bugfixes

View File

@@ -40,10 +40,10 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
-cp -r /root/vitastor vitastor-0.6.1; \
-ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.1/qemu; \
-ln -s /root/fio-build/fio-*/ vitastor-0.6.1/fio; \
-cd vitastor-0.6.1; \
+cp -r /root/vitastor vitastor-0.6.2; \
+ln -s /root/packages/qemu-$REL/qemu-*/ vitastor-0.6.2/qemu; \
+ln -s /root/fio-build/fio-*/ vitastor-0.6.2/fio; \
+cd vitastor-0.6.2; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
QEMU=$(head -n1 qemu/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
sh copy-qemu-includes.sh; \
@@ -59,8 +59,8 @@ RUN set -e -x; \
echo "dep:fio=$FIO" > debian/substvars; \
echo "dep:qemu=$QEMU" >> debian/substvars; \
cd /root/packages/vitastor-$REL; \
-tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.1.orig.tar.xz vitastor-0.6.1; \
-cd vitastor-0.6.1; \
+tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.2.orig.tar.xz vitastor-0.6.2; \
+cd vitastor-0.6.2; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -48,4 +48,4 @@ FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Ve
QEMU=`rpm -qi qemu qemu-kvm | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
perl -i -pe 's/(Requires:\s*qemu(?:-kvm)?)([^\n]+)?/$1 = '$QEMU'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-0.6.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.1$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-0.6.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.2$(rpm --eval '%dist').tar.gz *

View File

@@ -37,7 +37,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
-cp /root/vitastor-0.6.1.el7.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-0.6.2.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
-Version: 0.6.1
+Version: 0.6.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
-Source0: vitastor-0.6.1.el7.tar.gz
+Source0: vitastor-0.6.2.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
-cp /root/vitastor-0.6.1.el8.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-0.6.2.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
-Version: 0.6.1
+Version: 0.6.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
-Source0: vitastor-0.6.1.el8.tar.gz
+Source0: vitastor-0.6.2.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -13,7 +13,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
-add_definitions(-DVERSION="0.6.1")
+add_definitions(-DVERSION="0.6.2")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -37,11 +37,16 @@ string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CM
find_package(PkgConfig)
pkg_check_modules(LIBURING REQUIRED liburing)
pkg_check_modules(GLIB REQUIRED glib-2.0)
+pkg_check_modules(IBVERBS libibverbs)
+if (IBVERBS_LIBRARIES)
+add_definitions(-DWITH_RDMA)
+endif (IBVERBS_LIBRARIES)
include_directories(
../
/usr/include/jerasure
${LIBURING_INCLUDE_DIRS}
+${IBVERBS_INCLUDE_DIRS}
)
# libvitastor_blk.so
@@ -52,7 +57,10 @@ add_library(vitastor_blk SHARED
target_link_libraries(vitastor_blk
${LIBURING_LIBRARIES}
tcmalloc_minimal
+# for timerfd_manager
+vitastor_common
)
+set_target_properties(vitastor_blk PROPERTIES VERSION ${VERSION} SOVERSION 0)
# libfio_vitastor_blk.so
add_library(fio_vitastor_blk SHARED
@@ -63,16 +71,28 @@ target_link_libraries(fio_vitastor_blk
vitastor_blk
)
+# libvitastor_common.a
+add_library(vitastor_common STATIC
+epoll_manager.cpp etcd_state_client.cpp
+messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
+http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
+)
+if (IBVERBS_LIBRARIES)
+target_sources(vitastor_common PRIVATE msgr_rdma.cpp)
+endif (IBVERBS_LIBRARIES)
+target_compile_options(vitastor_common PUBLIC -fPIC)
# vitastor-osd
add_executable(vitastor-osd
-osd_main.cpp osd.cpp osd_secondary.cpp msgr_receive.cpp msgr_send.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
+osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
-etcd_state_client.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp osd_cluster.cpp http_client.cpp osd_ops.cpp pg_states.cpp
-osd_rmw.cpp base64.cpp timerfd_manager.cpp epoll_manager.cpp ../json11/json11.cpp
+osd_cluster.cpp osd_rmw.cpp
)
target_link_libraries(vitastor-osd
+vitastor_common
vitastor_blk
Jerasure
+${IBVERBS_LIBRARIES}
)
# libfio_vitastor_sec.so
@@ -86,14 +106,15 @@ target_link_libraries(fio_vitastor_sec
# libvitastor_client.so
add_library(vitastor_client SHARED
-cluster_client.cpp epoll_manager.cpp etcd_state_client.cpp
-messenger.cpp msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ringloop.cpp ../json11/json11.cpp
-http_client.cpp osd_ops.cpp pg_states.cpp timerfd_manager.cpp base64.cpp
+cluster_client.cpp
)
target_link_libraries(vitastor_client
+vitastor_common
tcmalloc_minimal
${LIBURING_LIBRARIES}
+${IBVERBS_LIBRARIES}
)
+set_target_properties(vitastor_client PROPERTIES VERSION ${VERSION} SOVERSION 0)
# libfio_vitastor.so
add_library(fio_vitastor SHARED
@@ -162,11 +183,12 @@ target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal)
# stub_uring_osd
add_executable(stub_uring_osd
-stub_uring_osd.cpp epoll_manager.cpp messenger.cpp msgr_stop.cpp msgr_op.cpp
-msgr_send.cpp msgr_receive.cpp ringloop.cpp timerfd_manager.cpp ../json11/json11.cpp
+stub_uring_osd.cpp
)
target_link_libraries(stub_uring_osd
+vitastor_common
${LIBURING_LIBRARIES}
+${IBVERBS_LIBRARIES}
tcmalloc_minimal
)

View File

@@ -146,6 +146,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
+PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
return 1;

View File

@@ -53,6 +53,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
msgr.stop_client(op->peer_fd);
delete op;
};
+msgr.parse_config(this->config);
msgr.init();
st_cli.tfd = tfd;
@@ -108,6 +109,115 @@ cluster_op_t::~cluster_op_t()
}
}
void cluster_client_t::calc_wait(cluster_op_t *op)
{
op->prev_wait = 0;
if (op->opcode == OSD_OP_WRITE)
{
for (auto prev = op->prev; prev; prev = prev->prev)
{
if (prev->opcode == OSD_OP_SYNC ||
prev->opcode == OSD_OP_WRITE && !(op->flags & OP_FLUSH_BUFFER) && (prev->flags & OP_FLUSH_BUFFER))
{
op->prev_wait++;
}
}
if (!op->prev_wait && pgs_loaded)
continue_rw(op);
}
else if (op->opcode == OSD_OP_SYNC)
{
for (auto prev = op->prev; prev; prev = prev->prev)
{
if (prev->opcode == OSD_OP_SYNC || prev->opcode == OSD_OP_WRITE)
{
op->prev_wait++;
}
}
if (!op->prev_wait && pgs_loaded)
continue_sync(op);
}
else
{
for (auto prev = op->prev; prev; prev = prev->prev)
{
if (prev->opcode == OSD_OP_WRITE && prev->flags & OP_FLUSH_BUFFER)
{
op->prev_wait++;
}
else if (prev->opcode == OSD_OP_WRITE || prev->opcode == OSD_OP_READ)
{
// Flushes are always in the beginning
break;
}
}
if (!op->prev_wait && pgs_loaded)
continue_rw(op);
}
}
void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
{
if (opcode == OSD_OP_WRITE)
{
while (next)
{
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC ||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER) ||
next->opcode == OSD_OP_READ && (flags & OP_FLUSH_BUFFER))
{
next->prev_wait += inc;
if (!next->prev_wait)
{
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
}
}
next = n2;
}
}
else if (opcode == OSD_OP_SYNC)
{
while (next)
{
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
{
next->prev_wait += inc;
if (!next->prev_wait)
{
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
}
}
next = n2;
}
}
}
void cluster_client_t::erase_op(cluster_op_t *op)
{
uint64_t opcode = op->opcode, flags = op->flags;
cluster_op_t *next = op->next;
if (op->prev)
op->prev->next = op->next;
if (op->next)
op->next->prev = op->prev;
if (op_queue_head == op)
op_queue_head = op->next;
if (op_queue_tail == op)
op_queue_tail = op->prev;
op->next = op->prev = NULL;
std::function<void(cluster_op_t*)>(op->callback)(op);
if (!immediate_commit)
inc_wait(opcode, flags, next, -1);
}
void cluster_client_t::continue_ops(bool up_retry)
{
if (!pgs_loaded)
@@ -118,60 +228,25 @@ void cluster_client_t::continue_ops(bool up_retry)
if (continuing_ops)
{
// Attempt to reenter the function
continuing_ops = 2;
return;
}
restart:
continuing_ops = 1;
-op_queue_pos = 0;
-bool has_flushes = false, has_writes = false;
-while (op_queue_pos < op_queue.size())
+for (auto op = op_queue_head; op; )
{
-auto op = op_queue[op_queue_pos];
-bool rm = false, is_flush = op->flags & OP_FLUSH_BUFFER;
-auto opcode = op->opcode;
+cluster_op_t *next_op = op->next;
if (!op->up_wait || up_retry)
{
op->up_wait = false;
-if (opcode == OSD_OP_READ || opcode == OSD_OP_WRITE)
+if (!op->prev_wait)
{
-if (is_flush || !has_flushes)
-{
-// Regular writes can't proceed before buffer flushes
-rm = continue_rw(op);
-}
-}
-else if (opcode == OSD_OP_SYNC)
-{
-if (!has_writes)
-{
-// SYNC can't proceed before previous writes
-rm = continue_sync(op);
-}
+if (op->opcode == OSD_OP_SYNC)
+continue_sync(op);
+else
+continue_rw(op);
}
}
-if (opcode == OSD_OP_WRITE)
-{
-has_writes = has_writes || !rm;
-if (is_flush)
-{
-has_flushes = has_writes || !rm;
-}
-}
-else if (opcode == OSD_OP_SYNC)
-{
-// Postpone writes until previous SYNC completes
-// ...so dirty_writes can't contain anything newer than SYNC
-has_flushes = has_writes || !rm;
-}
-if (rm)
-{
-op_queue.erase(op_queue.begin()+op_queue_pos, op_queue.begin()+op_queue_pos+1);
-}
-else
-{
-op_queue_pos++;
-}
+op = next_op;
if (continuing_ops == 2)
{
goto restart;
@@ -213,11 +288,8 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
throw std::runtime_error("Bad block size");
}
-if (config["immediate_commit"] == "all")
-{
-// Cluster-wide immediate_commit mode
-immediate_commit = true;
-}
+// Cluster-wide immediate_commit mode
+immediate_commit = (config["immediate_commit"] == "all");
if (config.find("client_max_dirty_bytes") != config.end())
{
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@@ -281,7 +353,7 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
{
// At this point, all pool operations should have been suspended
// And now they have to be resliced!
-for (auto op: op_queue)
+for (auto op = op_queue_head; op; op = op->next)
{
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
INODE_POOL(op->cur_inode) == pool_item.first)
@@ -362,9 +434,17 @@ void cluster_client_t::execute(cluster_op_t *op)
{
delete sync_op;
};
-op_queue.push_back(sync_op);
+sync_op->prev = op_queue_tail;
+if (op_queue_tail)
+{
+op_queue_tail->next = sync_op;
+op_queue_tail = sync_op;
+}
+else
+op_queue_tail = op_queue_head = sync_op;
dirty_bytes = 0;
dirty_ops = 0;
+calc_wait(sync_op);
}
dirty_bytes += op->len;
dirty_ops++;
@@ -374,8 +454,23 @@ void cluster_client_t::execute(cluster_op_t *op)
dirty_bytes = 0;
dirty_ops = 0;
}
-op_queue.push_back(op);
-continue_ops();
+op->prev = op_queue_tail;
+if (op_queue_tail)
+{
+op_queue_tail->next = op;
+op_queue_tail = op;
+}
+else
+op_queue_tail = op_queue_head = op;
+if (!immediate_commit)
+calc_wait(op);
+else if (pgs_loaded)
+{
+if (op->opcode == OSD_OP_SYNC)
+continue_sync(op);
+else
+continue_rw(op);
+}
}
void cluster_client_t::copy_write(cluster_op_t *op, std::map<object_id, cluster_buffer_t> & dirty_buffers)
@@ -474,12 +569,16 @@ void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
}
delete op;
};
-op_queue.insert(op_queue.begin(), op);
-if (continuing_ops)
-{
-continuing_ops = 2;
-op_queue_pos++;
-}
+op->next = op_queue_head;
+if (op_queue_head)
+{
+op_queue_head->prev = op;
+op_queue_head = op;
+}
+else
+op_queue_tail = op_queue_head = op;
+inc_wait(op->opcode, op->flags, op->next, 1);
+continue_rw(op);
}
int cluster_client_t::continue_rw(cluster_op_t *op)
@@ -496,7 +595,7 @@ resume_0:
if (!op->len || op->offset % bs_bitmap_granularity || op->len % bs_bitmap_granularity)
{
op->retval = -EINVAL;
-std::function<void(cluster_op_t*)>(op->callback)(op);
+erase_op(op);
return 1;
}
{
@@ -504,7 +603,7 @@ resume_0:
if (!pool_id)
{
op->retval = -EINVAL;
-std::function<void(cluster_op_t*)>(op->callback)(op);
+erase_op(op);
return 1;
}
if (st_cli.pool_config.find(pool_id) == st_cli.pool_config.end() ||
@@ -520,7 +619,7 @@ resume_0:
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
{
op->retval = -EINVAL;
-std::function<void(cluster_op_t*)>(op->callback)(op);
+erase_op(op);
return 1;
}
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
@@ -603,13 +702,13 @@ resume_3:
}
}
op->retval = op->len;
-std::function<void(cluster_op_t*)>(op->callback)(op);
+erase_op(op);
return 1;
}
else if (op->retval != 0 && op->retval != -EPIPE)
{
// Fatal error (not -EPIPE)
-std::function<void(cluster_op_t*)>(op->callback)(op);
+erase_op(op);
return 1;
}
else
@@ -849,7 +948,7 @@ int cluster_client_t::continue_sync(cluster_op_t *op)
{
// Sync is not required in the immediate_commit mode or if there are no dirty_osds
op->retval = 0;
-std::function<void(cluster_op_t*)>(op->callback)(op);
+erase_op(op);
return 1;
}
// Check that all OSD connections are still alive
@@ -924,7 +1023,7 @@ resume_1:
uw_it++;
}
}
-std::function<void(cluster_op_t*)>(op->callback)(op);
+erase_op(op);
return 1;
}
@@ -1008,7 +1107,10 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
}
if (op->inflight_count == 0)
{
-continue_ops();
+if (op->opcode == OSD_OP_SYNC)
+continue_sync(op);
+else
+continue_rw(op);
}
}

View File

@@ -36,7 +36,7 @@ struct cluster_op_t
std::function<void(cluster_op_t*)> callback;
~cluster_op_t();
protected:
-int flags = 0;
+uint64_t flags = 0;
int state = 0;
uint64_t cur_inode; // for snapshot reads
void *buf = NULL;
@@ -47,6 +47,8 @@ protected:
std::vector<cluster_op_part_t> parts;
void *bitmap_buf = NULL, *part_bitmaps = NULL;
unsigned bitmap_buf_size = 0;
+cluster_op_t *prev = NULL, *next = NULL;
+int prev_wait = 0;
friend class cluster_client_t;
};
@@ -66,7 +68,8 @@ class cluster_client_t
uint64_t bs_block_size = 0;
uint32_t bs_bitmap_granularity = 0, bs_bitmap_size = 0;
std::map<pool_id_t, uint64_t> pg_counts;
-bool immediate_commit = false;
+// WARNING: initially true so execute() doesn't create fake sync
+bool immediate_commit = true;
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
uint64_t client_max_dirty_bytes = 0;
uint64_t client_max_dirty_ops = 0;
@@ -76,7 +79,7 @@ class cluster_client_t
int retry_timeout_id = 0;
uint64_t op_id = 1;
std::vector<cluster_op_t*> offline_ops;
-std::vector<cluster_op_t*> op_queue;
+cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
std::map<object_id, cluster_buffer_t> dirty_buffers;
std::set<osd_num_t> dirty_osds;
uint64_t dirty_bytes = 0, dirty_ops = 0;
@@ -88,7 +91,6 @@ class cluster_client_t
ring_consumer_t consumer;
std::vector<std::function<void(void)>> on_ready_hooks;
int continuing_ops = 0;
-int op_queue_pos = 0;
public:
etcd_state_client_t st_cli;
@@ -117,4 +119,7 @@ protected:
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
void handle_op_part(cluster_op_part_t *part);
void copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part);
+void erase_op(cluster_op_t *op);
+void calc_wait(cluster_op_t *op);
+void inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc);
};

View File

@@ -53,6 +53,10 @@ struct sec_options
uint64_t inode = 0;
int cluster_log = 0;
int trace = 0;
int use_rdma = 0;
int rdma_port_num = 0;
int rdma_gid_index = 0;
int rdma_mtu = 0;
};
static struct fio_option options[] = {
@@ -121,6 +125,26 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "use_rdma",
.lname = "Use RDMA",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct sec_options, use_rdma),
.help = "Use RDMA",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "rdma_gid_index",
.lname = "RDMA gid index",
.type = FIO_OPT_INT,
.off1 = offsetof(struct sec_options, rdma_gid_index),
.help = "RDMA gid index",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = NULL,
},
@@ -156,6 +180,8 @@ static int sec_setup(struct thread_data *td)
{ "etcd_address", std::string(o->etcd_host) }, { "etcd_address", std::string(o->etcd_host) },
{ "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/vitastor") }, { "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/vitastor") },
{ "log_level", o->cluster_log }, { "log_level", o->cluster_log },
{ "use_rdma", o->use_rdma },
{ "rdma_gid_index", o->rdma_gid_index },
};
if (!o->image)
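For reference, a hypothetical fio invocation combining these new engine options with the README's example might look like the sketch below (the etcd address and image name are reused from the documentation above; -rdma_gid_index=0 is simply the default, and whether RDMA is actually used also depends on the OSD side):
```
fio -thread -ioengine=libfio_vitastor.so -name=test -bs=4k -direct=1 -iodepth=128 \
    -rw=randread -etcd=10.115.0.10:2379/v3 -image=testimg -use_rdma=1 -rdma_gid_index=0
```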

View File

@@ -12,6 +12,31 @@
void osd_messenger_t::init()
{
#ifdef WITH_RDMA
if (use_rdma)
{
rdma_context = msgr_rdma_context_t::create(
rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu
);
if (!rdma_context)
{
printf("[OSD %lu] Couldn't initialize RDMA, proceeding with TCP only\n", osd_num);
}
else
{
rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
printf("[OSD %lu] RDMA initialized successfully\n", osd_num);
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
{
handle_rdma_events();
});
handle_rdma_events();
}
}
#endif
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
{
std::vector<int> to_stop;
@@ -19,7 +44,7 @@ void osd_messenger_t::init()
for (auto cl_it = clients.begin(); cl_it != clients.end(); cl_it++)
{
auto cl = cl_it->second;
-if (!cl->osd_num || cl->peer_state != PEER_CONNECTED)
+if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
{
// Do not run keepalive on regular clients
continue;
@@ -94,10 +119,29 @@ osd_messenger_t::~osd_messenger_t()
{
stop_client(clients.begin()->first, true);
}
#ifdef WITH_RDMA
if (rdma_context)
{
delete rdma_context;
}
#endif
}
void osd_messenger_t::parse_config(const json11::Json & config)
{
#ifdef WITH_RDMA
if (!config["use_rdma"].is_null())
this->use_rdma = config["use_rdma"].bool_value() || config["use_rdma"].uint64_value() != 0;
this->rdma_device = config["rdma_device"].string_value();
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
if (!this->rdma_port_num)
this->rdma_port_num = 1;
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
#endif
this->bs_bitmap_granularity = strtoull(config["bitmap_granularity"].string_value().c_str(), NULL, 10);
if (!this->bs_bitmap_granularity)
this->bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
config["use_sync_send_recv"].uint64_value();
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
@@ -326,6 +370,37 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
},
},
};
#ifdef WITH_RDMA
if (rdma_context)
{
for (int i = 0; i < rdma_queues_per_connection; i++)
{
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge);
if (!rdma_conn)
{
break;
}
cl->rdma_queues.push_back(rdma_conn);
}
if (cl->rdma_queues.size())
{
json11::Json::array addresses;
for (auto rdma_conn: cl->rdma_queues)
{
addresses.push_back(rdma_conn->addr.to_string());
}
json11::Json payload = json11::Json::object {
{ "rdma_queues", addresses },
{ "rdma_max_sge", rdma_max_sge },
};
std::string payload_str = payload.dump();
op->req.show_conf.json_len = payload_str.size();
op->buf = malloc_or_die(payload_str.size());
op->iov.push_back(op->buf, payload_str.size());
memcpy(op->buf, payload_str.c_str(), payload_str.size());
}
}
#endif
op->callback = [this, cl](osd_op_t *op)
{
std::string json_err;
@@ -361,12 +436,20 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
}
if (err)
{
-osd_num_t osd_num = cl->osd_num;
+osd_num_t peer_osd = cl->osd_num;
stop_client(op->peer_fd);
-on_connect_peer(osd_num, -1);
+on_connect_peer(peer_osd, -1);
delete op;
return;
}
#ifdef WITH_RDMA
if (!connect_rdma_server(cl, config["rdma_queues"], config["rdma_max_sge"].uint64_value()))
{
// FIXME: Keep TCP connection in this case
delete op;
return;
}
#endif
osd_peer_fds[cl->osd_num] = cl->peer_fd;
on_connect_peer(cl->osd_num, cl->peer_fd);
delete op;
@@ -408,3 +491,8 @@ void osd_messenger_t::accept_connections(int listen_fd)
throw std::runtime_error(std::string("accept: ") + strerror(errno)); throw std::runtime_error(std::string("accept: ") + strerror(errno));
} }
} }
bool osd_messenger_t::is_rdma_enabled()
{
return rdma_context != NULL;
}

View File

@@ -18,21 +18,36 @@
#include "timerfd_manager.h" #include "timerfd_manager.h"
#include <ringloop.h> #include <ringloop.h>
#ifdef WITH_RDMA
#include "msgr_rdma.h"
#endif
#define CL_READ_HDR 1
#define CL_READ_DATA 2
#define CL_READ_REPLY_DATA 3
#define CL_WRITE_READY 1
-#define CL_WRITE_REPLY 2
#define PEER_CONNECTING 1
#define PEER_CONNECTED 2
-#define PEER_STOPPED 3
+#define PEER_RDMA_CONNECTING 3
+#define PEER_RDMA 4
+#define PEER_STOPPED 5
#define DEFAULT_PEER_CONNECT_INTERVAL 5
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
#define DEFAULT_OSD_PING_TIMEOUT 5
#define DEFAULT_BITMAP_GRANULARITY 4096
#define MSGR_SENDP_HDR 1
#define MSGR_SENDP_FREE 2
#define MSGR_SENDP_BMP 4
struct msgr_sendp_t
{
osd_op_t *op;
int flags;
};
struct osd_client_t
{
int refs = 0;
@@ -48,6 +63,10 @@ struct osd_client_t
void *in_buf = NULL;
#ifdef WITH_RDMA
std::vector<msgr_rdma_connection_t*> rdma_queues;
#endif
// Read state
int read_ready = 0;
osd_op_t *read_op = NULL;
@@ -70,7 +89,7 @@ struct osd_client_t
msghdr write_msg = { 0 };
int write_state = 0;
std::vector<iovec> send_list, next_send_list;
-std::vector<osd_op_t*> outbox, next_outbox;
+std::vector<msgr_sendp_t> outbox, next_outbox;
~osd_client_t()
{
@@ -110,9 +129,19 @@ protected:
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
+uint32_t bs_bitmap_granularity = 0;
int log_level = 0;
bool use_sync_send_recv = false;
#ifdef WITH_RDMA
bool use_rdma = true;
std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL;
int rdma_queues_per_connection = 128;
int rdma_max_sge = 128, rdma_max_send = 32, rdma_max_recv = 32;
#endif
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
std::vector<std::function<void()>> set_immediate;
@@ -141,6 +170,12 @@ public:
void accept_connections(int listen_fd);
~osd_messenger_t();
#ifdef WITH_RDMA
bool is_rdma_enabled();
bool connect_rdma_client(osd_client_t *cl, json11::Json rdma_addresses, uint64_t client_max_sge);
int get_rdma_max_sge();
#endif
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
@@ -160,4 +195,11 @@ protected:
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
#ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl);
void try_recv_rdma(osd_client_t *cl, msgr_rdma_connection_t *rc);
void handle_rdma_events();
bool connect_rdma_server(osd_client_t *cl, json11::Json rdma_addresses, uint64_t server_max_sge);
#endif
};

src/msgr_rdma.cpp (new file, 962 lines)
View File

@@ -0,0 +1,962 @@
#include <stdio.h>
#include <stdlib.h>
#include "msgr_rdma.h"
#include "messenger.h"
std::string msgr_rdma_address_t::to_string()
{
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
sprintf(
msg, "%04x:%06x:%06x:%016lx%016lx", lid, qpn, psn,
htobe64(((uint64_t*)&gid)[0]), htobe64(((uint64_t*)&gid)[1])
);
return std::string(msg);
}
bool msgr_rdma_address_t::from_string(const char *str, msgr_rdma_address_t *dest)
{
uint64_t* gid = (uint64_t*)&dest->gid;
int n = sscanf(
str, "%hx:%x:%x:%16lx%16lx", &dest->lid, &dest->qpn, &dest->psn, gid, gid+1
);
gid[0] = be64toh(gid[0]);
gid[1] = be64toh(gid[1]);
return n == 5;
}
msgr_rdma_context_t::~msgr_rdma_context_t()
{
if (cq)
ibv_destroy_cq(cq);
if (channel)
ibv_destroy_comp_channel(channel);
if (mr)
ibv_dereg_mr(mr);
if (pd)
ibv_dealloc_pd(pd);
if (context)
ibv_close_device(context);
}
msgr_rdma_connection_t::~msgr_rdma_connection_t()
{
ctx->used_max_cqe -= max_send+max_recv;
if (qp)
ibv_destroy_qp(qp);
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu)
{
int res;
ibv_device **dev_list = NULL;
msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
ctx->mtu = mtu;
dev_list = ibv_get_device_list(NULL);
if (!dev_list)
{
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
goto cleanup;
}
if (!ib_devname)
{
ctx->dev = *dev_list;
if (!ctx->dev)
{
fprintf(stderr, "No RDMA devices found\n");
goto cleanup;
}
}
else
{
int i;
for (i = 0; dev_list[i]; ++i)
if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname))
break;
ctx->dev = dev_list[i];
if (!ctx->dev)
{
fprintf(stderr, "RDMA device %s not found\n", ib_devname);
goto cleanup;
}
}
ctx->context = ibv_open_device(ctx->dev);
if (!ctx->context)
{
fprintf(stderr, "Couldn't get RDMA context for %s\n", ibv_get_device_name(ctx->dev));
goto cleanup;
}
ctx->ib_port = ib_port;
ctx->gid_index = gid_index;
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
{
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
goto cleanup;
}
ctx->my_lid = ctx->portinfo.lid;
if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && !ctx->my_lid)
{
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
goto cleanup;
}
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
goto cleanup;
}
ctx->pd = ibv_alloc_pd(ctx->context);
if (!ctx->pd)
{
fprintf(stderr, "Couldn't allocate RDMA protection domain\n");
goto cleanup;
}
{
if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
{
fprintf(stderr, "Couldn't query RDMA device for its features\n");
goto cleanup;
}
if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
{
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
goto cleanup;
}
}
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
}
ctx->channel = ibv_create_comp_channel(ctx->context);
if (!ctx->channel)
{
fprintf(stderr, "Couldn't create RDMA completion channel\n");
goto cleanup;
}
ctx->max_cqe = 4096;
ctx->cq = ibv_create_cq(ctx->context, ctx->max_cqe, NULL, ctx->channel, 0);
if (!ctx->cq)
{
fprintf(stderr, "Couldn't create RDMA completion queue\n");
goto cleanup;
}
if (dev_list)
ibv_free_device_list(dev_list);
return ctx;
cleanup:
delete ctx;
if (dev_list)
ibv_free_device_list(dev_list);
return NULL;
}
msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge)
{
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
max_sge = max_sge > ctx->attrx.orig_attr.max_sge ? ctx->attrx.orig_attr.max_sge : max_sge;
conn->ctx = ctx;
conn->max_send = max_send;
conn->max_recv = max_recv;
conn->max_sge = max_sge;
ctx->used_max_cqe += max_send+max_recv;
if (ctx->used_max_cqe > ctx->max_cqe)
{
// Resize CQ
// Mellanox ConnectX-4 supports up to 4194303 CQEs, so it's fine to put everything into a single CQ
int new_max_cqe = ctx->max_cqe;
while (ctx->used_max_cqe > new_max_cqe)
{
new_max_cqe *= 2;
}
if (ibv_resize_cq(ctx->cq, new_max_cqe) != 0)
{
fprintf(stderr, "Couldn't resize RDMA completion queue to %d entries\n", new_max_cqe);
delete conn;
return NULL;
}
ctx->max_cqe = new_max_cqe;
}
ibv_qp_init_attr init_attr = {
.send_cq = ctx->cq,
.recv_cq = ctx->cq,
.cap = {
.max_send_wr = max_send,
.max_recv_wr = max_recv,
.max_send_sge = max_sge,
.max_recv_sge = max_sge,
},
.qp_type = IBV_QPT_RC,
};
conn->qp = ibv_create_qp(ctx->pd, &init_attr);
if (!conn->qp)
{
fprintf(stderr, "Couldn't create RDMA queue pair\n");
delete conn;
return NULL;
}
conn->addr.lid = ctx->my_lid;
conn->addr.gid = ctx->my_gid;
conn->addr.qpn = conn->qp->qp_num;
conn->addr.psn = lrand48() & 0xffffff;
ibv_qp_attr attr = {
.qp_state = IBV_QPS_INIT,
.qp_access_flags = 0,
.pkey_index = 0,
.port_num = ctx->ib_port,
};
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS))
{
fprintf(stderr, "Failed to switch RDMA queue pair to INIT state\n");
delete conn;
return NULL;
}
return conn;
}
static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
{
switch (mtu)
{
case 256: return IBV_MTU_256;
case 512: return IBV_MTU_512;
case 1024: return IBV_MTU_1024;
case 2048: return IBV_MTU_2048;
case 4096: return IBV_MTU_4096;
}
return IBV_MTU_4096;
}
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
{
auto conn = this;
ibv_qp_attr attr = {
.qp_state = IBV_QPS_RTR,
.path_mtu = mtu_to_ibv_mtu(conn->ctx->mtu),
.rq_psn = dest->psn,
.sq_psn = conn->addr.psn,
.dest_qp_num = dest->qpn,
.ah_attr = {
.grh = {
.dgid = dest->gid,
.sgid_index = conn->ctx->gid_index,
.hop_limit = 1, // FIXME can it vary?
},
.dlid = dest->lid,
.sl = 0, // service level
.src_path_bits = 0,
.is_global = (uint8_t)(dest->gid.global.interface_id ? 1 : 0),
.port_num = conn->ctx->ib_port,
},
.max_rd_atomic = 1,
.max_dest_rd_atomic = 1,
// Timeout and min_rnr_timer actual values seem to be 4.096us*2^(timeout+1)
.min_rnr_timer = 1,
.timeout = 14,
.retry_cnt = 7,
.rnr_retry = 7,
};
// FIXME No idea if ibv_modify_qp is a blocking operation or not. No idea if it has a timeout and what it is.
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER))
{
fprintf(stderr, "Failed to switch RDMA queue pair to RTR (ready-to-receive) state\n");
return 1;
}
attr.qp_state = IBV_QPS_RTS;
if (ibv_modify_qp(conn->qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC))
{
fprintf(stderr, "Failed to switch RDMA queue pair to RTS (ready-to-send) state\n");
return 1;
}
return 0;
}
// Being the client, connect all server's RDMA queues to our local (client) queues
bool osd_messenger_t::connect_rdma_server(osd_client_t *cl, json11::Json rdma_addresses, uint64_t server_max_sge)
{
if (rdma_addresses.array_items().size() > 0)
{
if (!server_max_sge || server_max_sge > rdma_max_sge)
{
server_max_sge = rdma_max_sge;
}
int n_conn = rdma_addresses.array_items().size();
if (n_conn < cl->rdma_queues.size())
{
for (int i = n_conn; i < cl->rdma_queues.size(); i++)
{
delete cl->rdma_queues[i];
}
cl->rdma_queues.resize(n_conn);
}
else if (n_conn > cl->rdma_queues.size())
{
n_conn = cl->rdma_queues.size();
}
for (int i = 0; i < n_conn; i++)
{
msgr_rdma_address_t addr;
if (!msgr_rdma_address_t::from_string(rdma_addresses[i].string_value().c_str(), &addr) ||
cl->rdma_queues[i]->connect(&addr) != 0)
{
printf(
"Failed to connect to OSD %lu (address %s) using RDMA\n",
cl->osd_num, rdma_addresses[i].string_value().c_str()
);
// FIXME: Keep TCP connection in this case
osd_num_t peer_osd = cl->osd_num;
stop_client(cl->peer_fd);
on_connect_peer(peer_osd, -1);
return false;
}
else
{
printf("Connected local queue %d to OSD %lu queue %d using RDMA\n", cl->rdma_queues[i]->qp->qp_num, cl->osd_num, addr.qpn);
if (cl->rdma_queues[i]->max_sge > server_max_sge)
{
cl->rdma_queues[i]->max_sge = server_max_sge;
}
}
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, NULL);
}
else
{
for (auto rdma_conn: cl->rdma_queues)
{
delete rdma_conn;
}
cl->rdma_queues.resize(0);
}
return true;
}
// Being the server, try to connect all client's RDMA queues to our local (server) queues
bool osd_messenger_t::connect_rdma_client(osd_client_t *cl, json11::Json rdma_addresses, uint64_t client_max_sge)
{
if (rdma_addresses.array_items().size() > 0)
{
if (!client_max_sge || client_max_sge > rdma_max_sge)
{
client_max_sge = rdma_max_sge;
}
int n_conn = rdma_addresses.array_items().size();
if (n_conn > rdma_queues_per_connection)
{
n_conn = rdma_queues_per_connection;
}
for (int i = 0; i < n_conn; i++)
{
msgr_rdma_address_t addr;
if (msgr_rdma_address_t::from_string(rdma_addresses[i].string_value().c_str(), &addr))
{
auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, client_max_sge);
if (rdma_conn && rdma_conn->connect(&addr) == 0)
{
printf("Connected local queue %d to client %d queue %d using RDMA\n", rdma_conn->qp->qp_num, cl->peer_fd, addr.qpn);
cl->rdma_queues.push_back(rdma_conn);
}
else
{
if (rdma_conn)
{
delete rdma_conn;
}
printf(
"Failed to connect RDMA queue pair to %s (client %d queue %d)\n",
addr.to_string().c_str(), cl->peer_fd, i+1
);
// Delete all RDMA queues to keep the TCP connection
for (int j = 0; j < cl->rdma_queues.size(); j++)
{
delete cl->rdma_queues[j];
}
cl->rdma_queues.resize(0);
return false;
}
}
}
// Switch to RDMA state only after sending the configuration response
cl->peer_state = PEER_RDMA_CONNECTING;
for (int i = 0; i < cl->rdma_queues.size(); i++)
{
try_recv_rdma(cl, cl->rdma_queues[i]);
}
}
return true;
}
static void try_send_rdma_wr(msgr_rdma_connection_t *rc, uint64_t wr_id, ibv_sge *sge, int op_sge)
{
timespec tv;
clock_gettime(CLOCK_REALTIME, &tv);
uint64_t total = 0;
for (int i = 0; i < op_sge; i++)
total += sge[i].length;
printf("%lu.%09lu RDMA send to queue %d: %lu bytes\n", tv.tv_sec, tv.tv_nsec, rc->qp->qp_num, total);
ibv_send_wr *bad_wr = NULL;
ibv_send_wr wr = {
.wr_id = wr_id,
.sg_list = sge,
.num_sge = op_sge,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
};
int err = ibv_post_send(rc->qp, &wr, &bad_wr);
if (err || bad_wr)
{
printf("RDMA send failed: %s\n", strerror(err));
exit(1);
}
rc->cur_send++;
}
static void try_recv_rdma_wr(msgr_rdma_connection_t *rc, uint64_t wr_id, ibv_sge *sge, int op_sge)
{
timespec tv;
clock_gettime(CLOCK_REALTIME, &tv);
uint64_t total = 0;
for (int i = 0; i < op_sge; i++)
total += sge[i].length;
printf("%lu.%09lu RDMA receive from queue %d: %lu bytes\n", tv.tv_sec, tv.tv_nsec, rc->qp->qp_num, total);
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
.wr_id = wr_id,
.sg_list = sge,
.num_sge = op_sge,
};
int err = ibv_post_recv(rc->qp, &wr, &bad_wr);
if (err || bad_wr)
{
printf("RDMA receive failed: %s\n", strerror(err));
exit(1);
}
rc->cur_recv++;
}
static bool try_recv_rdma_read(osd_client_t *cl, msgr_rdma_connection_t *rc, osd_op_t *cur_op, uint32_t bs_bitmap_granularity)
{
int op_size = bs_bitmap_granularity, op_sge = 1, op_max = rc->max_sge*bs_bitmap_granularity;
iovec *segments = cur_op->iov.get_iovec();
ibv_sge sge[rc->max_sge];
sge[0] = {
.addr = (uintptr_t)cur_op->reply.buf,
.length = (uint32_t)OSD_PACKET_SIZE,
.lkey = rc->ctx->mr->lkey,
};
while (rc->recv_pos < cur_op->iov.get_size())
{
iovec & iov = segments[rc->recv_pos];
if (op_size >= op_max || op_sge >= rc->max_sge)
{
try_recv_rdma_wr(rc, cl->peer_fd, sge, op_sge);
op_sge = 0;
op_size = 0;
if (rc->cur_recv >= rc->max_recv)
{
// FIXME: the receive queue is full here (cur_recv >= max_recv), this overflow is not handled yet
exit(1);
}
}
// Receive in (max_sge*4k) fragments
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->recv_buf_pos < op_max
? iov.iov_len-rc->recv_buf_pos : op_max-op_size);
sge[op_sge++] = {
.addr = (uintptr_t)(iov.iov_base+rc->recv_buf_pos),
.length = len,
.lkey = rc->ctx->mr->lkey,
};
op_size += len;
rc->recv_buf_pos += len;
if (rc->recv_buf_pos >= iov.iov_len)
{
rc->recv_pos++;
rc->recv_buf_pos = 0;
}
}
if (op_sge > 0)
{
try_recv_rdma_wr(rc, cl->peer_fd, sge, op_sge);
}
rc->recv_pos = 0;
rc->recv_buf_pos = 0;
return true;
}
static bool try_send_rdma_read(osd_client_t *cl, msgr_rdma_connection_t *rc, osd_op_t *cur_op, int op_list_size, uint32_t bs_bitmap_granularity)
{
ibv_sge sge[rc->max_sge];
int op_size = bs_bitmap_granularity, op_sge = 1, op_max = rc->max_sge*bs_bitmap_granularity;
sge[0] = {
.addr = (uintptr_t)cl->send_list[0].iov_base,
.length = (uint32_t)cl->send_list[0].iov_len,
.lkey = rc->ctx->mr->lkey,
};
rc->send_pos = 1;
while (rc->send_pos < op_list_size)
{
iovec & iov = cl->send_list[rc->send_pos];
if (cl->outbox[rc->send_pos].flags & MSGR_SENDP_HDR)
{
if (op_sge > 0)
{
try_send_rdma_wr(rc, cl->peer_fd, sge, op_sge);
op_sge = 0;
op_size = 0;
if (rc->cur_send >= rc->max_send)
break;
}
assert(rc->send_buf_pos == 0);
sge[0] = {
.addr = (uintptr_t)iov.iov_base,
.length = (uint32_t)iov.iov_len,
.lkey = rc->ctx->mr->lkey,
};
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
rc->send_pos++;
if (rc->cur_send >= rc->max_send)
break;
}
else
{
if (op_size >= op_max || op_sge >= rc->max_sge)
{
try_send_rdma_wr(rc, cl->peer_fd, sge, op_sge);
op_sge = 0;
op_size = 0;
if (rc->cur_send >= rc->max_send)
break;
}
// Fragment all messages into parts no longer than (max_sge*4k) = 120k on ConnectX-4
// Otherwise the client may not be able to receive them in small parts
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < op_max ? iov.iov_len-rc->send_buf_pos : op_max-op_size);
sge[op_sge++] = {
.addr = (uintptr_t)(iov.iov_base+rc->send_buf_pos),
.length = len,
.lkey = rc->ctx->mr->lkey,
};
op_size += len;
rc->send_buf_pos += len;
if (rc->send_buf_pos >= iov.iov_len)
{
rc->send_pos++;
rc->send_buf_pos = 0;
}
}
}
if (op_sge > 0)
{
try_send_rdma_wr(rc, cl->peer_fd, sge, op_sge);
}
if (op_list_size == 1)
{
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
cur_op->req.hdr.opcode == OSD_OP_READ)
{
sge[0] = {
.addr = 0,
.length = 0,
.lkey = rc->ctx->mr->lkey,
};
uint64_t data_size = cur_op->req.hdr.opcode == OSD_OP_SEC_READ
? cur_op->req.sec_rw.len
: cur_op->req.rw.len;
while (data_size >= op_max)
{
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
data_size -= op_max;
}
if (data_size > 0)
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{
sge[0] = {
.addr = 0,
.length = 0,
.lkey = rc->ctx->mr->lkey,
};
try_send_rdma_wr(rc, cl->peer_fd, sge, 1);
}
else
return true;
}
return true;
}
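// On the fragment size used above: op_max = max_sge * bs_bitmap_granularity, so with
// the typical 4 KiB bitmap granularity and the 30 s/g entries implied by the
// ConnectX-4 comment this is 30 * 4096 = 122880 bytes (~120 KiB) per posted work
// request; both sides fragment with the same limit, which is why each send is
// guaranteed to fit into one pre-posted receive.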
void osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
// Two different algorithms for outgoing and incoming operations
while (cl->outbox.size() > 0)
{
osd_op_t *cur_op = cl->outbox[0].op;
if (cur_op->op_type == OSD_OP_OUT)
{
// Pick a queue. Send operation to it in one part.
int qi;
for (qi = 0; qi < cl->rdma_queues.size() && cl->rdma_queues[qi]->cur_op != NULL; qi++) {}
if (qi >= cl->rdma_queues.size())
{
// No free queues, retry later.
// We only post 1 operation per queue to use the queue pair number as a 'tag'.
return;
}
// Pick all entries for the operation from the queue
int op_list_size = 0;
while (op_list_size < cl->outbox.size() && cl->outbox[op_list_size].op == cur_op)
{
op_list_size++;
}
auto rq = cl->rdma_queues[qi];
rq->cur_op = cur_op;
ibv_sge sge[rq->max_sge];
// FIXME: This won't work with long bitmaps; good enough for the current RDMA tests
// header or header+data
sge[0] = {
.addr = (uintptr_t)cl->send_list[0].iov_base,
.length = (uint32_t)cl->send_list[0].iov_len,
.lkey = rq->ctx->mr->lkey,
};
if (op_list_size == 2)
{
auto & iov = cl->send_list[1];
sge[1] = {
.addr = (uintptr_t)iov.iov_base,
.length = (uint32_t)iov.iov_len,
.lkey = rq->ctx->mr->lkey,
};
try_send_rdma_wr(rq, cl->peer_fd, sge, 2);
}
else if (op_list_size == 1)
{
try_send_rdma_wr(rq, cl->peer_fd, sge, 1);
}
else
{
printf("unexpected long send_list for opcode %lu: %lu entries\n", cur_op->req.hdr.opcode, cl->send_list.size());
exit(1);
}
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+op_list_size);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+op_list_size);
// Post a receive request for the reply at the same time
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
cur_op->req.hdr.opcode == OSD_OP_READ)
{
try_recv_rdma_read(cl, rq, cur_op, bs_bitmap_granularity);
}
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{
assert(!cur_op->iov.count);
// FIXME: hardcode
#define clean_entry_bitmap_size 4
// Reply size is known
uint64_t data_size = cur_op->req.sec_read_bmp.len / sizeof(obj_ver_id) * (8 + clean_entry_bitmap_size);
cur_op->rmw_buf = malloc_or_die(data_size);
sge[0] = {
.addr = (uintptr_t)cur_op->reply.buf,
.length = (uint32_t)OSD_PACKET_SIZE,
.lkey = rq->ctx->mr->lkey,
};
sge[1] = {
.addr = (uintptr_t)cur_op->rmw_buf,
.length = (uint32_t)data_size,
.lkey = rq->ctx->mr->lkey,
};
try_recv_rdma_wr(rq, cl->peer_fd, sge, 2);
}
else
{
// No reply or reply size is unknown
sge[0] = {
.addr = (uintptr_t)cur_op->reply.buf,
.length = (uint32_t)OSD_PACKET_SIZE,
.lkey = rq->ctx->mr->lkey,
};
try_recv_rdma_wr(rq, cl->peer_fd, sge, 1);
}
}
else
{
// Send reply to the same queue the operation came from.
// Fragment it into parts no longer than (max_sge*4k) to always
// be able to send and receive them correctly.
int qi;
for (qi = 0; qi < cl->rdma_queues.size() && cl->rdma_queues[qi]->cur_op != cur_op; qi++) {}
if (qi >= cl->rdma_queues.size())
{
printf("Unknown incoming operation for client %d\n", cl->peer_fd);
exit(1);
}
// Pick all entries for the operation from the queue
int op_list_size = 0;
while (op_list_size < cl->outbox.size() && cl->outbox[op_list_size].op == cur_op)
{
op_list_size++;
}
auto rq = cl->rdma_queues[qi];
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
cur_op->req.hdr.opcode == OSD_OP_READ)
{
try_send_rdma_read(cl, rq, cur_op, op_list_size, bs_bitmap_granularity);
rq->send_pos = 0;
rq->send_buf_pos = 0;
}
else if (op_list_size == 1)
{
ibv_sge sge[1];
sge[0] = {
.addr = (uintptr_t)cl->send_list[0].iov_base,
.length = (uint32_t)cl->send_list[0].iov_len,
.lkey = rq->ctx->mr->lkey,
};
try_send_rdma_wr(rq, cl->peer_fd, sge, 1);
}
else if (op_list_size == 2)
{
ibv_sge sge[2];
sge[0] = {
.addr = (uintptr_t)cl->send_list[0].iov_base,
.length = (uint32_t)cl->send_list[0].iov_len,
.lkey = rq->ctx->mr->lkey,
};
sge[1] = {
.addr = (uintptr_t)cl->send_list[1].iov_base,
.length = (uint32_t)cl->send_list[1].iov_len,
.lkey = rq->ctx->mr->lkey,
};
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
try_send_rdma_wr(rq, cl->peer_fd, sge, 2);
else
{
try_send_rdma_wr(rq, cl->peer_fd, sge, 1);
try_send_rdma_wr(rq, cl->peer_fd, sge+1, 1);
}
}
else if (op_list_size > 2)
{
printf("Unexpected long send_list for opcode %lu: %lu entries\n", cur_op->req.hdr.opcode, cl->send_list.size());
exit(1);
}
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+op_list_size);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+op_list_size);
}
}
}
// Try to receive an incoming operation via RDMA
void osd_messenger_t::try_recv_rdma(osd_client_t *cl, msgr_rdma_connection_t *rc)
{
rc->cur_op = new osd_op_t;
rc->cur_op->peer_fd = cl->peer_fd;
rc->cur_op->op_type = OSD_OP_IN;
rc->cur_op->buf = memalign_or_die(MEM_ALIGNMENT, 128*1024); // FIXME hardcode for tests
ibv_sge sge[2];
sge[0] = {
.addr = (uintptr_t)rc->cur_op->req.buf,
.length = (uint32_t)OSD_PACKET_SIZE,
.lkey = rc->ctx->mr->lkey,
};
sge[1] = {
.addr = (uintptr_t)rc->cur_op->buf,
.length = (uint32_t)128*1024,
.lkey = rc->ctx->mr->lkey,
};
try_recv_rdma_wr(rc, cl->peer_fd, sge, 2);
}
#define RDMA_EVENTS_AT_ONCE 32
void osd_messenger_t::handle_rdma_events()
{
// Request next notification
ibv_cq *ev_cq;
void *ev_ctx;
// FIXME: This is inefficient as it calls read()...
timespec tv;
if (ibv_get_cq_event(rdma_context->channel, &ev_cq, &ev_ctx) == 0)
{
ibv_ack_cq_events(rdma_context->cq, 1);
}
if (ibv_req_notify_cq(rdma_context->cq, 0) != 0)
{
printf("Failed to request RDMA completion notification, exiting\n");
exit(1);
}
ibv_wc wc[RDMA_EVENTS_AT_ONCE];
int event_count;
do
{
event_count = ibv_poll_cq(rdma_context->cq, RDMA_EVENTS_AT_ONCE, wc);
for (int i = 0; i < event_count; i++)
{
int client_id = wc[i].wr_id;
bool is_send = wc[i].opcode == IBV_WC_SEND;
auto cl_it = clients.find(client_id);
if (cl_it == clients.end())
{
continue;
}
osd_client_t *cl = cl_it->second;
if (wc[i].status != IBV_WC_SUCCESS)
{
printf("RDMA work request failed for client %d", client_id);
if (cl->osd_num)
{
printf(" (OSD %lu)", cl->osd_num);
}
printf(" with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id);
continue;
}
int q;
for (q = 0; q < cl->rdma_queues.size() && cl->rdma_queues[q]->qp->qp_num != wc[i].qp_num; q++) {}
if (q >= cl->rdma_queues.size())
{
printf("Unknown queue %d for client %d\n", wc[i].qp_num, cl->peer_fd);
exit(1);
}
auto rc = cl->rdma_queues[q];
if (is_send)
{
clock_gettime(CLOCK_REALTIME, &tv);
printf("%lu.%09lu Done RDMA send on queue %d\n", tv.tv_sec, tv.tv_nsec, wc[i].qp_num);
}
else
{
clock_gettime(CLOCK_REALTIME, &tv);
printf("%lu.%09lu Done RDMA recv on queue %d, %d bytes\n", tv.tv_sec, tv.tv_nsec, wc[i].qp_num, wc[i].byte_len);
}
if (!is_send)
{
rc->cur_recv--;
if (!rc->cur_recv)
{
// All posted receive WRs for this operation have completed, handle it now
if (rc->cur_op->op_type == OSD_OP_IN)
{
if (wc[i].byte_len <= OSD_PACKET_SIZE)
{
free(rc->cur_op->buf);
rc->cur_op->buf = NULL;
}
cl->received_ops.push_back(rc->cur_op);
set_immediate.push_back([this, op = rc->cur_op]() { exec_op(op); });
}
else /* if (rc->cur_op->op_type == OSD_OP_OUT) */
{
if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_READ ||
rc->cur_op->reply.hdr.opcode == OSD_OP_READ)
{
// Data is already received
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
handle_reply_ready(rc->cur_op);
rc->cur_op = NULL;
try_send_rdma(cl);
}
else if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_READ_BMP)
{
// Data is already received, but we need to switch buffers
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
free(rc->cur_op->buf);
rc->cur_op->buf = rc->cur_op->rmw_buf;
handle_reply_ready(rc->cur_op);
rc->cur_op = NULL;
try_send_rdma(cl);
}
else if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_LIST && rc->cur_op->reply.hdr.retval > 0 ||
rc->cur_op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && rc->cur_op->reply.hdr.retval > 0)
{
if (rc->recv_pos != 1)
{
// Data is not received yet (RNR)
uint32_t len;
if (rc->cur_op->reply.hdr.opcode == OSD_OP_SEC_LIST)
len = sizeof(obj_ver_id) * rc->cur_op->reply.hdr.retval;
else
len = rc->cur_op->reply.hdr.retval;
rc->cur_op->buf = malloc_or_die(len);
ibv_sge sge[1];
sge[0] = {
.addr = (uintptr_t)rc->cur_op->buf,
.length = len,
.lkey = rc->ctx->mr->lkey,
};
try_recv_rdma_wr(rc, cl->peer_fd, sge, 1);
rc->recv_pos = 1;
}
else
{
// Done
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
handle_reply_ready(rc->cur_op);
rc->cur_op = NULL;
rc->recv_pos = 0;
try_send_rdma(cl);
}
}
else
{
// No data
cl->sent_ops.erase(rc->cur_op->req.hdr.id);
handle_reply_ready(rc->cur_op);
rc->cur_op = NULL;
try_send_rdma(cl);
}
}
}
}
else
{
rc->cur_send--;
if (!rc->cur_send)
{
if (rc->cur_op->op_type == OSD_OP_OUT)
{
// Nothing
}
else /* if (rc->cur_op->op_type == OSD_OP_IN) */
{
// Reply fully sent
delete rc->cur_op;
rc->cur_op = NULL;
// Post receive for the next incoming op
try_recv_rdma(cl, rc);
}
}
}
}
} while (event_count > 0);
for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
}
int osd_messenger_t::get_rdma_max_sge()
{
return rdma_max_sge;
}
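The FIXME in handle_rdma_events() refers to ibv_get_cq_event() issuing a blocking read() on the completion channel. A common way around that (a sketch only, not part of this patch; the set_fd_handler() callback signature is inferred from its other uses in this diff) is to switch the channel's file descriptor to non-blocking mode and drive handle_rdma_events() from the same epoll loop as the TCP sockets:

#include <fcntl.h>
#include <infiniband/verbs.h>

// Make the completion channel pollable: ibv_get_cq_event() then returns -1 with
// errno == EAGAIN instead of blocking when no completion event is pending.
static void make_channel_nonblocking(ibv_comp_channel *channel)
{
    int flags = fcntl(channel->fd, F_GETFL, 0);
    fcntl(channel->fd, F_SETFL, flags | O_NONBLOCK);
}

// Hypothetical wiring inside the messenger (tfd is the timerfd/epoll manager):
//   make_channel_nonblocking(rdma_context->channel);
//   tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int fd, int events)
//   {
//       handle_rdma_events();
//   });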

56
src/msgr_rdma.h Normal file
View File

@@ -0,0 +1,56 @@
#pragma once
#include <infiniband/verbs.h>
#include <string>
#include <vector>
struct osd_op_t;
struct msgr_rdma_address_t
{
ibv_gid gid;
uint16_t lid;
uint32_t qpn;
uint32_t psn;
std::string to_string();
static bool from_string(const char *str, msgr_rdma_address_t *dest);
};
struct msgr_rdma_context_t
{
ibv_context *context = NULL;
ibv_device *dev = NULL;
ibv_device_attr_ex attrx;
ibv_pd *pd = NULL;
ibv_mr *mr = NULL;
ibv_comp_channel *channel = NULL;
ibv_cq *cq = NULL;
ibv_port_attr portinfo;
uint8_t ib_port;
uint8_t gid_index;
uint16_t my_lid;
ibv_gid my_gid;
uint32_t mtu;
int max_cqe = 0;
int used_max_cqe = 0;
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu);
~msgr_rdma_context_t();
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
ibv_qp *qp = NULL;
msgr_rdma_address_t addr;
int max_send = 0, max_recv = 0, max_sge = 0;
int cur_send = 0, cur_recv = 0;
osd_op_t *cur_op = NULL;
int send_pos = 0, send_buf_pos = 0;
int recv_pos = 0, recv_buf_pos = 0;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge);
int connect(msgr_rdma_address_t *dest);
};
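Taken together with connect_rdma_client()/connect_rdma_server() in msgr_rdma.cpp, the intended lifecycle of these structures is roughly the following. This is a sketch with placeholder values (device, queue count, WR depths, MTU are illustrative, not defaults from this patch):

msgr_rdma_context_t *ctx = msgr_rdma_context_t::create(NULL, 1, 0, 4096); // default device, port 1, gid 0
std::vector<msgr_rdma_connection_t*> queues;
std::vector<std::string> local_addresses;
for (int i = 0; i < 8; i++)
{
    auto rc = msgr_rdma_connection_t::create(ctx, 16, 16, 30); // max_send, max_recv, max_sge
    queues.push_back(rc);
    local_addresses.push_back(rc->addr.to_string());
}
// local_addresses are exchanged with the peer during the configuration (SHOW_CONFIG)
// handshake, then each local queue is connected to the matching remote address:
std::string remote_address_string; // received from the peer
msgr_rdma_address_t remote;
if (msgr_rdma_address_t::from_string(remote_address_string.c_str(), &remote))
    queues[0]->connect(&remote); // returns 0 on success, as checked in connect_rdma_server()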

View File

@@ -207,20 +207,26 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
     else if (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
         cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
     {
-        if (cur_op->req.sec_rw.attr_len > 0)
+        if (cur_op->req.sec_rw.bitmap_len > 0)
         {
-            if (cur_op->req.sec_rw.attr_len > sizeof(unsigned))
-                cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len);
+            if (cur_op->req.sec_rw.bitmap_len > sizeof(void*))
+                cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.bitmap_len);
             else
                 cur_op->bitmap = &cur_op->bmp_data;
-            cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
+            if (cur_op->req.sec_rw.bitmap_len <= 8)
+                memcpy(cur_op->bitmap, &cur_op->req.sec_rw.bitmap, cur_op->req.sec_rw.bitmap_len);
+            else
+            {
+                cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.bitmap_len);
+                cl->read_remaining += cur_op->req.sec_rw.bitmap_len;
+            }
         }
         if (cur_op->req.sec_rw.len > 0)
         {
             cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
             cl->recv_list.push_back(cur_op->buf, cur_op->req.sec_rw.len);
+            cl->read_remaining += cur_op->req.sec_rw.len;
         }
-        cl->read_remaining = cur_op->req.sec_rw.len + cur_op->req.sec_rw.attr_len;
     }
     else if (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
         cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
@@ -254,6 +260,16 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
         }
         cl->read_remaining = cur_op->req.rw.len;
     }
+    else if (cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
+    {
+        if (cur_op->req.show_conf.json_len > 0)
+        {
+            cur_op->buf = malloc_or_die(cur_op->req.show_conf.json_len+1);
+            ((uint8_t*)cur_op->buf)[cur_op->req.show_conf.json_len] = 0;
+            cl->recv_list.push_back(cur_op->buf, cur_op->req.show_conf.json_len);
+        }
+        cl->read_remaining = cur_op->req.show_conf.json_len;
+    }
     if (cl->read_remaining > 0)
     {
         // Read data
@@ -285,7 +301,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
     if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
     {
         // Read data. In this case we assume that the buffer is preallocated by the caller (!)
-        unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len);
+        unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.bitmap_len : op->reply.rw.bitmap_len);
         unsigned expected_size = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len);
         if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
         {
@@ -299,14 +315,24 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
         if (op->reply.hdr.retval >= 0 && bmp_len > 0)
         {
             assert(op->bitmap);
-            cl->recv_list.push_back(op->bitmap, bmp_len);
+            if (bmp_len <= 8)
+            {
+                memcpy(op->bitmap, (op->reply.hdr.opcode == OSD_OP_SEC_READ
+                    ? &op->reply.sec_rw.bitmap
+                    : &op->reply.rw.bitmap), bmp_len);
+            }
+            else
+            {
+                cl->recv_list.push_back(op->bitmap, bmp_len);
+                cl->read_remaining += bmp_len;
+            }
         }
         if (op->reply.hdr.retval > 0)
         {
             assert(op->iov.count > 0);
             cl->recv_list.append(op->iov);
+            cl->read_remaining += op->reply.hdr.retval;
         }
-        cl->read_remaining = op->reply.hdr.retval + bmp_len;
         if (cl->read_remaining == 0)
         {
             goto reuse;
@@ -332,16 +358,17 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
         cl->read_op = op;
         cl->read_state = CL_READ_REPLY_DATA;
         cl->read_remaining = op->reply.hdr.retval;
+        free(op->buf);
         op->buf = memalign_or_die(MEM_ALIGNMENT, cl->read_remaining);
         cl->recv_list.push_back(op->buf, cl->read_remaining);
     }
     else if (op->reply.hdr.opcode == OSD_OP_SHOW_CONFIG && op->reply.hdr.retval > 0)
     {
-        assert(!op->iov.count);
         delete cl->read_op;
         cl->read_op = op;
         cl->read_state = CL_READ_REPLY_DATA;
         cl->read_remaining = op->reply.hdr.retval;
+        free(op->buf);
         op->buf = malloc_or_die(op->reply.hdr.retval);
         cl->recv_list.push_back(op->buf, op->reply.hdr.retval);
     }
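This file is the receive side of the inline-bitmap change: with the default 128 KiB block size and 4 KiB bitmap granularity the per-object bitmap is 128Ki/4Ki/8 = 4 bytes, so it fits into the new 64-bit header field and no longer needs a separate 4-byte buffer read; only bitmaps longer than 8 bytes are still received as an extra buffer, and read_remaining is now accumulated per buffer. The rule in isolation (a sketch, not code from the patch):

// unpacking a received bitmap, mirroring handle_op_hdr()/handle_reply_hdr() above
if (bitmap_len <= 8)
    memcpy(op->bitmap, &hdr_bitmap_field, bitmap_len); // inline, nothing more to read
else
{
    cl->recv_list.push_back(op->bitmap, bitmap_len);   // long bitmaps still follow the header
    cl->read_remaining += bitmap_len;
}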

View File

@@ -46,27 +46,41 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
         to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
         cl->sent_ops[cur_op->req.hdr.id] = cur_op;
     }
-    to_outbox.push_back(NULL);
+    to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
     // Bitmap
     if (cur_op->op_type == OSD_OP_IN &&
         cur_op->req.hdr.opcode == OSD_OP_SEC_READ &&
-        cur_op->reply.sec_rw.attr_len > 0)
+        cur_op->reply.sec_rw.bitmap_len > 0)
     {
-        to_send_list.push_back((iovec){
-            .iov_base = cur_op->bitmap,
-            .iov_len = cur_op->reply.sec_rw.attr_len,
-        });
-        to_outbox.push_back(NULL);
+        if (cur_op->reply.sec_rw.bitmap_len <= 8)
+        {
+            memcpy(&cur_op->reply.sec_rw.bitmap, cur_op->bitmap, cur_op->reply.sec_rw.bitmap_len);
+        }
+        else
+        {
+            to_send_list.push_back((iovec){
+                .iov_base = cur_op->bitmap,
+                .iov_len = cur_op->reply.sec_rw.bitmap_len,
+            });
+            to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_BMP });
+        }
     }
     else if (cur_op->op_type == OSD_OP_OUT &&
         (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
-        cur_op->req.sec_rw.attr_len > 0)
+        cur_op->req.sec_rw.bitmap_len > 0)
     {
-        to_send_list.push_back((iovec){
-            .iov_base = cur_op->bitmap,
-            .iov_len = cur_op->req.sec_rw.attr_len,
-        });
-        to_outbox.push_back(NULL);
+        if (cur_op->req.sec_rw.bitmap_len <= 8)
+        {
+            memcpy(&cur_op->req.sec_rw.bitmap, cur_op->bitmap, cur_op->req.sec_rw.bitmap_len);
+        }
+        else
+        {
+            to_send_list.push_back((iovec){
+                .iov_base = cur_op->bitmap,
+                .iov_len = cur_op->req.sec_rw.bitmap_len,
+            });
+            to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_BMP });
+        }
     }
     // Operation data
     if ((cur_op->op_type == OSD_OP_IN
@@ -78,13 +92,14 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
         cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
         cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ||
         cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
-        cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
+        cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
+        cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)) && cur_op->iov.count > 0)
     {
         for (int i = 0; i < cur_op->iov.count; i++)
         {
             assert(cur_op->iov.buf[i].iov_base);
             to_send_list.push_back(cur_op->iov.buf[i]);
-            to_outbox.push_back(NULL);
+            to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
         }
     }
     if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
@@ -93,13 +108,19 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
             to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
         else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
             to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
-        to_outbox.push_back(NULL);
+        to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
     }
     if (cur_op->op_type == OSD_OP_IN)
     {
-        // To free it later
-        to_outbox[to_outbox.size()-1] = cur_op;
+        to_outbox[to_outbox.size()-1].flags |= MSGR_SENDP_FREE;
     }
+#ifdef WITH_RDMA
+    if (cl->peer_state == PEER_RDMA)
+    {
+        try_send_rdma(cl);
+        return;
+    }
+#endif
     if (!ringloop)
     {
         // FIXME: It's worse because it doesn't allow batching
@@ -232,10 +253,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
             iovec & iov = cl->send_list[done];
             if (iov.iov_len <= result)
             {
-                if (cl->outbox[done])
+                if (cl->outbox[done].flags & MSGR_SENDP_FREE)
                 {
                     // Reply fully sent
-                    delete cl->outbox[done];
+                    delete cl->outbox[done].op;
                 }
                 result -= iov.iov_len;
                 done++;
@@ -260,6 +281,16 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
             cl->next_outbox.clear();
         }
         cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
+#ifdef WITH_RDMA
+        if (cl->peer_state == PEER_RDMA_CONNECTING && cl->rdma_queues.size() > 0 && !cl->outbox.size())
+        {
+            // FIXME: Do something better than just forgetting the FD
+            // FIXME: Ignore pings during RDMA state transition
+            printf("Successfully connected with client %d using RDMA\n", cl->peer_fd);
+            cl->peer_state = PEER_RDMA;
+            tfd->set_fd_handler(cl->peer_fd, false, NULL);
+        }
+#endif
     }
     if (cl->write_state != 0)
     {
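The outbox entries are no longer bare osd_op_t pointers: each msgr_sendp_t records which operation an iovec belongs to and what it is (header, bitmap, or payload), which is what the RDMA path uses to group send_list entries per operation and what handle_send() uses to know when a reply can be freed. The struct itself is not shown in this compare view; presumably it is defined in the messenger header roughly as follows (an assumption, including the flag values):

// assumed shape of msgr_sendp_t, not taken from this diff
#define MSGR_SENDP_HDR  1 // entry is an operation header
#define MSGR_SENDP_BMP  2 // entry is a bitmap following the header
#define MSGR_SENDP_FREE 4 // delete the op once this entry is fully sent
struct msgr_sendp_t
{
    osd_op_t *op;
    int flags;
};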

View File

@@ -122,6 +122,13 @@ void osd_messenger_t::stop_client(int peer_fd, bool force)
     // And close the FD only when everything is done
     // ...because peer_fd number can get reused after close()
     close(peer_fd);
+#ifdef WITH_RDMA
+    for (auto rdma_conn: cl->rdma_queues)
+    {
+        delete rdma_conn;
+    }
+    cl->rdma_queues.resize(0);
+#endif
 #endif
     // Find the item again because it can be invalidated at this point
     it = clients.find(peer_fd);

View File

@@ -26,7 +26,10 @@ const char *exe_name = NULL;
 class nbd_proxy
 {
 protected:
+    std::string image_name;
     uint64_t inode = 0;
+    uint64_t device_size = 0;
+    inode_watch_t *watch = NULL;
     ring_loop_t *ringloop = NULL;
     epoll_manager_t *epmgr = NULL;
@@ -111,9 +114,9 @@ public:
 {
     printf(
         "Vitastor NBD proxy\n"
-        "(c) Vitaliy Filippov, 2020 (VNPL-1.1)\n\n"
+        "(c) Vitaliy Filippov, 2020-2021 (VNPL-1.1)\n\n"
         "USAGE:\n"
-        "  %s map --etcd_address <etcd_address> --pool <pool> --inode <inode> --size <size in bytes>\n"
+        "  %s map --etcd_address <etcd_address> (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)\n"
         "  %s unmap /dev/nbd0\n"
         "  %s list [--json]\n",
         exe_name, exe_name, exe_name
@@ -148,21 +151,49 @@ public:
fprintf(stderr, "etcd_address is missing\n"); fprintf(stderr, "etcd_address is missing\n");
exit(1); exit(1);
} }
if (!cfg["size"].uint64_value()) if (cfg["image"].string_value() != "")
{ {
fprintf(stderr, "device size is missing\n"); // Use image name
exit(1); image_name = cfg["image"].string_value();
inode = 0;
} }
inode = cfg["inode"].uint64_value(); else
uint64_t pool = cfg["pool"].uint64_value();
if (pool)
{ {
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS)); // Use pool, inode number and size
if (!cfg["size"].uint64_value())
{
fprintf(stderr, "device size is missing\n");
exit(1);
}
device_size = cfg["size"].uint64_value();
inode = cfg["inode"].uint64_value();
uint64_t pool = cfg["pool"].uint64_value();
if (pool)
{
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool << (64-POOL_ID_BITS));
}
if (!(inode >> (64-POOL_ID_BITS)))
{
fprintf(stderr, "pool is missing\n");
exit(1);
}
} }
if (!(inode >> (64-POOL_ID_BITS))) // Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
if (!inode)
{ {
fprintf(stderr, "pool is missing\n"); // Load image metadata
exit(1); while (!cli->is_ready())
{
ringloop->loop();
if (cli->is_ready())
break;
ringloop->wait();
}
watch = cli->st_cli.watch_inode(image_name);
device_size = watch->cfg.size;
} }
// Initialize NBD // Initialize NBD
int sockfd[2]; int sockfd[2];
@@ -176,7 +207,7 @@ public:
     load_module();
     if (!cfg["dev_num"].is_null())
     {
-        if (run_nbd(sockfd, cfg["dev_num"].int64_value(), cfg["size"].uint64_value(), NBD_FLAG_SEND_FLUSH, 30) < 0)
+        if (run_nbd(sockfd, cfg["dev_num"].int64_value(), device_size, NBD_FLAG_SEND_FLUSH, 30) < 0)
         {
             perror("run_nbd");
             exit(1);
@@ -188,7 +219,7 @@ public:
         int i = 0;
         while (true)
         {
-            int r = run_nbd(sockfd, i, cfg["size"].uint64_value(), NBD_FLAG_SEND_FLUSH, 30);
+            int r = run_nbd(sockfd, i, device_size, NBD_FLAG_SEND_FLUSH, 30);
             if (r == 0)
             {
                 printf("/dev/nbd%d\n", i);
@@ -215,10 +246,6 @@ public:
     {
         daemonize();
     }
-    // Create client
-    ringloop = new ring_loop_t(512);
-    epmgr = new epoll_manager_t(ringloop);
-    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
     // Initialize read state
     read_state = CL_READ_HDR;
     recv_buf = malloc_or_die(receive_buffer_size);
@@ -242,6 +269,7 @@ public:
         ringloop->loop();
         ringloop->wait();
     }
+    // FIXME: Cleanup when exiting
 }

 void load_module()
@@ -610,7 +638,7 @@ protected:
     if (req_type == NBD_CMD_READ || req_type == NBD_CMD_WRITE)
     {
         op->opcode = req_type == NBD_CMD_READ ? OSD_OP_READ : OSD_OP_WRITE;
-        op->inode = inode;
+        op->inode = inode ? inode : watch->cfg.num;
         op->offset = be64toh(cur_req.from);
         op->len = be32toh(cur_req.len);
         buf = malloc_or_die(sizeof(nbd_reply) + op->len);
@@ -657,7 +685,15 @@ protected:
             }
             else
             {
-                cli->execute(cur_op);
+                if (cur_op->opcode == OSD_OP_WRITE && watch->cfg.readonly)
+                {
+                    cur_op->retval = -EROFS;
+                    std::function<void(cluster_op_t*)>(cur_op->callback)(cur_op);
+                }
+                else
+                {
+                    cli->execute(cur_op);
+                }
                 cur_op = NULL;
                 cur_buf = &cur_req;
                 cur_left = sizeof(nbd_request);
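With these changes the proxy no longer needs the pool/inode/size triple when an image name is given: it creates the cluster client first, waits until it is ready, resolves the name and size through watch_inode(), and rejects writes to images marked read-only with -EROFS. Assuming the binary is installed as vitastor-nbd, a typical invocation would then look something like the following (address and image name are placeholders):

    vitastor-nbd map --etcd_address 192.168.7.2:2379/v3 --image testimg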

View File

@@ -45,11 +45,11 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
print_slow(); print_slow();
}); });
c_cli.tfd = this->tfd; msgr.tfd = this->tfd;
c_cli.ringloop = this->ringloop; msgr.ringloop = this->ringloop;
c_cli.exec_op = [this](osd_op_t *op) { exec_op(op); }; msgr.exec_op = [this](osd_op_t *op) { exec_op(op); };
c_cli.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); }; msgr.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
c_cli.init(); msgr.init();
init_cluster(); init_cluster();
@@ -80,10 +80,11 @@ void osd_t::parse_config(blockstore_config_t & config)
     osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
     if (!osd_num)
         throw std::runtime_error("osd_num is required in the configuration");
-    c_cli.osd_num = osd_num;
+    msgr.osd_num = osd_num;
     run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
     no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
     no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
+    allow_test_ops = config["allow_test_ops"] == "true" || config["allow_test_ops"] == "1" || config["allow_test_ops"] == "yes";
     // Cluster configuration
     bind_address = config["bind_address"];
     if (bind_address == "")
@@ -121,7 +122,7 @@ void osd_t::parse_config(blockstore_config_t & config)
slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10); slow_log_interval = strtoull(config["slow_log_interval"].c_str(), NULL, 10);
if (!slow_log_interval) if (!slow_log_interval)
slow_log_interval = 10; slow_log_interval = 10;
c_cli.parse_config(json_config); msgr.parse_config(json_config);
} }
void osd_t::bind_socket() void osd_t::bind_socket()
@@ -174,7 +175,7 @@ void osd_t::bind_socket()
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events) epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
{ {
c_cli.accept_connections(listen_fd); msgr.accept_connections(listen_fd);
}); });
} }
@@ -191,8 +192,8 @@ bool osd_t::shutdown()
void osd_t::loop() void osd_t::loop()
{ {
handle_peers(); handle_peers();
c_cli.read_requests(); msgr.read_requests();
c_cli.send_replies(); msgr.send_replies();
ringloop->submit(); ringloop->submit();
} }
@@ -276,7 +277,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
void osd_t::reset_stats() void osd_t::reset_stats()
{ {
c_cli.stats = { 0 }; msgr.stats = { 0 };
prev_stats = { 0 }; prev_stats = { 0 };
memset(recovery_stat_count, 0, sizeof(recovery_stat_count)); memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes)); memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
@@ -286,11 +287,11 @@ void osd_t::print_stats()
{ {
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{ {
if (c_cli.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING) if (msgr.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
{ {
uint64_t avg = (c_cli.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(c_cli.stats.op_stat_count[i] - prev_stats.op_stat_count[i]); uint64_t avg = (msgr.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(msgr.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
uint64_t bw = (c_cli.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval; uint64_t bw = (msgr.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
if (c_cli.stats.op_stat_bytes[i] != 0) if (msgr.stats.op_stat_bytes[i] != 0)
{ {
printf( printf(
"[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg, "[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
@@ -302,19 +303,19 @@ void osd_t::print_stats()
{ {
printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg); printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
} }
prev_stats.op_stat_count[i] = c_cli.stats.op_stat_count[i]; prev_stats.op_stat_count[i] = msgr.stats.op_stat_count[i];
prev_stats.op_stat_sum[i] = c_cli.stats.op_stat_sum[i]; prev_stats.op_stat_sum[i] = msgr.stats.op_stat_sum[i];
prev_stats.op_stat_bytes[i] = c_cli.stats.op_stat_bytes[i]; prev_stats.op_stat_bytes[i] = msgr.stats.op_stat_bytes[i];
} }
} }
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{ {
if (c_cli.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i]) if (msgr.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
{ {
uint64_t avg = (c_cli.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(c_cli.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]); uint64_t avg = (msgr.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(msgr.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg); printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
prev_stats.subop_stat_count[i] = c_cli.stats.subop_stat_count[i]; prev_stats.subop_stat_count[i] = msgr.stats.subop_stat_count[i];
prev_stats.subop_stat_sum[i] = c_cli.stats.subop_stat_sum[i]; prev_stats.subop_stat_sum[i] = msgr.stats.subop_stat_sum[i];
} }
} }
for (int i = 0; i < 2; i++) for (int i = 0; i < 2; i++)
@@ -351,7 +352,7 @@ void osd_t::print_slow()
char alloc[1024]; char alloc[1024];
timespec now; timespec now;
clock_gettime(CLOCK_REALTIME, &now); clock_gettime(CLOCK_REALTIME, &now);
for (auto & kv: c_cli.clients) for (auto & kv: msgr.clients)
{ {
for (auto op: kv.second->received_ops) for (auto op: kv.second->received_ops)
{ {

View File

@@ -104,7 +104,7 @@ class osd_t
     int bind_port, listen_backlog;
     // FIXME: Implement client queue depth limit
     int client_queue_depth = 128;
-    bool allow_test_ops = true;
+    bool allow_test_ops = false;
     int print_stats_interval = 3;
     int slow_log_interval = 10;
     int immediate_commit = IMMEDIATE_NONE;
@@ -116,7 +116,7 @@ class osd_t
     // cluster state
     etcd_state_client_t st_cli;
-    osd_messenger_t c_cli;
+    osd_messenger_t msgr;
     int etcd_failed_attempts = 0;
     std::string etcd_lease_id;
     json11::Json self_state;

View File

@@ -104,7 +104,7 @@ void osd_t::parse_test_peer(std::string peer)
{ "addresses", json11::Json::array { addr } }, { "addresses", json11::Json::array { addr } },
{ "port", port }, { "port", port },
}; };
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]); msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
} }
json11::Json osd_t::get_osd_state() json11::Json osd_t::get_osd_state()
@@ -146,16 +146,16 @@ json11::Json osd_t::get_statistics()
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{ {
op_stats[osd_op_names[i]] = json11::Json::object { op_stats[osd_op_names[i]] = json11::Json::object {
{ "count", c_cli.stats.op_stat_count[i] }, { "count", msgr.stats.op_stat_count[i] },
{ "usec", c_cli.stats.op_stat_sum[i] }, { "usec", msgr.stats.op_stat_sum[i] },
{ "bytes", c_cli.stats.op_stat_bytes[i] }, { "bytes", msgr.stats.op_stat_bytes[i] },
}; };
} }
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{ {
subop_stats[osd_op_names[i]] = json11::Json::object { subop_stats[osd_op_names[i]] = json11::Json::object {
{ "count", c_cli.stats.subop_stat_count[i] }, { "count", msgr.stats.subop_stat_count[i] },
{ "usec", c_cli.stats.subop_stat_sum[i] }, { "usec", msgr.stats.subop_stat_sum[i] },
}; };
} }
st["op_stats"] = op_stats; st["op_stats"] = op_stats;
@@ -298,9 +298,9 @@ void osd_t::report_statistics()
void osd_t::on_change_osd_state_hook(osd_num_t peer_osd) void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
{ {
if (c_cli.wanted_peers.find(peer_osd) != c_cli.wanted_peers.end()) if (msgr.wanted_peers.find(peer_osd) != msgr.wanted_peers.end())
{ {
c_cli.connect_peer(peer_osd, st_cli.peer_states[peer_osd]); msgr.connect_peer(peer_osd, st_cli.peer_states[peer_osd]);
} }
} }
@@ -695,9 +695,9 @@ void osd_t::apply_pg_config()
// Add peers // Add peers
for (auto pg_osd: all_peers) for (auto pg_osd: all_peers)
{ {
if (pg_osd != this->osd_num && c_cli.osd_peer_fds.find(pg_osd) == c_cli.osd_peer_fds.end()) if (pg_osd != this->osd_num && msgr.osd_peer_fds.find(pg_osd) == msgr.osd_peer_fds.end())
{ {
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]); msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
} }
} }
start_pg_peering(pg); start_pg_peering(pg);

View File

@@ -82,10 +82,10 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
else else
{ {
printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval)); printf("Error while doing flush on OSD %lu: %d (%s)\n", osd_num, retval, strerror(-retval));
auto fd_it = c_cli.osd_peer_fds.find(peer_osd); auto fd_it = msgr.osd_peer_fds.find(peer_osd);
if (fd_it != c_cli.osd_peer_fds.end()) if (fd_it != msgr.osd_peer_fds.end())
{ {
c_cli.stop_client(fd_it->second); msgr.stop_client(fd_it->second);
} }
return; return;
} }
@@ -188,7 +188,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
else else
{ {
// Peer // Peer
int peer_fd = c_cli.osd_peer_fds[peer_osd]; int peer_fd = msgr.osd_peer_fds[peer_osd];
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->iov.push_back(op->buf, count * sizeof(obj_ver_id)); op->iov.push_back(op->buf, count * sizeof(obj_ver_id));
op->peer_fd = peer_fd; op->peer_fd = peer_fd;
@@ -196,7 +196,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
.sec_stab = { .sec_stab = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++, .id = msgr.next_subop_id++,
.opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE), .opcode = (uint64_t)(rollback ? OSD_OP_SEC_ROLLBACK : OSD_OP_SEC_STABILIZE),
}, },
.len = count * sizeof(obj_ver_id), .len = count * sizeof(obj_ver_id),
@@ -207,7 +207,7 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval); handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval);
delete op; delete op;
}; };
c_cli.outbox_push(op); msgr.outbox_push(op);
} }
} }

View File

@@ -35,7 +35,7 @@
 #define MEM_ALIGNMENT 512
 #endif
 #define OSD_RW_MAX 64*1024*1024
-#define OSD_PROTOCOL_VERSION 1
+#define OSD_PROTOCOL_VERSION 2
 // common request and reply headers
 struct __attribute__((__packed__)) osd_op_header_t
@@ -74,8 +74,10 @@ struct __attribute__((__packed__)) osd_op_sec_rw_t
     // length
     uint32_t len;
     // bitmap/attribute length - bitmap comes after header, but before data
-    uint32_t attr_len;
+    uint32_t bitmap_len;
     uint32_t pad0;
+    // inline bitmap (when it's no longer than 8 bytes)
+    uint64_t bitmap;
 };
 struct __attribute__((__packed__)) osd_reply_sec_rw_t
@@ -84,8 +86,10 @@ struct __attribute__((__packed__)) osd_reply_sec_rw_t
     // for reads and writes: assigned or read version number
     uint64_t version;
     // for reads: bitmap/attribute length (just to double-check)
-    uint32_t attr_len;
+    uint32_t bitmap_len;
     uint32_t pad0;
+    // inline bitmap (when it's no longer than 8 bytes)
+    uint64_t bitmap;
 };
 // delete object on the secondary OSD
@@ -148,6 +152,8 @@ struct __attribute__((__packed__)) osd_reply_sec_read_bmp_t
 struct __attribute__((__packed__)) osd_op_show_config_t
 {
     osd_op_header_t header;
+    // JSON request length
+    uint64_t json_len;
 };
 struct __attribute__((__packed__)) osd_reply_show_config_t
@@ -197,6 +203,8 @@ struct __attribute__((__packed__)) osd_reply_rw_t
     // for reads: bitmap length
     uint32_t bitmap_len;
     uint32_t pad0;
+    // inline bitmap (when it's no longer than 8 bytes)
+    uint64_t bitmap;
 };
 // sync to the primary OSD
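osd_ops.h is where the wire format actually changes, hence the OSD_PROTOCOL_VERSION bump to 2: the secondary read/write structures and the primary read reply gain a 64-bit inline bitmap field, and SHOW_CONFIG gains json_len so a JSON payload (used above for the RDMA address exchange) can follow the request header. A worked size example tying this to the RDMA code earlier in the diff, using the 4-byte clean_entry_bitmap_size hardcoded there:

// SEC_READ_BMP reply size, as computed in try_send_rdma():
// data_size = (req.sec_read_bmp.len / sizeof(obj_ver_id)) * (8 + clean_entry_bitmap_size)
// e.g. a request covering 16 object versions expects 16 * (8 + 4) = 192 reply bytes,
// which the client pre-allocates into rmw_buf before posting the receive WR.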

View File

@@ -156,7 +156,7 @@ void osd_t::start_pg_peering(pg_t & pg)
if (immediate_commit != IMMEDIATE_ALL) if (immediate_commit != IMMEDIATE_ALL)
{ {
std::vector<int> to_stop; std::vector<int> to_stop;
for (auto & cp: c_cli.clients) for (auto & cp: msgr.clients)
{ {
if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end()) if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
{ {
@@ -165,7 +165,7 @@ void osd_t::start_pg_peering(pg_t & pg)
} }
for (auto peer_fd: to_stop) for (auto peer_fd: to_stop)
{ {
c_cli.stop_client(peer_fd); msgr.stop_client(peer_fd);
} }
} }
// Calculate current write OSD set // Calculate current write OSD set
@@ -175,7 +175,7 @@ void osd_t::start_pg_peering(pg_t & pg)
for (int role = 0; role < pg.target_set.size(); role++) for (int role = 0; role < pg.target_set.size(); role++)
{ {
pg.cur_set[role] = pg.target_set[role] == this->osd_num || pg.cur_set[role] = pg.target_set[role] == this->osd_num ||
c_cli.osd_peer_fds.find(pg.target_set[role]) != c_cli.osd_peer_fds.end() ? pg.target_set[role] : 0; msgr.osd_peer_fds.find(pg.target_set[role]) != msgr.osd_peer_fds.end() ? pg.target_set[role] : 0;
if (pg.cur_set[role] != 0) if (pg.cur_set[role] != 0)
{ {
pg.pg_cursize++; pg.pg_cursize++;
@@ -199,7 +199,7 @@ void osd_t::start_pg_peering(pg_t & pg)
{ {
found = false; found = false;
if (history_osd == this->osd_num || if (history_osd == this->osd_num ||
c_cli.osd_peer_fds.find(history_osd) != c_cli.osd_peer_fds.end()) msgr.osd_peer_fds.find(history_osd) != msgr.osd_peer_fds.end())
{ {
found = true; found = true;
break; break;
@@ -223,13 +223,13 @@ void osd_t::start_pg_peering(pg_t & pg)
std::set<osd_num_t> cur_peers; std::set<osd_num_t> cur_peers;
for (auto pg_osd: pg.all_peers) for (auto pg_osd: pg.all_peers)
{ {
if (pg_osd == this->osd_num || c_cli.osd_peer_fds.find(pg_osd) != c_cli.osd_peer_fds.end()) if (pg_osd == this->osd_num || msgr.osd_peer_fds.find(pg_osd) != msgr.osd_peer_fds.end())
{ {
cur_peers.insert(pg_osd); cur_peers.insert(pg_osd);
} }
else if (c_cli.wanted_peers.find(pg_osd) == c_cli.wanted_peers.end()) else if (msgr.wanted_peers.find(pg_osd) == msgr.wanted_peers.end())
{ {
c_cli.connect_peer(pg_osd, st_cli.peer_states[pg_osd]); msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
} }
} }
pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end()); pg.cur_peers.insert(pg.cur_peers.begin(), cur_peers.begin(), cur_peers.end());
@@ -325,7 +325,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
else else
{ {
// Peer // Peer
auto & cl = c_cli.clients.at(c_cli.osd_peer_fds[role_osd]); auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]);
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = cl->peer_fd; op->peer_fd = cl->peer_fd;
@@ -333,7 +333,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
.sec_sync = { .sec_sync = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++, .id = msgr.next_subop_id++,
.opcode = OSD_OP_SEC_SYNC, .opcode = OSD_OP_SEC_SYNC,
}, },
}, },
@@ -347,14 +347,14 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
int fail_fd = op->peer_fd; int fail_fd = op->peer_fd;
ps->list_ops.erase(role_osd); ps->list_ops.erase(role_osd);
delete op; delete op;
c_cli.stop_client(fail_fd); msgr.stop_client(fail_fd);
return; return;
} }
delete op; delete op;
ps->list_ops.erase(role_osd); ps->list_ops.erase(role_osd);
submit_list_subop(role_osd, ps); submit_list_subop(role_osd, ps);
}; };
c_cli.outbox_push(op); msgr.outbox_push(op);
ps->list_ops[role_osd] = op; ps->list_ops[role_osd] = op;
} }
} }
@@ -404,12 +404,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
// Peer // Peer
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = c_cli.osd_peer_fds[role_osd]; op->peer_fd = msgr.osd_peer_fds[role_osd];
op->req = (osd_any_op_t){ op->req = (osd_any_op_t){
.sec_list = { .sec_list = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++, .id = msgr.next_subop_id++,
.opcode = OSD_OP_SEC_LIST, .opcode = OSD_OP_SEC_LIST,
}, },
.list_pg = ps->pg_num, .list_pg = ps->pg_num,
@@ -427,7 +427,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
int fail_fd = op->peer_fd; int fail_fd = op->peer_fd;
ps->list_ops.erase(role_osd); ps->list_ops.erase(role_osd);
delete op; delete op;
c_cli.stop_client(fail_fd); msgr.stop_client(fail_fd);
return; return;
} }
printf( printf(
@@ -444,7 +444,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
ps->list_ops.erase(role_osd); ps->list_ops.erase(role_osd);
delete op; delete op;
}; };
c_cli.outbox_push(op); msgr.outbox_push(op);
ps->list_ops[role_osd] = op; ps->list_ops[role_osd] = op;
} }
} }

View File

@@ -235,7 +235,10 @@ resume_2:
     {
         reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
     }
-    cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+    if (cur_op->reply.rw.bitmap_len <= 8)
+        memcpy(&cur_op->reply.rw.bitmap, op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+    else
+        cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
     for (int role = 0; role < op_data->pg_size; role++)
     {
         if (stripes[role].req_end != 0)
@@ -250,7 +253,10 @@ resume_2:
     }
     else
     {
-        cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+        if (cur_op->reply.rw.bitmap_len <= 8)
+            memcpy(&cur_op->reply.rw.bitmap, op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+        else
+            cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
         cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
     }
     finish_op(cur_op, cur_op->req.rw.len);

View File

@@ -236,14 +236,14 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
// Send to a remote OSD // Send to a remote OSD
osd_op_t *subop = op_data->subops+subop_idx; osd_op_t *subop = op_data->subops+subop_idx;
subop->op_type = OSD_OP_OUT; subop->op_type = OSD_OP_OUT;
subop->peer_fd = c_cli.osd_peer_fds.at(subop_osd_num); subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num);
// FIXME: Use the pre-allocated buffer // FIXME: Use the pre-allocated buffer
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev)); subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
subop->req = (osd_any_op_t){ subop->req = (osd_any_op_t){
.sec_read_bmp = { .sec_read_bmp = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++, .id = msgr.next_subop_id++,
.opcode = OSD_OP_SEC_READ_BMP, .opcode = OSD_OP_SEC_READ_BMP,
}, },
.len = sizeof(obj_ver_id)*(i+1-prev), .len = sizeof(obj_ver_id)*(i+1-prev),
@@ -273,7 +273,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
} }
handle_primary_subop(subop, cur_op); handle_primary_subop(subop, cur_op);
}; };
c_cli.outbox_push(subop); msgr.outbox_push(subop);
subop_idx++; subop_idx++;
} }
prev = i+1; prev = i+1;

View File

@@ -87,14 +87,14 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
else else
{ {
// FIXME add separate magic number for primary ops // FIXME add separate magic number for primary ops
auto cl_it = c_cli.clients.find(cur_op->peer_fd); auto cl_it = msgr.clients.find(cur_op->peer_fd);
if (cl_it != c_cli.clients.end()) if (cl_it != msgr.clients.end())
{ {
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC; cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id; cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode; cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = retval; cur_op->reply.hdr.retval = retval;
c_cli.outbox_push(cur_op); msgr.outbox_push(cur_op);
} }
else else
{ {
@@ -184,13 +184,13 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
else else
{ {
subop->op_type = OSD_OP_OUT; subop->op_type = OSD_OP_OUT;
subop->peer_fd = c_cli.osd_peer_fds.at(role_osd_num); subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num);
subop->bitmap = stripes[stripe_num].bmp_buf; subop->bitmap = stripes[stripe_num].bmp_buf;
subop->bitmap_len = clean_entry_bitmap_size; subop->bitmap_len = clean_entry_bitmap_size;
subop->req.sec_rw = { subop->req.sec_rw = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = c_cli.next_subop_id++, .id = msgr.next_subop_id++,
.opcode = (uint64_t)(wr ? (rep ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ), .opcode = (uint64_t)(wr ? (rep ? OSD_OP_SEC_WRITE_STABLE : OSD_OP_SEC_WRITE) : OSD_OP_SEC_READ),
}, },
.oid = { .oid = {
@@ -200,7 +200,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
.version = op_version, .version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start, .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.attr_len = wr ? clean_entry_bitmap_size : 0, .bitmap_len = wr ? clean_entry_bitmap_size : 0,
}; };
#ifdef OSD_DEBUG #ifdef OSD_DEBUG
printf( printf(
@@ -227,7 +227,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
{ {
handle_primary_subop(subop, cur_op); handle_primary_subop(subop, cur_op);
}; };
c_cli.outbox_push(subop); msgr.outbox_push(subop);
} }
i++; i++;
} }
@@ -282,20 +282,20 @@ void osd_t::add_bs_subop_stats(osd_op_t *subop)
uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode]; uint64_t opcode = bs_op_to_osd_op[subop->bs_op->opcode];
timespec tv_end; timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end); clock_gettime(CLOCK_REALTIME, &tv_end);
c_cli.stats.op_stat_count[opcode]++; msgr.stats.op_stat_count[opcode]++;
if (!c_cli.stats.op_stat_count[opcode]) if (!msgr.stats.op_stat_count[opcode])
{ {
c_cli.stats.op_stat_count[opcode] = 1; msgr.stats.op_stat_count[opcode] = 1;
c_cli.stats.op_stat_sum[opcode] = 0; msgr.stats.op_stat_sum[opcode] = 0;
c_cli.stats.op_stat_bytes[opcode] = 0; msgr.stats.op_stat_bytes[opcode] = 0;
} }
c_cli.stats.op_stat_sum[opcode] += ( msgr.stats.op_stat_sum[opcode] += (
(tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 + (tv_end.tv_sec - subop->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000 (tv_end.tv_nsec - subop->tv_begin.tv_nsec)/1000
); );
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE) if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE)
{ {
c_cli.stats.op_stat_bytes[opcode] += subop->bs_op->len; msgr.stats.op_stat_bytes[opcode] += subop->bs_op->len;
} }
} }
@@ -322,7 +322,7 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
         if (subop->peer_fd >= 0)
         {
             // Drop connection on any error
-            c_cli.stop_client(subop->peer_fd);
+            msgr.stop_client(subop->peer_fd);
         }
     }
     else
@@ -332,8 +332,8 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
         {
             uint64_t version = subop->reply.sec_rw.version;
 #ifdef OSD_DEBUG
-            uint64_t peer_osd = c_cli.clients.find(subop->peer_fd) != c_cli.clients.end()
-                ? c_cli.clients[subop->peer_fd]->osd_num : osd_num;
+            uint64_t peer_osd = msgr.clients.find(subop->peer_fd) != msgr.clients.end()
+                ? msgr.clients[subop->peer_fd]->osd_num : osd_num;
             printf("subop %lu from osd %lu: version = %lu\n", opcode, peer_osd, version);
 #endif
             if (op_data->fact_ver != UINT64_MAX)
@@ -465,11 +465,11 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
         else
         {
             subops[i].op_type = OSD_OP_OUT;
-            subops[i].peer_fd = c_cli.osd_peer_fds.at(chunk.osd_num);
+            subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num);
             subops[i].req = (osd_any_op_t){ .sec_del = {
                 .header = {
                     .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
+                    .id = msgr.next_subop_id++,
                     .opcode = OSD_OP_SEC_DELETE,
                 },
                 .oid = chunk.oid,
@@ -479,7 +479,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
             {
                 handle_primary_subop(subop, cur_op);
             };
-            c_cli.outbox_push(&subops[i]);
+            msgr.outbox_push(&subops[i]);
         }
     }
@@ -509,14 +509,14 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
             });
             bs->enqueue_op(subops[i].bs_op);
         }
-        else if ((peer_it = c_cli.osd_peer_fds.find(sync_osd)) != c_cli.osd_peer_fds.end())
+        else if ((peer_it = msgr.osd_peer_fds.find(sync_osd)) != msgr.osd_peer_fds.end())
         {
             subops[i].op_type = OSD_OP_OUT;
             subops[i].peer_fd = peer_it->second;
             subops[i].req = (osd_any_op_t){ .sec_sync = {
                 .header = {
                     .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
+                    .id = msgr.next_subop_id++,
                     .opcode = OSD_OP_SEC_SYNC,
                 },
             } };
@@ -524,7 +524,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
             {
                 handle_primary_subop(subop, cur_op);
             };
-            c_cli.outbox_push(&subops[i]);
+            msgr.outbox_push(&subops[i]);
         }
         else
         {
@@ -569,11 +569,11 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
         else
         {
             subops[i].op_type = OSD_OP_OUT;
-            subops[i].peer_fd = c_cli.osd_peer_fds.at(stab_osd.osd_num);
+            subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num);
             subops[i].req = (osd_any_op_t){ .sec_stab = {
                 .header = {
                     .magic = SECONDARY_OSD_OP_MAGIC,
-                    .id = c_cli.next_subop_id++,
+                    .id = msgr.next_subop_id++,
                     .opcode = OSD_OP_SEC_STABILIZE,
                 },
                 .len = (uint64_t)(stab_osd.len * sizeof(obj_ver_id)),
@@ -583,7 +583,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
             {
                 handle_primary_subop(subop, cur_op);
             };
-            c_cli.outbox_push(&subops[i]);
+            msgr.outbox_push(&subops[i]);
         }
     }
 }
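
All hunks in this file follow one dispatch pattern for remote sub-operations: mark the op as outgoing, look the peer's socket up by OSD number, fill the secondary-op header with the magic value, a fresh next_subop_id and the opcode, install a callback that funnels completion into handle_primary_subop(), and queue the op with msgr.outbox_push(). The toy model below restates that pattern in compilable form; every type and name in it is invented for the sketch and only mirrors the structure of the diff.

#include <cstdint>
#include <functional>
#include <map>

// Self-contained toy model of the dispatch pattern repeated above. All names are
// stand-ins; only the sequence of steps mirrors the real code.
static const uint64_t TOY_OP_MAGIC = 0x544F59;   // stand-in for SECONDARY_OSD_OP_MAGIC

struct toy_subop
{
    uint64_t magic = 0, id = 0, opcode = 0;
    int peer_fd = -1;
    std::function<void(toy_subop*)> callback;
};

struct toy_messenger
{
    std::map<uint64_t, int> osd_peer_fds;        // OSD number -> connected socket fd
    uint64_t next_subop_id = 1;
    void outbox_push(toy_subop *op) { /* would serialize the op and queue it on op->peer_fd */ }
};

void send_subop(toy_messenger & msgr, toy_subop *subop, uint64_t peer_osd, uint64_t opcode,
    std::function<void(toy_subop*)> handle_completion)
{
    subop->peer_fd = msgr.osd_peer_fds.at(peer_osd);   // throws if the peer is not connected
    subop->magic = TOY_OP_MAGIC;
    subop->id = msgr.next_subop_id++;
    subop->opcode = opcode;                            // read/write/delete/sync/stabilize in the real code
    subop->callback = handle_completion;               // one completion path, like handle_primary_subop()
    msgr.outbox_push(subop);
}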

View File

@@ -247,8 +247,8 @@ resume_8:
 finish:
     if (cur_op->peer_fd)
     {
-        auto it = c_cli.clients.find(cur_op->peer_fd);
-        if (it != c_cli.clients.end())
+        auto it = msgr.clients.find(cur_op->peer_fd);
+        if (it != msgr.clients.end())
             it->second->dirty_pgs.clear();
     }
     finish_op(cur_op, 0);

View File

@@ -370,8 +370,8 @@ lazy:
     }
     // Remember PG as dirty to drop the connection when PG goes offline
     // (this is required because of the "lazy sync")
-    auto cl_it = c_cli.clients.find(cur_op->peer_fd);
-    if (cl_it != c_cli.clients.end())
+    auto cl_it = msgr.clients.find(cur_op->peer_fd);
+    if (cl_it != msgr.clients.end())
     {
         cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
     }
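
The two hunks above are the two halves of the same bookkeeping: a write handled on the "lazy sync" path marks the PG dirty in the issuing client's dirty_pgs set, and a completed sync clears that set, so a connection still holding unsynced writes can be dropped if its PG goes offline. A minimal self-contained model of that lifecycle, with all names invented for the sketch, could look like this:

#include <cstdint>
#include <set>

// Toy model of the dirty-PG tracking shown above; not the project's real types.
struct pg_id
{
    uint64_t pool_id, pg_num;
    bool operator<(const pg_id & other) const
    {
        return pool_id < other.pool_id || (pool_id == other.pool_id && pg_num < other.pg_num);
    }
};

struct toy_client
{
    std::set<pg_id> dirty_pgs;
};

// A write taken on the lazy-sync path marks the PG dirty for this client...
void on_lazy_write(toy_client & cl, pg_id pg) { cl.dirty_pgs.insert(pg); }
// ...a completed sync clears the whole set (as in the "finish:" hunk above)...
void on_sync_done(toy_client & cl) { cl.dirty_pgs.clear(); }
// ...and if a PG goes offline first, clients still holding it dirty must be dropped.
bool must_drop_client(const toy_client & cl, pg_id pg) { return cl.dirty_pgs.count(pg) > 0; }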

View File

@@ -20,9 +20,9 @@ void osd_t::secondary_op_callback(osd_op_t *op)
     if (op->req.hdr.opcode == OSD_OP_SEC_READ)
     {
         if (op->bs_op->retval >= 0)
-            op->reply.sec_rw.attr_len = clean_entry_bitmap_size;
+            op->reply.sec_rw.bitmap_len = clean_entry_bitmap_size;
         else
-            op->reply.sec_rw.attr_len = 0;
+            op->reply.sec_rw.bitmap_len = 0;
         if (op->bs_op->retval > 0)
             op->iov.push_back(op->buf, op->bs_op->retval);
     }
@@ -81,7 +81,7 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
     if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
     {
         // Allocate memory for the read operation
-        if (clean_entry_bitmap_size > sizeof(unsigned))
+        if (clean_entry_bitmap_size > sizeof(void*))
            cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(clean_entry_bitmap_size);
         else
             cur_op->bitmap = &cur_op->bmp_data;
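
The allocation threshold changes from sizeof(unsigned) to sizeof(void*) because a bitmap that fits into a pointer-sized field is now kept inline in the operation's bmp_data member instead of being heap-allocated, so small reads need no extra malloc/free. Below is a hedged, self-contained sketch of the same small-buffer pattern; the struct and helpers are invented for the example, and the real code uses malloc_or_die() and releases rmw_buf elsewhere.

#include <cstdlib>
#include <cstdint>
#include <cstddef>

struct tiny_op
{
    void *bitmap = nullptr;    // points either at bmp_data or at a heap buffer
    void *rmw_buf = nullptr;   // set only when the heap path is taken
    uintptr_t bmp_data = 0;    // pointer-sized inline storage for small bitmaps
};

void alloc_bitmap(tiny_op *op, size_t bitmap_size)
{
    if (bitmap_size > sizeof(void*))
        op->bitmap = op->rmw_buf = malloc(bitmap_size);   // large bitmap: separate buffer
    else
        op->bitmap = &op->bmp_data;                       // small bitmap: no allocation at all
}

void free_bitmap(tiny_op *op)
{
    // Only the heap path needs a free(); the inline path has nothing to release.
    if (op->rmw_buf)
        free(op->rmw_buf);
    op->rmw_buf = op->bitmap = nullptr;
}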
@@ -144,10 +144,49 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
 void osd_t::exec_show_config(osd_op_t *cur_op)
 {
-    // FIXME: Send the real config, not its source
-    auto cfg_copy = config;
-    cfg_copy["protocol_version"] = std::to_string(OSD_PROTOCOL_VERSION);
-    std::string cfg_str = json11::Json(cfg_copy).dump();
+    std::string json_err;
+    json11::Json req_json = cur_op->req.show_conf.json_len > 0
+        ? json11::Json::parse(std::string((char *)cur_op->buf), json_err)
+        : json11::Json();
+    // Expose sensitive configuration values so peers can check them
+    json11::Json::object wire_config = json11::Json::object {
+        { "osd_num", osd_num },
+        { "protocol_version", OSD_PROTOCOL_VERSION },
+        { "block_size", (uint64_t)bs_block_size },
+        { "bitmap_granularity", (uint64_t)bs_bitmap_granularity },
+        { "primary_enabled", run_primary },
+        { "blockstore_enabled", bs ? true : false },
+        { "readonly", readonly },
+        { "immediate_commit", (immediate_commit == IMMEDIATE_ALL ? "all" :
+            (immediate_commit == IMMEDIATE_SMALL ? "small" : "none")) },
+        { "lease_timeout", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 },
+    };
+#ifdef WITH_RDMA
+    if (msgr.is_rdma_enabled())
+    {
+        // Indicate that RDMA is enabled
+        wire_config["rdma_enabled"] = true;
+        if (req_json["rdma_queues"].array_items().size())
+        {
+            // Peer is trying to connect using RDMA, try to satisfy him
+            auto cl = msgr.clients.at(cur_op->peer_fd);
+            bool ok = msgr.connect_rdma_client(cl, req_json["rdma_queues"], req_json["rdma_max_sge"].uint64_value());
+            if (ok)
+            {
+                json11::Json::array rdma_queues;
+                for (auto rdma_conn: cl->rdma_queues)
+                {
+                    rdma_queues.push_back(rdma_conn->addr.to_string());
+                }
+                wire_config["rdma_queues"] = rdma_queues;
+                wire_config["rdma_max_sge"] = msgr.get_rdma_max_sge();
+            }
+        }
+    }
+#endif
+    if (cur_op->buf)
+        free(cur_op->buf);
+    std::string cfg_str = json11::Json(wire_config).dump();
     cur_op->buf = malloc_or_die(cfg_str.size()+1);
     memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
     cur_op->iov.push_back(cur_op->buf, cfg_str.size()+1);
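
exec_show_config() now answers with a JSON object carrying the values a peer actually needs to verify compatibility (protocol_version, block_size, bitmap_granularity, immediate_commit and so on) and, when built WITH_RDMA, tries to satisfy an RDMA handshake requested via rdma_queues. The sketch below shows how a connecting peer might validate such a reply with json11; the function name, the include path and the chosen checks are assumptions for illustration, only the field names come from the handler above (uint64_value() is the accessor already used in this diff).

#include <string>
#include "json11/json11.hpp"   // include path assumed for this sketch

// Hedged sketch of the receiving side: validate the JSON produced by exec_show_config().
static bool check_peer_config(const std::string & reply_text,
    uint64_t expected_protocol, uint64_t my_block_size, uint64_t my_bitmap_granularity)
{
    std::string err;
    json11::Json cfg = json11::Json::parse(reply_text, err);
    if (err != "")
        return false;                                   // not valid JSON at all
    if (cfg["protocol_version"].uint64_value() != expected_protocol)
        return false;                                   // incompatible wire protocol
    if (cfg["block_size"].uint64_value() != my_block_size ||
        cfg["bitmap_granularity"].uint64_value() != my_bitmap_granularity)
        return false;                                   // data layout mismatch
    // A non-empty cfg["rdma_queues"] would mean the OSD accepted an RDMA handshake
    return true;
}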

View File

@@ -3,6 +3,7 @@
 export KEEP_DATA=1
 . `dirname $0`/common.sh
+etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/mon/master
 etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/pg/state
 etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/osd/state

View File

@@ -39,6 +39,12 @@ if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitast
     sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
 fi
+# A lot of parallel syncs was crashing the primary OSD at some point
+LD_PRELOAD=libasan.so.5 \
+    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -numjobs=64 -iodepth=1 -fsync=1 \
+    -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -number_ios=100
 LD_PRELOAD=libasan.so.5 \
     fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10