Compare commits: kv-debug...zerocopy-t
1 commit: 64bbf121b6

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
 
 project(vitastor)
 
-set(VERSION "1.2.0")
+set(VERSION "1.1.0")
 
 add_subdirectory(src)

@@ -1,4 +1,4 @@
-VERSION ?= v1.2.0
+VERSION ?= v1.1.0
 
 all: build push
 

@@ -49,7 +49,7 @@ spec:
 capabilities:
 add: ["SYS_ADMIN"]
 allowPrivilegeEscalation: true
-image: vitalif/vitastor-csi:v1.2.0
+image: vitalif/vitastor-csi:v1.1.0
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"

@@ -121,7 +121,7 @@ spec:
 privileged: true
 capabilities:
 add: ["SYS_ADMIN"]
-image: vitalif/vitastor-csi:v1.2.0
+image: vitalif/vitastor-csi:v1.1.0
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"

@@ -5,7 +5,7 @@ package vitastor
 
 const (
 vitastorCSIDriverName = "csi.vitastor.io"
-vitastorCSIDriverVersion = "1.2.0"
+vitastorCSIDriverVersion = "1.1.0"
 )
 
 // Config struct fills the parameters of request or user input

debian/changelog (vendored, 4 lines changed)

@@ -1,10 +1,10 @@
-vitastor (1.2.0-1) unstable; urgency=medium
+vitastor (1.1.0-1) unstable; urgency=medium
 
 * Bugfixes
 
 -- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
 
-vitastor (1.2.0-1) unstable; urgency=medium
+vitastor (1.1.0-1) unstable; urgency=medium
 
 * Implement NFS proxy
 * Add documentation

debian/vitastor.Dockerfile (vendored, 8 lines changed)

@@ -35,8 +35,8 @@ RUN set -e -x; \
 mkdir -p /root/packages/vitastor-$REL; \
 rm -rf /root/packages/vitastor-$REL/*; \
 cd /root/packages/vitastor-$REL; \
-cp -r /root/vitastor vitastor-1.2.0; \
-cd vitastor-1.2.0; \
+cp -r /root/vitastor vitastor-1.1.0; \
+cd vitastor-1.1.0; \
 ln -s /root/fio-build/fio-*/ ./fio; \
 FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \

@@ -49,8 +49,8 @@ RUN set -e -x; \
 rm -rf a b; \
 echo "dep:fio=$FIO" > debian/fio_version; \
 cd /root/packages/vitastor-$REL; \
-tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
-cd vitastor-1.2.0; \
+tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.1.0.orig.tar.xz vitastor-1.1.0; \
+cd vitastor-1.1.0; \
 V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
 DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
 DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

@@ -20,7 +20,6 @@ between clients, OSDs and etcd.
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
-- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)

@@ -69,14 +68,11 @@ but they are not connected to the cluster.
 - Type: string
 
 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
-ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
-
-Versions up to Vitastor 1.2.0 required ODP which is only present in
-Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
-
-Run `ibv_devinfo -v` as root to list available RDMA devices and their
-features.
+"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
+Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
+to work. For example, Mellanox ConnectX-3 and older adapters don't have
+Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
+root to list available RDMA devices and their features.
 
 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to

@@ -151,28 +147,6 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
 Doesn't affect memory usage - additional memory isn't allocated for send
 operations.
 
-## rdma_odp
-
-- Type: boolean
-- Default: false
-
-Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
-ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
-for RDMA adapter to be able to use it. This, in turn, allows to skip memory
-copying during sending. One would think this should improve performance, but
-**in reality** RDMA performance with ODP is **drastically** worse. Example
-3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
-without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
-
-This happens because Mellanox ODP implementation seems to be based on
-message retransmissions when the adapter doesn't know about the buffer yet -
-it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
-which is generally slow in RDMA/RoCE networks. Here's a presentation about
-it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-ODP support is retained in the code just in case a good ODP implementation
-appears one day.
-
 ## peer_connect_interval
 
 - Type: seconds
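
The replacement wording above says the adapter must support Implicit ODP and that `ibv_devinfo -v` lists device features. For readers who prefer a programmatic check, here is a minimal libibverbs sketch (illustration only, not part of this repository) that prints whether each adapter advertises Implicit ODP:

```
// check_odp.cpp - hedged example, not Vitastor code: list RDMA devices and
// report whether they advertise Implicit ODP (what the docs above require).
// Build with: g++ check_odp.cpp -libverbs -o check_odp
#include <infiniband/verbs.h>
#include <stdio.h>

int main()
{
    int num = 0;
    ibv_device **list = ibv_get_device_list(&num);
    for (int i = 0; i < num; i++)
    {
        ibv_context *ctx = ibv_open_device(list[i]);
        if (!ctx)
            continue;
        ibv_device_attr_ex attrx = {};
        if (ibv_query_device_ex(ctx, NULL, &attrx) == 0)
        {
            bool implicit_odp = attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT;
            printf("%s: max_sge=%d, implicit ODP %s\n", ibv_get_device_name(list[i]),
                attrx.orig_attr.max_sge, implicit_odp ? "yes" : "no");
        }
        ibv_close_device(ctx);
    }
    ibv_free_device_list(list);
    return 0;
}
```

On a ConnectX-3 this would be expected to print "implicit ODP no", matching the statement above that such adapters are unsupported.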

@@ -20,7 +20,6 @@
 - [rdma_max_msg](#rdma_max_msg)
 - [rdma_max_recv](#rdma_max_recv)
 - [rdma_max_send](#rdma_max_send)
-- [rdma_odp](#rdma_odp)
 - [peer_connect_interval](#peer_connect_interval)
 - [peer_connect_timeout](#peer_connect_timeout)
 - [osd_idle_timeout](#osd_idle_timeout)

@@ -72,15 +71,12 @@ RDMA may only be needed if the clients have
 - Type: string
 
 The name of the RDMA device for communication with Vitastor OSDs (for example, "rocep5s0f0").
-Vitastor currently supports all adapter models, including ones that
-don't support ODP, i.e. you can use RDMA with ConnectX-3 and
-with cards not made by Mellanox.
-
-Vitastor versions up to and including 1.2.0 required ODP, which is only
-available on Mellanox ConnectX 4 and newer. See also [rdma_odp](#rdma_odp).
-
-Run `ibv_devinfo -v` as the superuser to see the list of available
-RDMA devices, their parameters and capabilities.
+Keep in mind that RDMA support in Vitastor requires the Implicit On-Demand
+Paging (Implicit ODP) and Scatter/Gather (SG) device features. For example,
+Mellanox ConnectX-3 and older adapters don't support Implicit ODP and
+are therefore not supported by Vitastor. Run `ibv_devinfo -v` as the
+superuser to see the list of available RDMA devices, their parameters
+and capabilities.
 
 Note that if you use RoCE/RoCEv2, you also need to configure
 the switches for it correctly, otherwise you may run into

@@ -159,29 +155,6 @@ the OSDs negotiate the actual value anyway
 Doesn't affect memory consumption - extra memory for send operations
 is not allocated.
 
-## rdma_odp
-
-- Type: boolean (yes/no)
-- Default value: false
-
-Use RDMA with On-Demand Paging. ODP is a feature so far available only
-on Mellanox ConnectX-4 and newer adapters. ODP makes it possible not to
-register memory for the RDMA card to use it. Thanks to this, data doesn't
-have to be copied when sending it to the network and, one would think, this
-should improve performance - but **in fact** it turns out that performance
-only gets worse, and significantly so. Example: on a 3-node cluster with
-8 NVMe per node and a 2*25 Gbit/s network, reads over RDMA without ODP
-reach 3950000 iops, but with ODP only 239000 iops...
-
-This happens because the Mellanox ODP implementation is suboptimal and is
-based on message retransmissions when the card doesn't know the buffer -
-probably on standard "RNR retransmissions" (RNR = receiver not ready).
-And such retransmissions in RDMA/RoCE are always a very slow thing.
-A presentation on this topic from the ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-The ability to use ODP is kept in the code in case a good ODP
-implementation suddenly appears one fine day.
-
 ## peer_connect_interval
 
 - Type: seconds

@@ -30,6 +30,18 @@
 ordinary synchronous send/recv system calls will be used. For OSDs
 this is pointless, since an OSD needs io_uring anyway, but in
 principle it can be used for clients with old kernel versions.
+- name: use_zerocopy_send
+type: bool
+default: false
+info: |
+If true, OSDs and clients will attempt to use TCP zero-copy send
+(MSG_ZEROCOPY) for big buffers. It's recommended to raise net.ipv4.tcp_wmem
+and net.core.wmem_max sysctls when using this mode.
+info_ru: |
+If set to true, OSDs and clients will try to use
+zero-copy TCP send (MSG_ZEROCOPY) for large data buffers.
+It is recommended to raise the net.ipv4.tcp_wmem and net.core.wmem_max
+sysctl values when using this mode.
 - name: use_rdma
 type: bool
 default: true
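
The new `use_zerocopy_send` option above refers to the Linux MSG_ZEROCOPY mechanism. As a rough sketch of the underlying kernel interface (not Vitastor's actual send path; socket setup and most error handling are trimmed, and the helper name is invented for illustration), enabling and using it looks roughly like this:

```
// Minimal MSG_ZEROCOPY illustration - assumes a connected TCP socket `fd`
// and a reasonably recent kernel/glibc. Not Vitastor code.
#include <sys/socket.h>
#include <linux/errqueue.h>
#include <stdio.h>

static int send_zerocopy(int fd, const void *buf, size_t len)
{
    int one = 1;
    // Opt in once per socket (TCP support appeared in Linux 4.14).
    if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
        return -1;
    // The kernel pins `buf` instead of copying it, so the buffer must stay
    // valid until a completion arrives on the socket error queue.
    ssize_t r = send(fd, buf, len, MSG_ZEROCOPY);
    if (r < 0)
        return -1;
    // Reap the completion notification (normally done from the event loop).
    char control[128];
    msghdr msg = {};
    msg.msg_control = control;
    msg.msg_controllen = sizeof(control);
    if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0)
    {
        for (cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
        {
            sock_extended_err *ee = (sock_extended_err*)CMSG_DATA(cm);
            if (ee->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
                printf("completed send range %u..%u\n", ee->ee_info, ee->ee_data);
        }
    }
    return (int)r;
}
```

Because completions arrive asynchronously and pinning only pays off for large transfers, the option text above talks about big buffers and about raising the socket write-buffer sysctls.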

@@ -48,14 +60,11 @@
 type: string
 info: |
 RDMA device name to use for Vitastor OSD communications (for example,
-"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
-ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
-
-Versions up to Vitastor 1.2.0 required ODP which is only present in
-Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
-
-Run `ibv_devinfo -v` as root to list available RDMA devices and their
-features.
+"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
+Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
+to work. For example, Mellanox ConnectX-3 and older adapters don't have
+Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
+root to list available RDMA devices and their features.
 
 Remember that you also have to configure your network switches if you use
 RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to

@@ -64,15 +73,12 @@
 PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
 info_ru: |
 The name of the RDMA device for communication with Vitastor OSDs (for example, "rocep5s0f0").
-Vitastor currently supports all adapter models, including ones that
-don't support ODP, i.e. you can use RDMA with ConnectX-3 and
-with cards not made by Mellanox.
-
-Vitastor versions up to and including 1.2.0 required ODP, which is only
-available on Mellanox ConnectX 4 and newer. See also [rdma_odp](#rdma_odp).
-
-Run `ibv_devinfo -v` as the superuser to see the list of available
-RDMA devices, their parameters and capabilities.
+Keep in mind that RDMA support in Vitastor requires the Implicit On-Demand
+Paging (Implicit ODP) and Scatter/Gather (SG) device features. For example,
+Mellanox ConnectX-3 and older adapters don't support Implicit ODP and
+are therefore not supported by Vitastor. Run `ibv_devinfo -v` as the
+superuser to see the list of available RDMA devices, their parameters
+and capabilities.
 
 Note that if you use RoCE/RoCEv2, you also need to configure
 the switches for it correctly, otherwise you may run into

@@ -166,45 +172,6 @@
 so that the receiving side doesn't run out of receive buffers during operation.
 Doesn't affect memory consumption - extra memory for send operations
 is not allocated.
-- name: rdma_odp
-type: bool
-default: false
-online: false
-info: |
-Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
-ConnectX-4 and newer adapters. ODP allows to not register memory explicitly
-for RDMA adapter to be able to use it. This, in turn, allows to skip memory
-copying during sending. One would think this should improve performance, but
-**in reality** RDMA performance with ODP is **drastically** worse. Example
-3-node cluster with 8 NVMe in each node and 2*25 GBit/s ConnectX-6 RDMA network
-without ODP pushes 3950000 read iops, but only 239000 iops with ODP...
-
-This happens because Mellanox ODP implementation seems to be based on
-message retransmissions when the adapter doesn't know about the buffer yet -
-it likely uses standard "RNR retransmissions" (RNR = receiver not ready)
-which is generally slow in RDMA/RoCE networks. Here's a presentation about
-it from ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-ODP support is retained in the code just in case a good ODP implementation
-appears one day.
-info_ru: |
-Use RDMA with On-Demand Paging. ODP is a feature so far available only
-on Mellanox ConnectX-4 and newer adapters. ODP makes it possible not to
-register memory for the RDMA card to use it. Thanks to this, data doesn't
-have to be copied when sending it to the network and, one would think, this
-should improve performance - but **in fact** it turns out that performance
-only gets worse, and significantly so. Example: on a 3-node cluster with
-8 NVMe per node and a 2*25 Gbit/s network, reads over RDMA without ODP
-reach 3950000 iops, but with ODP only 239000 iops...
-
-This happens because the Mellanox ODP implementation is suboptimal and is
-based on message retransmissions when the card doesn't know the buffer -
-probably on standard "RNR retransmissions" (RNR = receiver not ready).
-And such retransmissions in RDMA/RoCE are always a very slow thing.
-A presentation on this topic from the ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
-
-The ability to use ODP is kept in the code in case a good ODP
-implementation suddenly appears one fine day.
 - name: peer_connect_interval
 type: sec
 min: 1

@@ -17,15 +17,4 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```
 
-After that you'll be able to create PersistentVolumes.
-
-## Features
-
-Vitastor CSI supports:
-- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
-- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
-- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
-- Volume expansion
-- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
-
-Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
+After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).

@@ -17,15 +17,4 @@
 for i in ./???-*.yaml; do kubectl apply -f $i; done
 ```
 
-After this you will be able to create PersistentVolumes.
-
-## Features
-
-The Vitastor CSI plugin supports:
-- Kubernetes versions starting from 1.20 (or from 1.17 for older vitastor-csi <= 1.1.0)
-- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
-- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
-- Volume expansion
-- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [snapshot clone](../../csi/deploy/example-snapshot-clone.yaml)
-
-Don't forget that to use snapshots you must first install the [snapshot controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).
+After this you will be able to create PersistentVolumes. See an example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).

@@ -127,46 +127,19 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
 to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
 exporting QEMU block devices over this protocol using qemu-storage-daemon.
 
-VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
-- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
-- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
-- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
-and block device will continue operation
-- It doesn't seem to have the device number limit
-
-Example performance comparison:
-
-| | direct fio | NBD | VDUSE |
-|----------------------|-------------|-------------|-------------|
-| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
-| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
-| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
-| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
-| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
-| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
+VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
+for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
+hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
+In this case reboot will be the only way to remove VDUSE devices from system.
+
+On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
+performance is important for you. Approximate performance numbers:
+direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
 
 To try VDUSE you need at least Linux 5.15, built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
-
-Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
-use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
-or build modules for Debian kernel manually:
-
-```
-mkdir build
-cd build
-apt-get install linux-headers-`uname -r`
-apt-get build-dep linux-image-`uname -r`-unsigned
-apt-get source linux-image-`uname -r`-unsigned
-cd linux*/drivers/vdpa
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
-cd ../virtio
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-depmod -a
-```
-
-You also need `vdpa` tool from the `iproute2` package.
+(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
+disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
+[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
 
 Commands to attach Vitastor image as a VDUSE device:
 

@@ -179,7 +152,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
 vdpa dev add name test1 mgmtdev vduse
 ```
 
-After running these commands, `/dev/vda` device will appear in the system and you'll be able to
+After running these commands /dev/vda device will appear in the system and you'll be able to
 use it as a normal disk.
 
 To remove the device:

@@ -129,47 +129,19 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
 to the system - VDUSE (vDPA Device in Userspace), and QEMU, starting with version 7.2, supports
 exporting QEMU block devices over this protocol via qemu-storage-daemon.
 
-VDUSE is currently the best interface for attaching Vitastor disks as block
-devices at the kernel level, because:
-- VDUSE doesn't copy data and therefore achieves much better performance than [NBD](nbd.ru.md)
-- It also doesn't have the NBD timeout problem - the device doesn't die if an operation takes too long
-- It also doesn't have the hung device problem - if the handler process dies, it can be
-restarted (!) and the block device will continue to work
-- It apparently has no limit on the number of devices attached to the system
-
-Performance comparison example:
-
-| | Direct fio | NBD | VDUSE |
-|--------------------------|-------------|-------------|-------------|
-| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
-| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
-| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
-| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
-| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
-| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
-
-To try VDUSE you need a Linux kernel of at least version 5.15, built with VDUSE support
-(CONFIG_VIRTIO_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
-
-In Debian Linux kernels the support is disabled by default for now, so to try VDUSE
-on Debian, install a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
-from Proxmox, or build the modules for the Debian kernel manually:
-
-```
-mkdir build
-cd build
-apt-get install linux-headers-`uname -r`
-apt-get build-dep linux-image-`uname -r`-unsigned
-apt-get source linux-image-`uname -r`-unsigned
-cd linux*/drivers/vdpa
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
-cd ../virtio
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
-depmod -a
-```
-
-You will also need the `vdpa` command-line tool from the `iproute2` package.
+VDUSE suffers from the common problem of FUSE-like interfaces in Linux: if the userspace process
+hangs, for example because the connection to the Vitastor cluster is lost, processes reading from or
+writing to the cluster may get stuck in the D state (uninterruptible sleep) and it will be impossible
+to kill them even with kill -9. In this case the only way to remove the device from the system is to reboot.
+
+On the other hand, VDUSE is faster than [NBD](nbd.ru.md), so it may be preferable
+where performance matters more. Rough numbers:
+direct fio testing - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
+
+To use VDUSE you need a Linux kernel of at least version 5.15, built with VDUSE support
+(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). In Debian Linux kernels the support is currently
+disabled - if you want to try this feature on Debian, install a kernel from Ubuntu
+[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or from Proxmox.
 
 Commands for attaching a virtual disk via VDUSE:
 

@@ -182,7 +154,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
 vdpa dev add name test1 mgmtdev vduse
 ```
 
-After this, the `/dev/vda` device will appear in the system and you will be able to use it as
+After this, the /dev/vda device will appear in the system and you will be able to use it as
 a normal disk.
 
 To remove the device from the system:

json11 (submodule, 2 lines changed): Submodule json11 updated: fd37016cf8...52a3af664f

mon/mon.js (40 lines changed)

@@ -65,6 +65,7 @@ const etcd_tree = {
 // client and osd
 tcp_header_buffer_size: 65536,
 use_sync_send_recv: false,
+use_zerocopy_send: false,
 use_rdma: true,
 rdma_device: null, // for example, "rocep5s0f0"
 rdma_port_num: 1,

@@ -403,7 +404,6 @@ class Mon
 this.ws_alive = false;
 this.ws_keepalive_timer = null;
 this.on_stop_cb = () => this.on_stop(0).catch(console.error);
-this.recheck_pgs_active = false;
 }
 
 parse_etcd_addresses(addrs)

@@ -693,27 +693,8 @@ class Mon
 });
 }
 
-// Schedule save_last_clean() to to run after a small timeout (1s) (to not spam etcd)
-schedule_save_last_clean()
-{
-if (!this.save_last_clean_timer)
-{
-this.save_last_clean_timer = setTimeout(() =>
-{
-this.save_last_clean_timer = null;
-this.save_last_clean().catch(this.die);
-}, this.config.mon_change_timeout || 1000);
-}
-}
-
 async save_last_clean()
 {
-if (this.save_last_clean_running)
-{
-this.schedule_save_last_clean();
-return;
-}
-this.save_last_clean_running = true;
 // last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
 const new_clean_pgs = { items: {} };
 next_pool:

@@ -750,7 +731,6 @@ class Mon
 value: b64(JSON.stringify(this.state.history.last_clean_pgs))
 } } ],
 }, this.etcd_start_timeout, 0);
-this.save_last_clean_running = false;
 }
 
 get_mon_state()

@@ -1224,12 +1204,6 @@ class Mon
 
 async recheck_pgs()
 {
-if (this.recheck_pgs_active)
-{
-this.schedule_recheck();
-return;
-}
-this.recheck_pgs_active = true;
 // Take configuration and state, check it against the stored configuration hash
 // Recalculate PGs and save them to etcd if the configuration is changed
 // FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded

@@ -1251,7 +1225,6 @@ class Mon
 // Pool deleted. Delete all PGs, but first stop them.
 if (!await this.stop_all_pgs(pool_id))
 {
-this.recheck_pgs_active = false;
 this.schedule_recheck();
 return;
 }

@@ -1320,16 +1293,9 @@ class Mon
 // PG count changed. Need to bring all PGs down.
 if (!await this.stop_all_pgs(pool_id))
 {
-this.recheck_pgs_active = false;
 this.schedule_recheck();
 return;
 }
-}
-if (prev_pgs.length != pool_cfg.pg_count)
-{
-// Scale PG count
-// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
-// because last_clean_pgs may still contain the old number of PGs
 const new_pg_history = [];
 PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
 pg_history = new_pg_history;

@@ -1431,7 +1397,6 @@ class Mon
 await this.save_pg_config(new_config_pgs);
 }
 }
-this.recheck_pgs_active = false;
 }
 
 async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })

@@ -1481,6 +1446,7 @@ class Mon
 }
 
 // Schedule a recheck to run after a small timeout (1s)
+// If already scheduled, cancel previous timer and schedule it again
 // This is required for multiple change events to trigger at most 1 recheck in 1s
 schedule_recheck()
 {

@@ -1498,7 +1464,7 @@ class Mon
 {
 const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
 const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
-if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
+if (!st || !st.time || !prev || prev.time >= st.time)
 {
 return prev_diff || diff;
 }

@@ -1,6 +1,6 @@
 {
 "name": "vitastor-mon",
-"version": "1.2.0",
+"version": "1.1.0",
 "description": "Vitastor SDS monitor service",
 "main": "mon-main.js",
 "scripts": {

@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils
 
-VERSION = '1.2.0'
+VERSION = '1.1.0'
 
 LOG = logging.getLogger(__name__)
 

@@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.1.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.1.0$(rpm --eval '%dist').tar.gz *

@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
 cd /root/vitastor/rpm; \
 sh build-tarball.sh; \
-cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-1.1.0.el7.tar.gz ~/rpmbuild/SOURCES; \
 cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
 cd ~/rpmbuild/SPECS/; \
 rpmbuild -ba vitastor.spec; \

@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.2.0
+Version: 1.1.0
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage
 
 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.2.0.el7.tar.gz
+Source0: vitastor-1.1.0.el7.tar.gz
 
 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel

@@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
 cd /root/vitastor/rpm; \
 sh build-tarball.sh; \
-cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-1.1.0.el8.tar.gz ~/rpmbuild/SOURCES; \
 cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
 cd ~/rpmbuild/SPECS/; \
 rpmbuild -ba vitastor.spec; \

@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.2.0
+Version: 1.1.0
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage
 
 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.2.0.el8.tar.gz
+Source0: vitastor-1.1.0.el8.tar.gz
 
 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel

@@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
 cd /root/vitastor/rpm; \
 sh build-tarball.sh; \
-cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
+cp /root/vitastor-1.1.0.el9.tar.gz ~/rpmbuild/SOURCES; \
 cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
 cd ~/rpmbuild/SPECS/; \
 rpmbuild -ba vitastor.spec; \

@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.2.0
+Version: 1.1.0
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage
 
 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.2.0.el9.tar.gz
+Source0: vitastor-1.1.0.el9.tar.gz
 
 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel

@@ -16,11 +16,10 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()
 
-add_definitions(-DVERSION="1.2.0")
-add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
-add_link_options(-fno-omit-frame-pointer)
+add_definitions(-DVERSION="1.1.0")
+add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
 if (${WITH_ASAN})
-add_definitions(-fsanitize=address)
+add_definitions(-fsanitize=address -fno-omit-frame-pointer)
 add_link_options(-fsanitize=address -fno-omit-frame-pointer)
 endif (${WITH_ASAN})
 

@@ -181,25 +180,6 @@ target_link_libraries(vitastor-nbd
 vitastor_client
 )
 
-# vitastor-kv
-add_executable(vitastor-kv
-kv_cli.cpp
-kv_db.cpp
-kv_db.h
-)
-target_link_libraries(vitastor-kv
-vitastor_client
-)
-
-add_executable(vitastor-kv-stress
-kv_stress.cpp
-kv_db.cpp
-kv_db.h
-)
-target_link_libraries(vitastor-kv-stress
-vitastor_client
-)
-
 # vitastor-nfs
 add_executable(vitastor-nfs
 nfs_proxy.cpp

@@ -1372,8 +1372,7 @@ bool journal_flusher_co::trim_journal(int wait_base)
 ? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
 .reserved = 0,
 .journal_start = new_trim_pos,
-.version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
-? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
+.version = JOURNAL_VERSION_V2,
 .data_csum_type = bs->dsk.data_csum_type,
 .csum_block_size = bs->dsk.csum_block_size,
 };

@@ -274,7 +274,7 @@ class blockstore_impl_t
 blockstore_dirty_db_t dirty_db;
 std::vector<blockstore_op_t*> submit_queue;
 std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
-int unsynced_big_write_count = 0, unstable_unsynced = 0;
+int unsynced_big_write_count = 0;
 int unsynced_queued_ops = 0;
 allocator *data_alloc = NULL;
 uint8_t *zero_object;

@@ -553,7 +553,7 @@ resume_1:
 }
 if (je_start->size == JE_START_V0_SIZE ||
 (je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
-(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
+(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
 {
 fprintf(
 stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."

@@ -562,8 +562,7 @@ resume_1:
 );
 exit(1);
 }
-if (je_start->version == JOURNAL_VERSION_V1 ||
-je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
+if (je_start->version == JOURNAL_VERSION_V1)
 {
 je_start->data_csum_type = 0;
 je_start->csum_block_size = 0;

@@ -145,7 +145,6 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
 journal.sector_info[journal.cur_sector].offset = journal.next_free;
 journal.in_sector_pos = 0;
 journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
-assert(journal.next_free != journal.used_start);
 memset(journal.inmemory
 ? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
 : (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);

@@ -13,6 +13,12 @@
 #define JOURNAL_BUFFER_SIZE 4*1024*1024
 #define JOURNAL_ENTRY_HEADER_SIZE 16
 
+// We reserve some extra space for future stabilize requests during writes
+// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
+// writing more than can be stabilized afterwards
+#define JOURNAL_STABILIZE_RESERVATION 65536
+#define JOURNAL_INSTANT_RESERVATION 131072
+
 // Journal entries
 // Journal entries are linked to each other by their crc32 value
 // The journal is almost a blockchain, because object versions constantly increase
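
The constants added above feed the journal space checks further down in this diff: the '+' side passes a fixed JOURNAL_STABILIZE_RESERVATION or JOURNAL_INSTANT_RESERVATION to `space_check.check_available(...)`, while the '-' side reserves `(unstable_writes.size()+unstable_unsynced)*journal.block_size`. As a hedged sketch of that difference only (simplified, with invented names; the real logic lives in the blockstore journal code), the two reservation styles compare like this:

```
// Simplified model of the journal space check - illustration only.
#include <cstdint>
#include <cstdio>

struct journal_model
{
    uint64_t len = 16 << 20;   // total journal size
    uint64_t used = 12 << 20;  // bytes currently occupied
    uint64_t block_size = 4096;
};

// Fixed reservation ('+' side): always keep a constant amount free so the
// stabilize entries written after the data still fit.
static bool check_fixed(const journal_model &j, uint64_t need, uint64_t reservation)
{
    return j.len - j.used >= need + reservation;
}

// Scaled reservation ('-' side): reserve one journal block per write that is
// not yet stable, so the reservation grows with the unstable backlog.
static bool check_scaled(const journal_model &j, uint64_t need, uint64_t unstable_writes)
{
    return j.len - j.used >= need + unstable_writes * j.block_size;
}

int main()
{
    journal_model j;
    uint64_t need = 256 * 1024;
    printf("fixed 64K reservation: %d\n", check_fixed(j, need, 65536));
    printf("scaled, 2000 unstable writes: %d\n", check_scaled(j, need, 2000));
    return 0;
}
```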
@@ -86,15 +86,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
|||||||
auto & dirty_entry = dirty_db.at(sbw);
|
auto & dirty_entry = dirty_db.at(sbw);
|
||||||
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
|
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
|
||||||
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
|
||||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
|
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
|
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
|
||||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -185,11 +184,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
mark_stable(dirty_it->first);
|
mark_stable(dirty_it->first);
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
unstable_unsynced--;
|
|
||||||
assert(unstable_unsynced >= 0);
|
|
||||||
}
|
|
||||||
dirty_it++;
|
dirty_it++;
|
||||||
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
|
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
|
||||||
{
|
{
|
||||||
@@ -220,11 +214,6 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
|
|||||||
{
|
{
|
||||||
mark_stable(*it);
|
mark_stable(*it);
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
unstable_unsynced--;
|
|
||||||
assert(unstable_unsynced >= 0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
op->retval = 0;
|
op->retval = 0;
|
||||||
|
@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
|
|||||||
dyn = calloc_or_die(1, dyn_size+sizeof(int));
|
dyn = calloc_or_die(1, dyn_size+sizeof(int));
|
||||||
*((int*)dyn) = 1;
|
*((int*)dyn) = 1;
|
||||||
}
|
}
|
||||||
uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
|
uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
|
||||||
uint64_t version = 1;
|
uint64_t version = 1;
|
||||||
if (dirty_db.size() > 0)
|
if (dirty_db.size() > 0)
|
||||||
{
|
{
|
||||||
@@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
if (!space_check.check_available(op, unsynced_big_write_count + 1,
|
||||||
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
|
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
|
||||||
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
|
(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -386,10 +386,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
|||||||
 sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
 );
 PRIV(op)->pending_ops = 1;
-if (immediate_commit != IMMEDIATE_ALL && !(dirty_it->second.state & BS_ST_INSTANT))
-{
-unstable_unsynced++;
-}
 if (immediate_commit != IMMEDIATE_ALL)
 {
 // Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)

@@ -412,7 +408,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
 || !space_check.check_available(op, 1,
 sizeof(journal_entry_small_write) + dyn_size,
-(unstable_writes.size()+unstable_unsynced)*journal.block_size))
+op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
 {
 return 0;
 }

@@ -503,11 +499,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
 if (journal.next_free >= journal.len)
 {
 journal.next_free = dsk.journal_block_size;
-assert(journal.next_free != journal.used_start);
-}
-if (immediate_commit == IMMEDIATE_NONE && !(dirty_it->second.state & BS_ST_INSTANT))
-{
-unstable_unsynced++;
 }
 if (!PRIV(op)->pending_ops)
 {

@@ -547,7 +538,7 @@ resume_2:
 uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
 blockstore_journal_check_t space_check(this);
 if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
-(unstable_writes.size()+unstable_unsynced)*journal.block_size))
+((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
 {
 return 0;
 }

@@ -591,20 +582,14 @@ resume_4:
 #endif
 bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
 bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
-bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
 if (imm)
 {
 auto & unstab = unstable_writes[op->oid];
 unstab = unstab < op->version ? op->version : unstab;
 }
-else if (!is_instant)
-{
-unstable_unsynced--;
-assert(unstable_unsynced >= 0);
-}
 dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
 | (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
-if (imm && is_instant)
+if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
 {
 // Deletions and 'instant' operations are treated as immediately stable
 mark_stable(dirty_it->first);

@@ -750,7 +735,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
 });
 assert(dirty_it != dirty_db.end());
 blockstore_journal_check_t space_check(this);
-if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
+if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
 {
 return 0;
 }
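Note: the last argument to check_available() in the hunks above is the extra journal space kept in reserve besides the entry itself, and the two branches compute it differently. The sketch below only restates that difference with stand-in names and values; it is not taken from either source tree, and the real constants live in the blockstore headers.

#include <cstdint>
#include <cstddef>

// Stand-in illustration of the two reservation policies visible in the diff above.
uint64_t fixed_reservation(bool is_instant, uint64_t instant_res, uint64_t stabilize_res)
{
    // zerocopy-t side: a constant reservation chosen by operation type
    return is_instant ? instant_res : stabilize_res;
}

uint64_t dynamic_reservation(size_t unstable_writes, size_t unstable_unsynced, uint64_t journal_block_size)
{
    // kv-debug side: roughly one journal block per write that may still need a stabilize entry
    return (unstable_writes + unstable_unsynced) * journal_block_size;
}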
@@ -17,7 +17,7 @@
 static const char *exe_name = NULL;

 static const char* help_text =
-"Vitastor command-line tool " VERSION "\n"
+"Vitastor command-line tool\n"
 "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
 "\n"
 "COMMANDS:\n"

@@ -331,7 +331,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
 {
 // Create client
 json11::Json cfg_j = cfg;
-p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+p->ringloop = new ring_loop_t(512);
 p->epmgr = new epoll_manager_t(p->ringloop);
 p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
 // Smaller timeout by default for more interactiveness
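Assuming ring_loop_t wraps an io_uring instance (which the surrounding code suggests but this diff does not prove), the constructor argument changed here is essentially the submission queue depth requested from the kernel: a named default on the kv-debug side, a literal 512 on the zerocopy-t side. Purely as an illustration of what such a parameter typically feeds into, a liburing-style initialization looks like this (not Vitastor's actual ring_loop_t code):

#include <liburing.h>
#include <stdexcept>
#include <string>
#include <cstring>

// Hypothetical helper: create and destroy an io_uring with the given queue depth.
void init_ring_example(unsigned queue_depth /* e.g. 512 */)
{
    io_uring ring;
    int r = io_uring_queue_init(queue_depth, &ring, 0);
    if (r < 0)
        throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-r));
    io_uring_queue_exit(&ring);
}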
@@ -109,7 +109,7 @@ resume_1:
 }
 for (auto pg_per_pair: pg_per_osd)
 {
-uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
+uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
 if (pool_avail > pg_free)
 {
 pool_avail = pg_free;

@@ -127,7 +127,6 @@ resume_1:
 { "id", (uint64_t)pool_cfg.id },
 { "name", pool_cfg.name },
 { "pg_count", pool_cfg.pg_count },
-{ "real_pg_count", pool_cfg.real_pg_count },
 { "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
 { "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
 ? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)

@@ -178,7 +177,7 @@ resume_1:
 { "title", "SCHEME" },
 });
 cols.push_back(json11::Json::object{
-{ "key", "pg_count_fmt" },
+{ "key", "pg_count" },
 { "title", "PGS" },
 });
 cols.push_back(json11::Json::object{

@@ -207,9 +206,6 @@ resume_1:
 double raw_to = kv.second["raw_to_usable"].number_value();
 if (raw_to < 0.000001 && raw_to > -0.000001)
 raw_to = 1;
-kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
-? kv.second["real_pg_count"].as_string()
-: kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
 kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
 kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
 kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
@@ -158,7 +158,12 @@ resume_2:
 for (auto & pool_pair: parent->cli->st_cli.pool_config)
 {
 auto & pool_cfg = pool_pair.second;
-bool active = pool_cfg.real_pg_count > 0;
+bool active = true;
+if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
+{
+active = false;
+pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
+}
 pool_count++;
 for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
 {
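A worked example of the accounting the added branch performs, with made-up numbers: a pool configured for 256 PGs of which only 200 appear in pg_config is marked inactive and the remaining 56 PGs are counted as "offline".

#include <cstdint>
#include <cassert>

int main()
{
    // Hypothetical pool state, only to restate the arithmetic of the added branch
    uint64_t pg_count = 256;       // PGs the pool is configured for
    uint64_t pg_config_size = 200; // PGs actually present in the PG configuration
    bool active = true;
    uint64_t offline = 0;
    if (pg_config_size != pg_count)
    {
        active = false;
        offline += pg_count - pg_config_size; // 56 PGs reported as "offline"
    }
    assert(!active && offline == 56);
    return 0;
}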
@@ -6,7 +6,7 @@
 #include "cluster_client_impl.h"
 #include "http_client.h" // json_is_true

-cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
+cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
 {
 wb = new writeback_cache_t();

@@ -64,7 +64,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd

 cluster_client_t::~cluster_client_t()
 {
-msgr.repeer_pgs = [](osd_num_t){};
+msgr.repeer_pgs = [this](osd_num_t){};
 if (ringloop)
 {
 ringloop->unregister_consumer(&consumer);

@@ -454,7 +454,7 @@ bool cluster_client_t::flush()
 wb->start_writebacks(this, 0);
 cluster_op_t *sync = new cluster_op_t;
 sync->opcode = OSD_OP_SYNC;
-sync->callback = [](cluster_op_t *sync)
+sync->callback = [this](cluster_op_t *sync)
 {
 delete sync;
 };

@@ -465,7 +465,7 @@ bool cluster_client_t::flush()
 bool sync_done = false;
 cluster_op_t *sync = new cluster_op_t;
 sync->opcode = OSD_OP_SYNC;
-sync->callback = [&sync_done](cluster_op_t *sync)
+sync->callback = [this, &sync_done](cluster_op_t *sync)
 {
 delete sync;
 sync_done = true;

@@ -532,7 +532,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
 return;
 }
 if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
-!op->version /* no CAS writeback */)
+!op->version /* FIXME no CAS writeback */)
 {
 if (wb->writebacks_active >= client_max_writeback_iodepth)
 {

@@ -553,7 +553,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
 }
 if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
 {
-if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
+if (!(op->flags & OP_FLUSH_BUFFER))
 {
 wb->copy_write(op, CACHE_WRITTEN);
 }

@@ -1152,7 +1152,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
 osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
 );
 }
-else if (log_level > 0)
+else
 {
 fprintf(
 stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",
@@ -121,7 +121,7 @@ public:
 json11::Json::object cli_config, file_config, etcd_global_config;
 json11::Json::object config;

-cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config);
+cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
 ~cluster_client_t();
 void execute(cluster_op_t *op);
 void execute_raw(osd_num_t osd_num, osd_op_t *op);
@@ -263,7 +263,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
 }
 assert(calc_len == op->len);
 writebacks_active++;
-op->callback = [this, flush_id](cluster_op_t* op)
+op->callback = [this, cli, flush_id](cluster_op_t* op)
 {
 // Buffer flushes should be always retried, regardless of the error,
 // so they should never result in an error here

@@ -383,7 +383,7 @@ static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t
 auto begin = (cur_offset < offset ? offset : cur_offset);
 auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
 memcpy(
-(uint8_t*)v.iov_base + begin - cur_offset,
+v.iov_base + begin - cur_offset,
 buf + (cur_offset <= offset ? 0 : cur_offset-offset),
 end - begin
 );
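The only change in the second hunk above is dropping the (uint8_t*) cast on iov_base. Worth noting: arithmetic on a void* is a GNU extension, so the cast is what keeps the expression standard C++. A minimal standalone illustration (buffer and function names made up, not code from either branch):

#include <sys/uio.h>
#include <cstring>
#include <cstdint>

// Copy `len` bytes from `src` into an iovec at byte offset `dst_off`.
// The uint8_t* cast makes the pointer arithmetic valid standard C++;
// `v.iov_base + dst_off` compiles only as a GNU extension because iov_base is void*.
static void copy_into_iov(const iovec & v, size_t dst_off, const void *src, size_t len)
{
    memcpy((uint8_t*)v.iov_base + dst_off, src, len);
}

int main()
{
    char buf[16] = {};
    iovec v = { buf, sizeof(buf) };
    copy_into_iov(v, 4, "test", 4);
    return 0;
}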
@@ -5,7 +5,7 @@
 #include "str_util.h"

 static const char *help_text =
-"Vitastor disk management tool " VERSION "\n"
+"Vitastor disk management tool\n"
 "(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
 "\n"
 "COMMANDS:\n"

@@ -229,7 +229,7 @@ int main(int argc, char *argv[])
 {
 self.options["allow_data_loss"] = "1";
 }
-else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
+else if (argv[i][0] == '-' && argv[i][1] == '-')
 {
 char *key = argv[i]+2;
 self.options[key] = argv[++i];
@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
 if (journal_calc_data_pos != sw.data_offset)
 {
 printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
-: " (mismatched, calculated = %08lx)", journal_pos);
+: " (mismatched, calculated = %lu)", journal_pos);
 }
 uint32_t data_csum_size = (!je_start.csum_block_size
 ? 0

@@ -245,7 +245,7 @@ int disk_tool_t::resize_copy_data()
 {
 iodepth = 32;
 }
-ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
+ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
 dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
 if (dsk.data_fd < 0)
 {
@@ -130,7 +130,7 @@ static int bs_init(struct thread_data *td)
 config[p.first] = p.second.dump();
 }
 }
-bsd->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+bsd->ringloop = new ring_loop_t(512);
 bsd->epmgr = new epoll_manager_t(bsd->ringloop);
 bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
 while (1)
401 src/kv_cli.cpp
@@ -1,401 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
//
|
|
||||||
// Vitastor shared key/value database test CLI
|
|
||||||
|
|
||||||
#define _XOPEN_SOURCE
|
|
||||||
#include <limits.h>
|
|
||||||
|
|
||||||
#include <netinet/tcp.h>
|
|
||||||
#include <sys/epoll.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
//#include <signal.h>
|
|
||||||
|
|
||||||
#include "epoll_manager.h"
|
|
||||||
#include "str_util.h"
|
|
||||||
#include "kv_db.h"
|
|
||||||
|
|
||||||
const char *exe_name = NULL;
|
|
||||||
|
|
||||||
class kv_cli_t
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
kv_dbw_t *db = NULL;
|
|
||||||
ring_loop_t *ringloop = NULL;
|
|
||||||
epoll_manager_t *epmgr = NULL;
|
|
||||||
cluster_client_t *cli = NULL;
|
|
||||||
bool interactive = false;
|
|
||||||
int in_progress = 0;
|
|
||||||
char *cur_cmd = NULL;
|
|
||||||
int cur_cmd_size = 0, cur_cmd_alloc = 0;
|
|
||||||
bool finished = false, eof = false;
|
|
||||||
json11::Json::object cfg;
|
|
||||||
|
|
||||||
~kv_cli_t();
|
|
||||||
|
|
||||||
static json11::Json::object parse_args(int narg, const char *args[]);
|
|
||||||
void run(const json11::Json::object & cfg);
|
|
||||||
void read_cmd();
|
|
||||||
void next_cmd();
|
|
||||||
void handle_cmd(const std::string & cmd, std::function<void()> cb);
|
|
||||||
};
|
|
||||||
|
|
||||||
kv_cli_t::~kv_cli_t()
|
|
||||||
{
|
|
||||||
if (cur_cmd)
|
|
||||||
{
|
|
||||||
free(cur_cmd);
|
|
||||||
cur_cmd = NULL;
|
|
||||||
}
|
|
||||||
cur_cmd_alloc = 0;
|
|
||||||
if (db)
|
|
||||||
delete db;
|
|
||||||
if (cli)
|
|
||||||
{
|
|
||||||
cli->flush();
|
|
||||||
delete cli;
|
|
||||||
}
|
|
||||||
if (epmgr)
|
|
||||||
delete epmgr;
|
|
||||||
if (ringloop)
|
|
||||||
delete ringloop;
|
|
||||||
}
|
|
||||||
|
|
||||||
json11::Json::object kv_cli_t::parse_args(int narg, const char *args[])
|
|
||||||
{
|
|
||||||
json11::Json::object cfg;
|
|
||||||
for (int i = 1; i < narg; i++)
|
|
||||||
{
|
|
||||||
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
|
|
||||||
{
|
|
||||||
printf(
|
|
||||||
"Vitastor Key/Value CLI\n"
|
|
||||||
"(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
|
|
||||||
"\n"
|
|
||||||
"USAGE: %s [--etcd_address ADDR] [OTHER OPTIONS]\n",
|
|
||||||
exe_name
|
|
||||||
);
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
|
||||||
{
|
|
||||||
const char *opt = args[i]+2;
|
|
||||||
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return cfg;
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_cli_t::run(const json11::Json::object & cfg)
|
|
||||||
{
|
|
||||||
// Create client
|
|
||||||
ringloop = new ring_loop_t(512);
|
|
||||||
epmgr = new epoll_manager_t(ringloop);
|
|
||||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
|
||||||
db = new kv_dbw_t(cli);
|
|
||||||
// Load image metadata
|
|
||||||
while (!cli->is_ready())
|
|
||||||
{
|
|
||||||
ringloop->loop();
|
|
||||||
if (cli->is_ready())
|
|
||||||
break;
|
|
||||||
ringloop->wait();
|
|
||||||
}
|
|
||||||
// Run
|
|
||||||
fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
|
|
||||||
try
|
|
||||||
{
|
|
||||||
epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
|
|
||||||
{
|
|
||||||
if (events & EPOLLIN)
|
|
||||||
{
|
|
||||||
read_cmd();
|
|
||||||
}
|
|
||||||
if (events & EPOLLRDHUP)
|
|
||||||
{
|
|
||||||
epmgr->tfd->set_fd_handler(0, false, NULL);
|
|
||||||
finished = true;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
interactive = true;
|
|
||||||
printf("> ");
|
|
||||||
}
|
|
||||||
catch (std::exception & e)
|
|
||||||
{
|
|
||||||
// Can't add to epoll, STDIN is probably a file
|
|
||||||
read_cmd();
|
|
||||||
}
|
|
||||||
while (!finished)
|
|
||||||
{
|
|
||||||
ringloop->loop();
|
|
||||||
if (!finished)
|
|
||||||
ringloop->wait();
|
|
||||||
}
|
|
||||||
// Destroy the client
|
|
||||||
delete db;
|
|
||||||
db = NULL;
|
|
||||||
cli->flush();
|
|
||||||
delete cli;
|
|
||||||
delete epmgr;
|
|
||||||
delete ringloop;
|
|
||||||
cli = NULL;
|
|
||||||
epmgr = NULL;
|
|
||||||
ringloop = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_cli_t::read_cmd()
|
|
||||||
{
|
|
||||||
if (!cur_cmd_alloc)
|
|
||||||
{
|
|
||||||
cur_cmd_alloc = 65536;
|
|
||||||
cur_cmd = (char*)malloc_or_die(cur_cmd_alloc);
|
|
||||||
}
|
|
||||||
while (cur_cmd_size < cur_cmd_alloc)
|
|
||||||
{
|
|
||||||
int r = read(0, cur_cmd+cur_cmd_size, cur_cmd_alloc-cur_cmd_size);
|
|
||||||
if (r < 0 && errno != EAGAIN)
|
|
||||||
fprintf(stderr, "Error reading from stdin: %s\n", strerror(errno));
|
|
||||||
if (r > 0)
|
|
||||||
cur_cmd_size += r;
|
|
||||||
if (r == 0)
|
|
||||||
eof = true;
|
|
||||||
if (r <= 0)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
next_cmd();
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_cli_t::next_cmd()
|
|
||||||
{
|
|
||||||
if (in_progress > 0)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
int pos = 0;
|
|
||||||
for (; pos < cur_cmd_size; pos++)
|
|
||||||
{
|
|
||||||
if (cur_cmd[pos] == '\n' || cur_cmd[pos] == '\r')
|
|
||||||
{
|
|
||||||
auto cmd = trim(std::string(cur_cmd, pos));
|
|
||||||
pos++;
|
|
||||||
memmove(cur_cmd, cur_cmd+pos, cur_cmd_size-pos);
|
|
||||||
cur_cmd_size -= pos;
|
|
||||||
in_progress++;
|
|
||||||
handle_cmd(cmd, [this]()
|
|
||||||
{
|
|
||||||
in_progress--;
|
|
||||||
if (interactive)
|
|
||||||
printf("> ");
|
|
||||||
next_cmd();
|
|
||||||
if (!in_progress)
|
|
||||||
read_cmd();
|
|
||||||
});
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (eof && !in_progress)
|
|
||||||
{
|
|
||||||
finished = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_cli_t::handle_cmd(const std::string & cmd, std::function<void()> cb)
|
|
||||||
{
|
|
||||||
if (cmd == "")
|
|
||||||
{
|
|
||||||
cb();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
auto pos = cmd.find_first_of(" \t");
|
|
||||||
if (pos != std::string::npos)
|
|
||||||
{
|
|
||||||
while (pos < cmd.size()-1 && (cmd[pos+1] == ' ' || cmd[pos+1] == '\t'))
|
|
||||||
pos++;
|
|
||||||
}
|
|
||||||
auto opname = strtolower(pos == std::string::npos ? cmd : cmd.substr(0, pos));
|
|
||||||
if (opname == "open")
|
|
||||||
{
|
|
||||||
uint64_t pool_id = 0;
|
|
||||||
inode_t inode_id = 0;
|
|
||||||
uint32_t kv_block_size = 0;
|
|
||||||
int scanned = sscanf(cmd.c_str() + pos+1, "%lu %lu %u", &pool_id, &inode_id, &kv_block_size);
|
|
||||||
if (scanned == 2)
|
|
||||||
{
|
|
||||||
kv_block_size = 4096;
|
|
||||||
}
|
|
||||||
if (scanned < 2 || !pool_id || !inode_id || !kv_block_size || (kv_block_size & (kv_block_size-1)) != 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Usage: open <pool_id> <inode_id> [block_size]. Block size must be a power of 2. Default is 4096.\n");
|
|
||||||
cb();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
cfg["kv_block_size"] = (uint64_t)kv_block_size;
|
|
||||||
db->open(INODE_WITH_POOL(pool_id, inode_id), cfg, [=](int res)
|
|
||||||
{
|
|
||||||
if (res < 0)
|
|
||||||
fprintf(stderr, "Error opening index: %s (code %d)\n", strerror(-res), res);
|
|
||||||
else
|
|
||||||
printf("Index opened. Current size: %lu bytes\n", db->get_size());
|
|
||||||
cb();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else if (opname == "config")
|
|
||||||
{
|
|
||||||
auto pos2 = cmd.find_first_of(" \t", pos+1);
|
|
||||||
if (pos2 == std::string::npos)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Usage: config <property> <value>\n");
|
|
||||||
cb();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
auto key = trim(cmd.substr(pos+1, pos2-pos-1));
|
|
||||||
auto value = parse_size(trim(cmd.substr(pos2+1)));
|
|
||||||
if (key != "kv_memory_limit" &&
|
|
||||||
key != "kv_allocate_blocks" &&
|
|
||||||
key != "kv_evict_max_misses" &&
|
|
||||||
key != "kv_evict_attempts_per_level" &&
|
|
||||||
key != "kv_evict_unused_age" &&
|
|
||||||
key != "kv_log_level")
|
|
||||||
{
|
|
||||||
fprintf(
|
|
||||||
stderr, "Allowed properties: kv_memory_limit, kv_allocate_blocks,"
|
|
||||||
" kv_evict_max_misses, kv_evict_attempts_per_level, kv_evict_unused_age, kv_log_level\n"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
cfg[key] = value;
|
|
||||||
db->set_config(cfg);
|
|
||||||
}
|
|
||||||
cb();
|
|
||||||
}
|
|
||||||
else if (opname == "get" || opname == "set" || opname == "del")
|
|
||||||
{
|
|
||||||
if (opname == "get" || opname == "del")
|
|
||||||
{
|
|
||||||
if (pos == std::string::npos)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Usage: %s <key>\n", opname.c_str());
|
|
||||||
cb();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
auto key = trim(cmd.substr(pos+1));
|
|
||||||
if (opname == "get")
|
|
||||||
{
|
|
||||||
db->get(key, [this, cb](int res, const std::string & value)
|
|
||||||
{
|
|
||||||
if (res < 0)
|
|
||||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
write(1, value.c_str(), value.size());
|
|
||||||
write(1, "\n", 1);
|
|
||||||
}
|
|
||||||
cb();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
db->del(key, [this, cb](int res)
|
|
||||||
{
|
|
||||||
if (res < 0)
|
|
||||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
|
||||||
else
|
|
||||||
printf("OK\n");
|
|
||||||
cb();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
auto pos2 = cmd.find_first_of(" \t", pos+1);
|
|
||||||
if (pos2 == std::string::npos)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Usage: set <key> <value>\n");
|
|
||||||
cb();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
auto key = trim(cmd.substr(pos+1, pos2-pos-1));
|
|
||||||
auto value = trim(cmd.substr(pos2+1));
|
|
||||||
db->set(key, value, [this, cb](int res)
|
|
||||||
{
|
|
||||||
if (res < 0)
|
|
||||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
|
||||||
else
|
|
||||||
printf("OK\n");
|
|
||||||
cb();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (opname == "list")
|
|
||||||
{
|
|
||||||
std::string start, end;
|
|
||||||
if (pos != std::string::npos)
|
|
||||||
{
|
|
||||||
auto pos2 = cmd.find_first_of(" \t", pos+1);
|
|
||||||
if (pos2 != std::string::npos)
|
|
||||||
{
|
|
||||||
start = trim(cmd.substr(pos+1, pos2-pos-1));
|
|
||||||
end = trim(cmd.substr(pos2+1));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
start = trim(cmd.substr(pos+1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void *handle = db->list_start(start);
|
|
||||||
db->list_next(handle, [=](int res, const std::string & key, const std::string & value)
|
|
||||||
{
|
|
||||||
if (res < 0)
|
|
||||||
{
|
|
||||||
if (res != -ENOENT)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
|
|
||||||
}
|
|
||||||
db->list_close(handle);
|
|
||||||
cb();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
printf("%s = %s\n", key.c_str(), value.c_str());
|
|
||||||
db->list_next(handle, NULL);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else if (opname == "close")
|
|
||||||
{
|
|
||||||
db->close([=]()
|
|
||||||
{
|
|
||||||
printf("Index closed\n");
|
|
||||||
cb();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else if (opname == "quit" || opname == "q")
|
|
||||||
{
|
|
||||||
::close(0);
|
|
||||||
finished = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
fprintf(
|
|
||||||
stderr, "Unknown operation: %s. Supported operations:\n"
|
|
||||||
"open <pool_id> <inode_id> [block_size]\n"
|
|
||||||
"config <property> <value>\n"
|
|
||||||
"get <key>\nset <key> <value>\ndel <key>\nlist [<start> [end]]\n"
|
|
||||||
"close\nquit\n", opname.c_str()
|
|
||||||
);
|
|
||||||
cb();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int narg, const char *args[])
|
|
||||||
{
|
|
||||||
setvbuf(stdout, NULL, _IONBF, 0);
|
|
||||||
setvbuf(stderr, NULL, _IONBF, 0);
|
|
||||||
exe_name = args[0];
|
|
||||||
kv_cli_t *p = new kv_cli_t();
|
|
||||||
p->run(kv_cli_t::parse_args(narg, args));
|
|
||||||
delete p;
|
|
||||||
return 0;
|
|
||||||
}
|
|
2064 src/kv_db.cpp
File diff suppressed because it is too large
36 src/kv_db.h
@@ -1,36 +0,0 @@
-// Copyright (c) Vitaliy Filippov, 2019+
-// License: VNPL-1.1 (see README.md for details)
-//
-// Vitastor shared key/value database
-// Parallel optimistic B-Tree O:-)
-
-#pragma once
-
-#include "cluster_client.h"
-
-struct kv_db_t;
-
-struct kv_dbw_t
-{
-kv_dbw_t(cluster_client_t *cli);
-~kv_dbw_t();
-
-void open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb);
-void set_config(json11::Json cfg);
-void close(std::function<void()> cb);
-
-uint64_t get_size();
-
-void get(const std::string & key, std::function<void(int res, const std::string & value)> cb,
-bool allow_old_cached = false);
-void set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
-std::function<bool(int res, const std::string & value)> cas_compare = NULL);
-void del(const std::string & key, std::function<void(int res)> cb,
-std::function<bool(int res, const std::string & value)> cas_compare = NULL);
-
-void* list_start(const std::string & start);
-void list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb);
-void list_close(void *handle);
-
-kv_db_t *db;
-};
@@ -1,697 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
|
|
||||||
// License: VNPL-1.1 (see README.md for details)
|
|
||||||
//
|
|
||||||
// Vitastor shared key/value database stress tester / benchmark
|
|
||||||
|
|
||||||
#define _XOPEN_SOURCE
|
|
||||||
#include <limits.h>
|
|
||||||
|
|
||||||
#include <netinet/tcp.h>
|
|
||||||
#include <sys/epoll.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
//#include <signal.h>
|
|
||||||
|
|
||||||
#include "epoll_manager.h"
|
|
||||||
#include "str_util.h"
|
|
||||||
#include "kv_db.h"
|
|
||||||
|
|
||||||
const char *exe_name = NULL;
|
|
||||||
|
|
||||||
struct kv_test_listing_t
|
|
||||||
{
|
|
||||||
uint64_t count = 0, done = 0;
|
|
||||||
void *handle = NULL;
|
|
||||||
std::string next_after;
|
|
||||||
std::set<std::string> inflights;
|
|
||||||
timespec tv_begin;
|
|
||||||
bool error = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct kv_test_lat_t
|
|
||||||
{
|
|
||||||
const char *name = NULL;
|
|
||||||
uint64_t usec = 0, count = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct kv_test_stat_t
|
|
||||||
{
|
|
||||||
kv_test_lat_t get, add, update, del, list;
|
|
||||||
uint64_t list_keys = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
class kv_test_t
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
// Config
|
|
||||||
json11::Json::object kv_cfg;
|
|
||||||
std::string key_prefix, key_suffix;
|
|
||||||
uint64_t inode_id = 0;
|
|
||||||
uint64_t op_count = 1000000;
|
|
||||||
uint64_t runtime_sec = 0;
|
|
||||||
uint64_t parallelism = 4;
|
|
||||||
uint64_t reopen_prob = 1;
|
|
||||||
uint64_t get_prob = 30000;
|
|
||||||
uint64_t add_prob = 20000;
|
|
||||||
uint64_t update_prob = 20000;
|
|
||||||
uint64_t del_prob = 5000;
|
|
||||||
uint64_t list_prob = 300;
|
|
||||||
uint64_t min_key_len = 10;
|
|
||||||
uint64_t max_key_len = 70;
|
|
||||||
uint64_t min_value_len = 50;
|
|
||||||
uint64_t max_value_len = 300;
|
|
||||||
uint64_t min_list_count = 10;
|
|
||||||
uint64_t max_list_count = 1000;
|
|
||||||
uint64_t print_stats_interval = 1;
|
|
||||||
bool json_output = false;
|
|
||||||
uint64_t log_level = 1;
|
|
||||||
bool trace = false;
|
|
||||||
bool stop_on_error = false;
|
|
||||||
// FIXME: Multiple clients
|
|
||||||
kv_test_stat_t stat, prev_stat;
|
|
||||||
timespec prev_stat_time, start_stat_time;
|
|
||||||
|
|
||||||
// State
|
|
||||||
kv_dbw_t *db = NULL;
|
|
||||||
ring_loop_t *ringloop = NULL;
|
|
||||||
epoll_manager_t *epmgr = NULL;
|
|
||||||
cluster_client_t *cli = NULL;
|
|
||||||
ring_consumer_t consumer;
|
|
||||||
bool finished = false;
|
|
||||||
uint64_t total_prob = 0;
|
|
||||||
uint64_t ops_sent = 0, ops_done = 0;
|
|
||||||
int stat_timer_id = -1;
|
|
||||||
int in_progress = 0;
|
|
||||||
bool reopening = false;
|
|
||||||
std::set<kv_test_listing_t*> listings;
|
|
||||||
std::set<std::string> changing_keys;
|
|
||||||
std::map<std::string, std::string> values;
|
|
||||||
|
|
||||||
~kv_test_t();
|
|
||||||
|
|
||||||
static json11::Json::object parse_args(int narg, const char *args[]);
|
|
||||||
void parse_config(json11::Json cfg);
|
|
||||||
void run(json11::Json cfg);
|
|
||||||
void loop();
|
|
||||||
void print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time);
|
|
||||||
void print_total_stats();
|
|
||||||
void start_change(const std::string & key);
|
|
||||||
void stop_change(const std::string & key);
|
|
||||||
void add_stat(kv_test_lat_t & stat, timespec tv_begin);
|
|
||||||
};
|
|
||||||
|
|
||||||
kv_test_t::~kv_test_t()
|
|
||||||
{
|
|
||||||
if (db)
|
|
||||||
delete db;
|
|
||||||
if (cli)
|
|
||||||
{
|
|
||||||
cli->flush();
|
|
||||||
delete cli;
|
|
||||||
}
|
|
||||||
if (epmgr)
|
|
||||||
delete epmgr;
|
|
||||||
if (ringloop)
|
|
||||||
delete ringloop;
|
|
||||||
}
|
|
||||||
|
|
||||||
json11::Json::object kv_test_t::parse_args(int narg, const char *args[])
|
|
||||||
{
|
|
||||||
json11::Json::object cfg;
|
|
||||||
for (int i = 1; i < narg; i++)
|
|
||||||
{
|
|
||||||
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
|
|
||||||
{
|
|
||||||
printf(
|
|
||||||
"Vitastor Key/Value DB stress tester / benchmark\n"
|
|
||||||
"(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
|
|
||||||
"\n"
|
|
||||||
"USAGE: %s --pool_id POOL_ID --inode_id INODE_ID [OPTIONS]\n"
|
|
||||||
" --op_count 1000000\n"
|
|
||||||
" Total operations to run during test. 0 means unlimited\n"
|
|
||||||
" --key_prefix \"\"\n"
|
|
||||||
" Prefix for all keys read or written (to avoid collisions)\n"
|
|
||||||
" --key_suffix \"\"\n"
|
|
||||||
" Suffix for all keys read or written (to avoid collisions, but scan all DB)\n"
|
|
||||||
" --runtime 0\n"
|
|
||||||
" Run for this number of seconds. 0 means unlimited\n"
|
|
||||||
" --parallelism 4\n"
|
|
||||||
" Run this number of operations in parallel\n"
|
|
||||||
" --get_prob 30000\n"
|
|
||||||
" Fraction of key retrieve operations\n"
|
|
||||||
" --add_prob 20000\n"
|
|
||||||
" Fraction of key addition operations\n"
|
|
||||||
" --update_prob 20000\n"
|
|
||||||
" Fraction of key update operations\n"
|
|
||||||
" --del_prob 30000\n"
|
|
||||||
" Fraction of key delete operations\n"
|
|
||||||
" --list_prob 300\n"
|
|
||||||
" Fraction of listing operations\n"
|
|
||||||
" --min_key_len 10\n"
|
|
||||||
" Minimum key size in bytes\n"
|
|
||||||
" --max_key_len 70\n"
|
|
||||||
" Maximum key size in bytes\n"
|
|
||||||
" --min_value_len 50\n"
|
|
||||||
" Minimum value size in bytes\n"
|
|
||||||
" --max_value_len 300\n"
|
|
||||||
" Maximum value size in bytes\n"
|
|
||||||
" --min_list_count 10\n"
|
|
||||||
" Minimum number of keys read in listing (0 = all keys)\n"
|
|
||||||
" --max_list_count 1000\n"
|
|
||||||
" Maximum number of keys read in listing\n"
|
|
||||||
" --print_stats 1\n"
|
|
||||||
" Print operation statistics every this number of seconds\n"
|
|
||||||
" --json\n"
|
|
||||||
" JSON output\n"
|
|
||||||
" --stop_on_error 0\n"
|
|
||||||
" Stop on first execution error, mismatch, lost key or extra key during listing\n"
|
|
||||||
" --kv_memory_limit 128M\n"
|
|
||||||
" Maximum memory to use for vitastor-kv index cache\n"
|
|
||||||
" --kv_allocate_blocks 4\n"
|
|
||||||
" Number of PG blocks used for new tree block allocation in parallel\n"
|
|
||||||
" --kv_evict_max_misses 10\n"
|
|
||||||
" Eviction algorithm parameter: retry eviction from another random spot\n"
|
|
||||||
" if this number of keys is used currently or was used recently\n"
|
|
||||||
" --kv_evict_attempts_per_level 3\n"
|
|
||||||
" Retry eviction at most this number of times per tree level, starting\n"
|
|
||||||
" with bottom-most levels\n"
|
|
||||||
" --kv_evict_unused_age 1000\n"
|
|
||||||
" Evict only keys unused during this number of last operations\n"
|
|
||||||
" --kv_log_level 1\n"
|
|
||||||
" Log level. 0 = errors, 1 = warnings, 10 = trace operations\n",
|
|
||||||
exe_name
|
|
||||||
);
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
else if (args[i][0] == '-' && args[i][1] == '-')
|
|
||||||
{
|
|
||||||
const char *opt = args[i]+2;
|
|
||||||
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return cfg;
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::parse_config(json11::Json cfg)
|
|
||||||
{
|
|
||||||
inode_id = INODE_WITH_POOL(cfg["pool_id"].uint64_value(), cfg["inode_id"].uint64_value());
|
|
||||||
if (cfg["op_count"].uint64_value() > 0)
|
|
||||||
op_count = cfg["op_count"].uint64_value();
|
|
||||||
key_prefix = cfg["key_prefix"].string_value();
|
|
||||||
key_suffix = cfg["key_suffix"].string_value();
|
|
||||||
if (cfg["runtime"].uint64_value() > 0)
|
|
||||||
runtime_sec = cfg["runtime"].uint64_value();
|
|
||||||
if (cfg["parallelism"].uint64_value() > 0)
|
|
||||||
parallelism = cfg["parallelism"].uint64_value();
|
|
||||||
if (!cfg["reopen_prob"].is_null())
|
|
||||||
reopen_prob = cfg["reopen_prob"].uint64_value();
|
|
||||||
if (!cfg["get_prob"].is_null())
|
|
||||||
get_prob = cfg["get_prob"].uint64_value();
|
|
||||||
if (!cfg["add_prob"].is_null())
|
|
||||||
add_prob = cfg["add_prob"].uint64_value();
|
|
||||||
if (!cfg["update_prob"].is_null())
|
|
||||||
update_prob = cfg["update_prob"].uint64_value();
|
|
||||||
if (!cfg["del_prob"].is_null())
|
|
||||||
del_prob = cfg["del_prob"].uint64_value();
|
|
||||||
if (!cfg["list_prob"].is_null())
|
|
||||||
list_prob = cfg["list_prob"].uint64_value();
|
|
||||||
if (!cfg["min_key_len"].is_null())
|
|
||||||
min_key_len = cfg["min_key_len"].uint64_value();
|
|
||||||
if (cfg["max_key_len"].uint64_value() > 0)
|
|
||||||
max_key_len = cfg["max_key_len"].uint64_value();
|
|
||||||
if (!cfg["min_value_len"].is_null())
|
|
||||||
min_value_len = cfg["min_value_len"].uint64_value();
|
|
||||||
if (cfg["max_value_len"].uint64_value() > 0)
|
|
||||||
max_value_len = cfg["max_value_len"].uint64_value();
|
|
||||||
if (!cfg["min_list_count"].is_null())
|
|
||||||
min_list_count = cfg["min_list_count"].uint64_value();
|
|
||||||
if (!cfg["max_list_count"].is_null())
|
|
||||||
max_list_count = cfg["max_list_count"].uint64_value();
|
|
||||||
if (!cfg["print_stats"].is_null())
|
|
||||||
print_stats_interval = cfg["print_stats"].uint64_value();
|
|
||||||
if (!cfg["json"].is_null())
|
|
||||||
json_output = true;
|
|
||||||
if (!cfg["stop_on_error"].is_null())
|
|
||||||
stop_on_error = cfg["stop_on_error"].bool_value();
|
|
||||||
if (!cfg["kv_memory_limit"].is_null())
|
|
||||||
kv_cfg["kv_memory_limit"] = cfg["kv_memory_limit"];
|
|
||||||
if (!cfg["kv_allocate_blocks"].is_null())
|
|
||||||
kv_cfg["kv_allocate_blocks"] = cfg["kv_allocate_blocks"];
|
|
||||||
if (!cfg["kv_evict_max_misses"].is_null())
|
|
||||||
kv_cfg["kv_evict_max_misses"] = cfg["kv_evict_max_misses"];
|
|
||||||
if (!cfg["kv_evict_attempts_per_level"].is_null())
|
|
||||||
kv_cfg["kv_evict_attempts_per_level"] = cfg["kv_evict_attempts_per_level"];
|
|
||||||
if (!cfg["kv_evict_unused_age"].is_null())
|
|
||||||
kv_cfg["kv_evict_unused_age"] = cfg["kv_evict_unused_age"];
|
|
||||||
if (!cfg["kv_log_level"].is_null())
|
|
||||||
{
|
|
||||||
log_level = cfg["kv_log_level"].uint64_value();
|
|
||||||
trace = log_level >= 10;
|
|
||||||
kv_cfg["kv_log_level"] = cfg["kv_log_level"];
|
|
||||||
}
|
|
||||||
total_prob = reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob;
|
|
||||||
stat.get.name = "get";
|
|
||||||
stat.add.name = "add";
|
|
||||||
stat.update.name = "update";
|
|
||||||
stat.del.name = "del";
|
|
||||||
stat.list.name = "list";
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::run(json11::Json cfg)
|
|
||||||
{
|
|
||||||
srand48(time(NULL));
|
|
||||||
parse_config(cfg);
|
|
||||||
// Create client
|
|
||||||
ringloop = new ring_loop_t(512);
|
|
||||||
epmgr = new epoll_manager_t(ringloop);
|
|
||||||
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
|
|
||||||
db = new kv_dbw_t(cli);
|
|
||||||
// Load image metadata
|
|
||||||
while (!cli->is_ready())
|
|
||||||
{
|
|
||||||
ringloop->loop();
|
|
||||||
if (cli->is_ready())
|
|
||||||
break;
|
|
||||||
ringloop->wait();
|
|
||||||
}
|
|
||||||
// Run
|
|
||||||
reopening = true;
|
|
||||||
db->open(inode_id, kv_cfg, [this](int res)
|
|
||||||
{
|
|
||||||
reopening = false;
|
|
||||||
if (res < 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: Open index: %d (%s)\n", res, strerror(-res));
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
if (trace)
|
|
||||||
printf("Index opened\n");
|
|
||||||
ringloop->wakeup();
|
|
||||||
});
|
|
||||||
consumer.loop = [this]() { loop(); };
|
|
||||||
ringloop->register_consumer(&consumer);
|
|
||||||
if (print_stats_interval)
|
|
||||||
stat_timer_id = epmgr->tfd->set_timer(print_stats_interval*1000, true, [this](int) { print_stats(prev_stat, prev_stat_time); });
|
|
||||||
clock_gettime(CLOCK_REALTIME, &start_stat_time);
|
|
||||||
prev_stat_time = start_stat_time;
|
|
||||||
while (!finished)
|
|
||||||
{
|
|
||||||
ringloop->loop();
|
|
||||||
if (!finished)
|
|
||||||
ringloop->wait();
|
|
||||||
}
|
|
||||||
if (stat_timer_id >= 0)
|
|
||||||
epmgr->tfd->clear_timer(stat_timer_id);
|
|
||||||
ringloop->unregister_consumer(&consumer);
|
|
||||||
// Print total stats
|
|
||||||
print_total_stats();
|
|
||||||
// Destroy the client
|
|
||||||
delete db;
|
|
||||||
db = NULL;
|
|
||||||
cli->flush();
|
|
||||||
delete cli;
|
|
||||||
delete epmgr;
|
|
||||||
delete ringloop;
|
|
||||||
cli = NULL;
|
|
||||||
epmgr = NULL;
|
|
||||||
ringloop = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char *base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@+/";
|
|
||||||
|
|
||||||
std::string random_str(int len)
|
|
||||||
{
|
|
||||||
std::string str;
|
|
||||||
str.resize(len);
|
|
||||||
for (int i = 0; i < len; i++)
|
|
||||||
{
|
|
||||||
str[i] = base64_chars[lrand48() % 64];
|
|
||||||
}
|
|
||||||
return str;
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::loop()
|
|
||||||
{
|
|
||||||
if (reopening)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (ops_done >= op_count)
|
|
||||||
{
|
|
||||||
finished = true;
|
|
||||||
}
|
|
||||||
while (!finished && ops_sent < op_count && in_progress < parallelism)
|
|
||||||
{
|
|
||||||
uint64_t dice = (lrand48() % total_prob);
|
|
||||||
if (dice < reopen_prob)
|
|
||||||
{
|
|
||||||
reopening = true;
|
|
||||||
db->close([this]()
|
|
||||||
{
|
|
||||||
if (trace)
|
|
||||||
printf("Index closed\n");
|
|
||||||
db->open(inode_id, kv_cfg, [this](int res)
|
|
||||||
{
|
|
||||||
reopening = false;
|
|
||||||
if (res < 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: Reopen index: %d (%s)\n", res, strerror(-res));
|
|
||||||
finished = true;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (trace)
|
|
||||||
printf("Index reopened\n");
|
|
||||||
ringloop->wakeup();
|
|
||||||
});
|
|
||||||
});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
else if (dice < reopen_prob+get_prob)
|
|
||||||
{
|
|
||||||
// get existing
|
|
||||||
auto key = random_str(max_key_len);
|
|
||||||
auto k_it = values.lower_bound(key);
|
|
||||||
if (k_it == values.end())
|
|
||||||
continue;
|
|
||||||
key = k_it->first;
|
|
||||||
if (changing_keys.find(key) != changing_keys.end())
|
|
||||||
continue;
|
|
||||||
in_progress++;
|
|
||||||
ops_sent++;
|
|
||||||
if (trace)
|
|
||||||
printf("get %s\n", key.c_str());
|
|
||||||
timespec tv_begin;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
|
||||||
db->get(key, [this, key, tv_begin](int res, const std::string & value)
|
|
||||||
{
|
|
||||||
add_stat(stat.get, tv_begin);
|
|
||||||
ops_done++;
|
|
||||||
in_progress--;
|
|
||||||
auto it = values.find(key);
|
|
||||||
if (res != (it == values.end() ? -ENOENT : 0))
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: get %s: %d (%s)\n", key.c_str(), res, strerror(-res));
|
|
||||||
if (stop_on_error)
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
else if (it != values.end() && value != it->second)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: get %s: mismatch: %s vs %s\n", key.c_str(), value.c_str(), it->second.c_str());
|
|
||||||
if (stop_on_error)
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
ringloop->wakeup();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
else if (dice < reopen_prob+get_prob+add_prob+update_prob)
|
|
||||||
{
|
|
||||||
bool is_add = false;
|
|
||||||
std::string key;
|
|
||||||
if (dice < reopen_prob+get_prob+add_prob)
|
|
||||||
{
|
|
||||||
// add
|
|
||||||
is_add = true;
|
|
||||||
uint64_t key_len = min_key_len + (max_key_len > min_key_len ? lrand48() % (max_key_len-min_key_len) : 0);
|
|
||||||
key = key_prefix + random_str(key_len) + key_suffix;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// update
|
|
||||||
key = random_str(max_key_len);
|
|
||||||
auto k_it = values.lower_bound(key);
|
|
||||||
if (k_it == values.end())
|
|
||||||
continue;
|
|
||||||
key = k_it->first;
|
|
||||||
}
|
|
||||||
if (changing_keys.find(key) != changing_keys.end())
|
|
||||||
continue;
|
|
||||||
uint64_t value_len = min_value_len + (max_value_len > min_value_len ? lrand48() % (max_value_len-min_value_len) : 0);
|
|
||||||
auto value = random_str(value_len);
|
|
||||||
start_change(key);
|
|
||||||
ops_sent++;
|
|
||||||
in_progress++;
|
|
||||||
if (trace)
|
|
||||||
printf("set %s = %s\n", key.c_str(), value.c_str());
|
|
||||||
timespec tv_begin;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
|
||||||
db->set(key, value, [this, key, value, tv_begin, is_add](int res)
|
|
||||||
{
|
|
||||||
add_stat(is_add ? stat.add : stat.update, tv_begin);
|
|
||||||
stop_change(key);
|
|
||||||
ops_done++;
|
|
||||||
in_progress--;
|
|
||||||
if (res != 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: set %s = %s: %d (%s)\n", key.c_str(), value.c_str(), res, strerror(-res));
|
|
||||||
if (stop_on_error)
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
values[key] = value;
|
|
||||||
}
|
|
||||||
ringloop->wakeup();
|
|
||||||
}, NULL);
|
|
||||||
}
|
|
||||||
else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob)
|
|
||||||
{
|
|
||||||
// delete
|
|
||||||
auto key = random_str(max_key_len);
|
|
||||||
auto k_it = values.lower_bound(key);
|
|
||||||
if (k_it == values.end())
|
|
||||||
continue;
|
|
||||||
key = k_it->first;
|
|
||||||
if (changing_keys.find(key) != changing_keys.end())
|
|
||||||
continue;
|
|
||||||
start_change(key);
|
|
||||||
ops_sent++;
|
|
||||||
in_progress++;
|
|
||||||
if (trace)
|
|
||||||
printf("del %s\n", key.c_str());
|
|
||||||
timespec tv_begin;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_begin);
|
|
||||||
db->del(key, [this, key, tv_begin](int res)
|
|
||||||
{
|
|
||||||
add_stat(stat.del, tv_begin);
|
|
||||||
stop_change(key);
|
|
||||||
ops_done++;
|
|
||||||
in_progress--;
|
|
||||||
if (res != 0)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: del %s: %d (%s)\n", key.c_str(), res, strerror(-res));
|
|
||||||
if (stop_on_error)
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
values.erase(key);
|
|
||||||
}
|
|
||||||
ringloop->wakeup();
|
|
||||||
}, NULL);
|
|
||||||
}
|
|
||||||
else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob)
|
|
||||||
{
|
|
||||||
// list
|
|
||||||
ops_sent++;
|
|
||||||
in_progress++;
|
|
||||||
auto key = random_str(max_key_len);
|
|
||||||
auto lst = new kv_test_listing_t;
|
|
||||||
auto k_it = values.lower_bound(key);
|
|
||||||
lst->count = min_list_count + (max_list_count > min_list_count ? lrand48() % (max_list_count-min_list_count) : 0);
|
|
||||||
lst->handle = db->list_start(k_it == values.begin() ? key_prefix : key);
|
|
||||||
lst->next_after = k_it == values.begin() ? key_prefix : key;
|
|
||||||
lst->inflights = changing_keys;
|
|
||||||
listings.insert(lst);
|
|
||||||
if (trace)
|
|
||||||
printf("list from %s\n", key.c_str());
|
|
||||||
clock_gettime(CLOCK_REALTIME, &lst->tv_begin);
|
|
||||||
db->list_next(lst->handle, [this, lst](int res, const std::string & key, const std::string & value)
|
|
||||||
{
|
|
||||||
if (log_level >= 11)
|
|
||||||
printf("list: %s = %s\n", key.c_str(), value.c_str());
|
|
||||||
if (res >= 0 && key_prefix.size() && (key.size() < key_prefix.size() ||
|
|
||||||
key.substr(0, key_prefix.size()) != key_prefix))
|
|
||||||
{
|
|
||||||
// stop at this key
|
|
||||||
res = -ENOENT;
|
|
||||||
}
|
|
||||||
if (res < 0 || (lst->count > 0 && lst->done >= lst->count))
|
|
||||||
{
|
|
||||||
add_stat(stat.list, lst->tv_begin);
|
|
||||||
if (res == 0)
|
|
||||||
{
|
|
||||||
// ok (done >= count)
|
|
||||||
}
|
|
||||||
else if (res != -ENOENT)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: list: %d (%s)\n", res, strerror(-res));
|
|
||||||
lst->error = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
|
|
||||||
while (k_it != values.end())
|
|
||||||
{
|
|
||||||
while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
|
|
||||||
k_it++;
|
|
||||||
if (k_it != values.end())
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: list: missing key %s\n", (k_it++)->first.c_str());
|
|
||||||
lst->error = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (lst->error && stop_on_error)
|
|
||||||
exit(1);
|
|
||||||
ops_done++;
|
|
||||||
in_progress--;
|
|
||||||
db->list_close(lst->handle);
|
|
||||||
delete lst;
|
|
||||||
listings.erase(lst);
|
|
||||||
ringloop->wakeup();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
stat.list_keys++;
|
|
||||||
// Do not check modified keys in listing
|
|
||||||
// Listing may return their old or new state
|
|
||||||
if ((!key_suffix.size() || key.size() >= key_suffix.size() &&
|
|
||||||
key.substr(key.size()-key_suffix.size()) == key_suffix) &&
|
|
||||||
lst->inflights.find(key) == lst->inflights.end())
|
|
||||||
{
|
|
||||||
lst->done++;
|
|
||||||
auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
|
|
||||||
{
|
|
||||||
k_it++;
|
|
||||||
}
|
|
||||||
if (k_it == values.end() || k_it->first > key)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: list: extra key %s\n", key.c_str());
|
|
||||||
lst->error = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else if (k_it->first < key)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: list: missing key %s\n", k_it->first.c_str());
|
|
||||||
lst->error = true;
|
|
||||||
lst->next_after = k_it->first;
|
|
||||||
k_it++;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (k_it->second != value)
|
|
||||||
{
|
|
||||||
fprintf(stderr, "ERROR: list: mismatch: %s = %s but should be %s\n",
|
|
||||||
key.c_str(), value.c_str(), k_it->second.c_str());
|
|
||||||
lst->error = true;
|
|
||||||
}
|
|
||||||
lst->next_after = k_it->first;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
db->list_next(lst->handle, NULL);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::add_stat(kv_test_lat_t & stat, timespec tv_begin)
|
|
||||||
{
|
|
||||||
timespec tv_end;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
|
||||||
int64_t usec = (tv_end.tv_sec - tv_begin.tv_sec)*1000000 +
|
|
||||||
(tv_end.tv_nsec - tv_begin.tv_nsec)/1000;
|
|
||||||
if (usec > 0)
|
|
||||||
{
|
|
||||||
stat.usec += usec;
|
|
||||||
stat.count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time)
|
|
||||||
{
|
|
||||||
timespec cur_stat_time;
|
|
||||||
clock_gettime(CLOCK_REALTIME, &cur_stat_time);
|
|
||||||
int64_t usec = (cur_stat_time.tv_sec - prev_stat_time.tv_sec)*1000000 +
|
|
||||||
(cur_stat_time.tv_nsec - prev_stat_time.tv_nsec)/1000;
|
|
||||||
if (usec > 0)
|
|
||||||
{
|
|
||||||
kv_test_lat_t *lats[] = { &stat.get, &stat.add, &stat.update, &stat.del, &stat.list };
|
|
||||||
kv_test_lat_t *prev[] = { &prev_stat.get, &prev_stat.add, &prev_stat.update, &prev_stat.del, &prev_stat.list };
|
|
||||||
if (!json_output)
|
|
||||||
{
|
|
||||||
char buf[128] = { 0 };
|
|
||||||
for (int i = 0; i < sizeof(lats)/sizeof(lats[0]); i++)
|
|
||||||
{
|
|
||||||
snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%lu us)", (lats[i]->count-prev[i]->count)*1000000.0/usec,
|
|
||||||
lats[i]->name, (lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count > 0 ? lats[i]->count-prev[i]->count : 1));
|
|
||||||
int k;
|
|
||||||
for (k = strlen(buf); k < strlen(lats[i]->name)+21; k++)
|
|
||||||
buf[k] = ' ';
|
|
||||||
buf[k] = 0;
|
|
||||||
printf("%s", buf);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
int64_t runtime = (cur_stat_time.tv_sec - start_stat_time.tv_sec)*1000000 +
|
|
||||||
(cur_stat_time.tv_nsec - start_stat_time.tv_nsec)/1000;
|
|
||||||
printf("{\"runtime\":%.1f", (double)runtime/1000000.0);
|
|
||||||
for (int i = 0; i < sizeof(lats)/sizeof(lats[0]); i++)
|
|
||||||
{
|
|
||||||
if (lats[i]->count > prev[i]->count)
|
|
||||||
{
|
|
||||||
printf(
|
|
||||||
",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%lu},\"total\":{\"count\":%lu,\"usec\":%lu}}",
|
|
||||||
lats[i]->name, (lats[i]->count-prev[i]->count)*1000000.0/usec,
|
|
||||||
(lats[i]->usec-prev[i]->usec)/(lats[i]->count-prev[i]->count),
|
|
||||||
lats[i]->count, lats[i]->usec
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
printf("}\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prev_stat = stat;
|
|
||||||
prev_stat_time = cur_stat_time;
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::print_total_stats()
|
|
||||||
{
|
|
||||||
if (!json_output)
|
|
||||||
printf("Total:\n");
|
|
||||||
kv_test_stat_t start_stats;
|
|
||||||
timespec start_stat_time = this->start_stat_time;
|
|
||||||
print_stats(start_stats, start_stat_time);
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::start_change(const std::string & key)
|
|
||||||
{
|
|
||||||
changing_keys.insert(key);
|
|
||||||
for (auto lst: listings)
|
|
||||||
{
|
|
||||||
lst->inflights.insert(key);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void kv_test_t::stop_change(const std::string & key)
|
|
||||||
{
|
|
||||||
changing_keys.erase(key);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int narg, const char *args[])
|
|
||||||
{
|
|
||||||
setvbuf(stdout, NULL, _IONBF, 0);
|
|
||||||
setvbuf(stderr, NULL, _IONBF, 0);
|
|
||||||
exe_name = args[0];
|
|
||||||
kv_test_t *p = new kv_test_t();
|
|
||||||
p->run(kv_test_t::parse_args(narg, args));
|
|
||||||
delete p;
|
|
||||||
return 0;
|
|
||||||
}
|
|
@@ -22,7 +22,7 @@ void osd_messenger_t::init()
|
|||||||
{
|
{
|
||||||
rdma_context = msgr_rdma_context_t::create(
|
rdma_context = msgr_rdma_context_t::create(
|
||||||
rdma_device != "" ? rdma_device.c_str() : NULL,
|
rdma_device != "" ? rdma_device.c_str() : NULL,
|
||||||
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
|
rdma_port_num, rdma_gid_index, rdma_mtu, log_level
|
||||||
);
|
);
|
||||||
if (!rdma_context)
|
if (!rdma_context)
|
||||||
{
|
{
|
||||||
@@ -42,6 +42,12 @@ void osd_messenger_t::init()
|
|||||||
handle_rdma_events();
|
handle_rdma_events();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifndef SO_ZEROCOPY
|
||||||
|
if (log_level > 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Zero-copy TCP send is not supported in this build, ignoring\n");
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||||
{
|
{
|
||||||
@@ -167,13 +173,14 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
|||||||
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||||
this->rdma_max_msg = 129*1024;
|
this->rdma_max_msg = 129*1024;
|
||||||
this->rdma_odp = config["rdma_odp"].bool_value();
|
|
||||||
#endif
|
#endif
|
||||||
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
|
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
|
||||||
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
|
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
|
||||||
this->receive_buffer_size = 65536;
|
this->receive_buffer_size = 65536;
|
||||||
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
||||||
config["use_sync_send_recv"].uint64_value();
|
config["use_sync_send_recv"].uint64_value();
|
||||||
|
this->use_zerocopy_send = config["use_zerocopy_send"].bool_value() ||
|
||||||
|
config["use_zerocopy_send"].uint64_value();
|
||||||
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
||||||
if (!this->peer_connect_interval)
|
if (!this->peer_connect_interval)
|
||||||
this->peer_connect_interval = 5;
|
this->peer_connect_interval = 5;
|
||||||
@@ -304,8 +311,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
|||||||
on_connect_peer(peer_osd, -result);
|
on_connect_peer(peer_osd, -result);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int one = 1;
|
set_socket_options(cl);
|
||||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
|
||||||
cl->peer_state = PEER_CONNECTED;
|
cl->peer_state = PEER_CONNECTED;
|
||||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||||
{
|
{
|
||||||
@@ -315,6 +321,23 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
     check_peer_config(cl);
 }
 
+void osd_messenger_t::set_socket_options(osd_client_t *cl)
+{
+    int one = 1;
+    setsockopt(cl->peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
+#ifdef SO_ZEROCOPY
+    if (!use_zerocopy_send)
+        cl->zerocopy_send = false;
+    else if (setsockopt(cl->peer_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) != 0)
+    {
+        if (log_level > 0)
+            fprintf(stderr, "[OSD %lu] Failed to enable zero-copy send for client %d: %s\n", this->osd_num, cl->peer_fd, strerror(errno));
+    }
+    else
+        cl->zerocopy_send = true;
+#endif
+}
+
 void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
 {
     // Mark client as ready (i.e. some data is available)
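For readers unfamiliar with the mechanism the new set_socket_options() relies on, here is a minimal standalone sketch (not taken from the patch; the helper name is made up): SO_ZEROCOPY is opt-in per socket and may be refused by older kernels, in which case the caller simply keeps using ordinary copying sends.

// Minimal sketch, assuming Linux >= 4.14: enable TCP_NODELAY and opt the socket
// into zero-copy sends. Returns whether MSG_ZEROCOPY may be attempted later.
#include <cstdio>
#include <cstring>
#include <cerrno>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static bool enable_zerocopy_send(int fd)
{
    int one = 1;
    setsockopt(fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
#ifdef SO_ZEROCOPY
    if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) == 0)
        return true;
    fprintf(stderr, "SO_ZEROCOPY not supported: %s\n", strerror(errno));
#endif
    return false;
}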
@@ -491,14 +514,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
                 fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
             }
             cl->peer_state = PEER_RDMA;
-            tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
-            {
-                // Do not miss the disconnection!
-                if (epoll_events & EPOLLRDHUP)
-                {
-                    handle_peer_epoll(peer_fd, epoll_events);
-                }
-            });
+            tfd->set_fd_handler(cl->peer_fd, false, NULL);
             // Add the initial receive request
             try_recv_rdma(cl);
         }

@@ -523,14 +539,13 @@ void osd_messenger_t::accept_connections(int listen_fd)
             fprintf(stderr, "[OSD %lu] new client %d: connection from %s\n", this->osd_num, peer_fd,
                 addr_to_string(addr).c_str());
             fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
-            int one = 1;
-            setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
-            clients[peer_fd] = new osd_client_t();
-            clients[peer_fd]->peer_addr = addr;
-            clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
-            clients[peer_fd]->peer_fd = peer_fd;
-            clients[peer_fd]->peer_state = PEER_CONNECTED;
-            clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
+            auto cl = clients[peer_fd] = new osd_client_t();
+            cl->peer_addr = addr;
+            cl->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
+            cl->peer_fd = peer_fd;
+            cl->peer_state = PEER_CONNECTED;
+            cl->in_buf = malloc_or_die(receive_buffer_size);
+            set_socket_options(cl);
             // Add FD to epoll
             tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
             {
@@ -45,6 +45,12 @@ struct msgr_rdma_connection_t;
 struct msgr_rdma_context_t;
 #endif
 
+struct msgr_zc_not_t
+{
+    osd_op_t *op;
+    uint32_t nsend;
+};
+
 struct osd_client_t
 {
     int refs = 0;

@@ -57,6 +63,7 @@ struct osd_client_t
     int ping_time_remaining = 0;
     int idle_time_remaining = 0;
     osd_num_t osd_num = 0;
+    bool zerocopy_send = false;
 
     void *in_buf = NULL;
 

@@ -87,6 +94,12 @@ struct osd_client_t
     int write_state = 0;
     std::vector<iovec> send_list, next_send_list;
     std::vector<msgr_sendp_t> outbox, next_outbox;
+    std::vector<msgr_zc_not_t> zerocopy_sent;
+    uint64_t outbox_size = 0, next_outbox_size = 0;
+    uint32_t zerocopy_notification_idx = 0;
+    uint32_t zerocopy_notification_prev = 0;
+    uint8_t zerocopy_notification_buf[256];
+    struct msghdr zerocopy_notification_msg;
 
     ~osd_client_t();
 };

@@ -123,6 +136,7 @@ protected:
     int osd_ping_timeout = 0;
     int log_level = 0;
     bool use_sync_send_recv = false;
+    bool use_zerocopy_send = false;
 
 #ifdef WITH_RDMA
     bool use_rdma = true;

@@ -131,7 +145,6 @@ protected:
     msgr_rdma_context_t *rdma_context = NULL;
     uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
     uint64_t rdma_max_msg = 0;
-    bool rdma_odp = false;
 #endif
 
     std::vector<int> read_ready_clients;

@@ -186,9 +199,11 @@ protected:
     void check_peer_config(osd_client_t *cl);
     void cancel_osd_ops(osd_client_t *cl);
     void cancel_op(osd_op_t *op);
+    void set_socket_options(osd_client_t *cl);
 
     bool try_send(osd_client_t *cl);
     void handle_send(int result, osd_client_t *cl);
+    void handle_zerocopy_notification(osd_client_t *cl, int res);
 
     bool handle_read(int result, osd_client_t *cl);
     bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);

@@ -198,9 +213,7 @@ protected:
     void handle_reply_ready(osd_op_t *op);
 
 #ifdef WITH_RDMA
-    void try_send_rdma(osd_client_t *cl);
-    void try_send_rdma_odp(osd_client_t *cl);
-    void try_send_rdma_nodp(osd_client_t *cl);
+    bool try_send_rdma(osd_client_t *cl);
     bool try_recv_rdma(osd_client_t *cl);
     void handle_rdma_events();
 #endif
@@ -47,29 +47,11 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
     if (qp)
         ibv_destroy_qp(qp);
     if (recv_buffers.size())
-    {
         for (auto b: recv_buffers)
-        {
-            if (b.mr)
-                ibv_dereg_mr(b.mr);
-            free(b.buf);
-        }
-        recv_buffers.clear();
-    }
-    if (send_out.mr)
-    {
-        ibv_dereg_mr(send_out.mr);
-        send_out.mr = NULL;
-    }
-    if (send_out.buf)
-    {
-        free(send_out.buf);
-        send_out.buf = NULL;
-    }
-    send_out_size = 0;
+            free(b);
 }
 
-msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
+msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
 {
     int res;
     ibv_device **dev_list = NULL;
@@ -154,27 +136,21 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
         fprintf(stderr, "Couldn't query RDMA device for its features\n");
         goto cleanup;
     }
-    ctx->odp = odp;
-    if (ctx->odp &&
-        (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
+    if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
         !(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
         !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
-        !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
+        !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
     {
-        ctx->odp = false;
-        if (log_level > 0)
-            fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable, disabling it\n");
+        fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
+        goto cleanup;
     }
     }
 
-    if (ctx->odp)
+    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
+    if (!ctx->mr)
     {
-        ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
-        if (!ctx->mr)
-        {
-            fprintf(stderr, "Couldn't register RDMA memory region\n");
-            goto cleanup;
-        }
+        fprintf(stderr, "Couldn't register RDMA memory region\n");
+        goto cleanup;
     }
 
     ctx->channel = ibv_create_comp_channel(ctx->context);
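As background for the hunk above, a hedged sketch (written under the assumption of an ODP-capable HCA, not code from this repository): registering one implicit On-Demand Paging region over the whole address space lets any heap buffer be used for RDMA without per-buffer ibv_reg_mr()/ibv_dereg_mr() calls, which is what makes the separate registered-buffer bookkeeping removable.

// Minimal sketch: query ODP capabilities and register a single implicit ODP MR.
#include <cstdio>
#include <cstdint>
#include <infiniband/verbs.h>

static ibv_mr *register_implicit_odp(ibv_context *ctx, ibv_pd *pd)
{
    ibv_device_attr_ex attrx;
    if (ibv_query_device_ex(ctx, NULL, &attrx) != 0 ||
        !(attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
        !(attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT))
    {
        fprintf(stderr, "implicit ODP is not supported by this device\n");
        return NULL;
    }
    // addr = NULL with length = SIZE_MAX means "the whole address space"
    return ibv_reg_mr(pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
}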
@@ -389,34 +365,12 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
     cl->rdma_conn->cur_send++;
 }
 
-static int try_send_rdma_copy(osd_client_t *cl, uint8_t *dst, int dst_len)
-{
-    auto rc = cl->rdma_conn;
-    int total_dst_len = dst_len;
-    while (dst_len > 0 && rc->send_pos < cl->send_list.size())
-    {
-        iovec & iov = cl->send_list[rc->send_pos];
-        uint32_t len = (uint32_t)(iov.iov_len-rc->send_buf_pos < dst_len
-            ? iov.iov_len-rc->send_buf_pos : dst_len);
-        memcpy(dst, iov.iov_base+rc->send_buf_pos, len);
-        dst += len;
-        dst_len -= len;
-        rc->send_buf_pos += len;
-        if (rc->send_buf_pos >= iov.iov_len)
-        {
-            rc->send_pos++;
-            rc->send_buf_pos = 0;
-        }
-    }
-    return total_dst_len-dst_len;
-}
-
-void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
+bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
 {
     auto rc = cl->rdma_conn;
     if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
     {
-        return;
+        return true;
     }
     uint64_t op_size = 0, op_sge = 0;
     ibv_sge sge[rc->max_sge];
@@ -454,70 +408,15 @@ void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
         rc->send_sizes.push_back(op_size);
         try_send_rdma_wr(cl, sge, op_sge);
     }
+    return true;
 }
 
-void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
-{
-    auto rc = cl->rdma_conn;
-    if (!rc->send_out_size)
-    {
-        // Allocate send ring buffer, if not yet
-        rc->send_out_size = rc->max_msg*rdma_max_send;
-        rc->send_out.buf = malloc_or_die(rc->send_out_size);
-        if (!rdma_context->odp)
-        {
-            rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
-            if (!rc->send_out.mr)
-            {
-                fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
-                exit(1);
-            }
-        }
-    }
-    // Copy data into the buffer and send it
-    uint8_t *dst = NULL;
-    int dst_len = 0;
-    int copied = 1;
-    while (!rc->send_out_full && copied > 0 && rc->cur_send < rc->max_send)
-    {
-        dst = (uint8_t*)rc->send_out.buf + rc->send_out_pos;
-        dst_len = (rc->send_out_pos < rc->send_out_size ? rc->send_out_size-rc->send_out_pos : rc->send_done_pos-rc->send_out_pos);
-        if (dst_len > rc->max_msg)
-            dst_len = rc->max_msg;
-        copied = try_send_rdma_copy(cl, dst, dst_len);
-        if (copied > 0)
-        {
-            rc->send_out_pos += copied;
-            if (rc->send_out_pos == rc->send_out_size)
-                rc->send_out_pos = 0;
-            assert(rc->send_out_pos < rc->send_out_size);
-            if (rc->send_out_pos >= rc->send_done_pos)
-                rc->send_out_full = true;
-            ibv_sge sge = {
-                .addr = (uintptr_t)dst,
-                .length = (uint32_t)copied,
-                .lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
-            };
-            try_send_rdma_wr(cl, &sge, 1);
-            rc->send_sizes.push_back(copied);
-        }
-    }
-}
-
-void osd_messenger_t::try_send_rdma(osd_client_t *cl)
-{
-    if (rdma_context->odp)
-        try_send_rdma_odp(cl);
-    else
-        try_send_rdma_nodp(cl);
-}
-
-static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
+static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
 {
     ibv_sge sge = {
-        .addr = (uintptr_t)b.buf,
+        .addr = (uintptr_t)buf,
         .length = (uint32_t)cl->rdma_conn->max_msg,
-        .lkey = cl->rdma_conn->ctx->odp ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
+        .lkey = cl->rdma_conn->ctx->mr->lkey,
     };
     ibv_recv_wr *bad_wr = NULL;
     ibv_recv_wr wr = {
@@ -539,19 +438,9 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
     auto rc = cl->rdma_conn;
     while (rc->cur_recv < rc->max_recv)
     {
-        msgr_rdma_buf_t b;
-        b.buf = malloc_or_die(rc->max_msg);
-        if (!rdma_context->odp)
-        {
-            b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
-            if (!b.mr)
-            {
-                fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
-                exit(1);
-            }
-        }
-        rc->recv_buffers.push_back(b);
-        try_recv_rdma_wr(cl, b);
+        void *buf = malloc_or_die(rc->max_msg);
+        rc->recv_buffers.push_back(buf);
+        try_recv_rdma_wr(cl, buf);
     }
     return true;
 }
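For contrast with the ODP path, a hedged sketch of what the removed non-ODP receive path has to do (illustrative only; the struct and helper names below are made up): every receive buffer is pinned and registered as its own memory region and must be deregistered before it is freed.

// Sketch of explicit per-buffer registration, the pattern non-ODP code needs.
#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <infiniband/verbs.h>

struct pinned_buf
{
    void *buf = NULL;
    ibv_mr *mr = NULL;
};

static pinned_buf alloc_recv_buffer(ibv_pd *pd, size_t size)
{
    pinned_buf b;
    b.buf = malloc(size);
    b.mr = b.buf ? ibv_reg_mr(pd, b.buf, size, IBV_ACCESS_LOCAL_WRITE) : NULL;
    if (!b.mr)
        fprintf(stderr, "failed to register receive buffer\n");
    return b;
}

static void free_recv_buffer(pinned_buf & b)
{
    if (b.mr)
        ibv_dereg_mr(b.mr);
    free(b.buf);
    b = pinned_buf();
}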
@@ -603,7 +492,7 @@ void osd_messenger_t::handle_rdma_events()
                 if (!is_send)
                 {
                     rc->cur_recv--;
-                    if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
+                    if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
                     {
                         // handle_read_buffer may stop the client
                         continue;

@@ -616,14 +505,6 @@ void osd_messenger_t::handle_rdma_events()
                     rc->cur_send--;
                     uint64_t sent_size = rc->send_sizes.at(0);
                     rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
-                    if (!rdma_context->odp)
-                    {
-                        rc->send_done_pos += sent_size;
-                        rc->send_out_full = false;
-                        if (rc->send_done_pos == rc->send_out_size)
-                            rc->send_done_pos = 0;
-                        assert(rc->send_done_pos < rc->send_out_size);
-                    }
                     int send_pos = 0, send_buf_pos = 0;
                     while (sent_size > 0)
                     {
@@ -23,7 +23,6 @@ struct msgr_rdma_context_t
     ibv_device *dev = NULL;
     ibv_device_attr_ex attrx;
     ibv_pd *pd = NULL;
-    bool odp = false;
     ibv_mr *mr = NULL;
     ibv_comp_channel *channel = NULL;
     ibv_cq *cq = NULL;

@@ -36,16 +35,10 @@ struct msgr_rdma_context_t
     int max_cqe = 0;
     int used_max_cqe = 0;
 
-    static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
+    static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level);
     ~msgr_rdma_context_t();
 };
 
-struct msgr_rdma_buf_t
-{
-    void *buf = NULL;
-    ibv_mr *mr = NULL;
-};
-
 struct msgr_rdma_connection_t
 {
     msgr_rdma_context_t *ctx = NULL;

@@ -57,11 +50,8 @@ struct msgr_rdma_connection_t
 
     int send_pos = 0, send_buf_pos = 0;
     int next_recv_buf = 0;
-    std::vector<msgr_rdma_buf_t> recv_buffers;
+    std::vector<void*> recv_buffers;
     std::vector<uint64_t> send_sizes;
-    msgr_rdma_buf_t send_out;
-    int send_out_pos = 0, send_done_pos = 0, send_out_size = 0;
-    bool send_out_full = false;
 
     ~msgr_rdma_connection_t();
     static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);
@@ -3,10 +3,15 @@
 
 #define _XOPEN_SOURCE
 #include <limits.h>
-#include <sys/epoll.h>
 
 #include "messenger.h"
 
+#include <linux/errqueue.h>
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0
+#endif
+
 void osd_messenger_t::outbox_push(osd_op_t *cur_op)
 {
     assert(cur_op->peer_fd);

@@ -37,6 +42,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
     }
     auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
     auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
+    auto & to_size = cl->write_msg.msg_iovlen ? cl->next_outbox_size : cl->outbox_size;
     if (cur_op->op_type == OSD_OP_IN)
     {
         measure_exec(cur_op);

@@ -47,6 +53,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
         to_send_list.push_back((iovec){ .iov_base = cur_op->req.buf, .iov_len = OSD_PACKET_SIZE });
         cl->sent_ops[cur_op->req.hdr.id] = cur_op;
     }
+    to_size += OSD_PACKET_SIZE;
     to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = MSGR_SENDP_HDR });
     // Bitmap
     if (cur_op->op_type == OSD_OP_IN &&

@@ -58,6 +65,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
             .iov_len = cur_op->reply.sec_rw.attr_len,
         });
         to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        to_size += cur_op->reply.sec_rw.attr_len;
     }
     else if (cur_op->op_type == OSD_OP_OUT &&
         (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&

@@ -68,6 +76,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
             .iov_len = cur_op->req.sec_rw.attr_len,
         });
         to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        to_size += cur_op->req.sec_rw.attr_len;
     }
     // Operation data
     if ((cur_op->op_type == OSD_OP_IN

@@ -90,15 +99,22 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
             assert(cur_op->iov.buf[i].iov_base);
             to_send_list.push_back(cur_op->iov.buf[i]);
             to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+            to_size += cur_op->iov.buf[i].iov_len;
         }
         }
     }
     if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
     {
         if (cur_op->op_type == OSD_OP_IN && cur_op->reply.hdr.retval > 0)
+        {
             to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->reply.hdr.retval });
+            to_size += cur_op->reply.hdr.retval;
+        }
         else if (cur_op->op_type == OSD_OP_OUT && cur_op->req.sec_read_bmp.len > 0)
+        {
             to_send_list.push_back((iovec){ .iov_base = cur_op->buf, .iov_len = (size_t)cur_op->req.sec_read_bmp.len });
+            to_size += cur_op->req.sec_read_bmp.len;
+        }
         to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
     }
     if (cur_op->op_type == OSD_OP_IN)
@@ -184,17 +200,19 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
         }
         cl->write_msg.msg_iov = cl->send_list.data();
         cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
         cl->refs++;
         ring_data_t* data = ((ring_data_t*)sqe->user_data);
         data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
-        my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
+        my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, cl->write_msg.msg_flags);
     }
     else
     {
         cl->write_msg.msg_iov = cl->send_list.data();
         cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
+        cl->write_msg.msg_flags = (cl->zerocopy_send && (cl->outbox_size/cl->send_list.size()) >= 4096 ? MSG_ZEROCOPY : 0);
         cl->refs++;
-        int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
+        int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL | cl->write_msg.msg_flags);
         if (result < 0)
         {
             result = -errno;
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void osd_messenger_t::handle_zerocopy_notification(osd_client_t *cl, int res)
|
||||||
|
{
|
||||||
|
cl->refs--;
|
||||||
|
if (cl->peer_state == PEER_STOPPED)
|
||||||
|
{
|
||||||
|
if (cl->refs <= 0)
|
||||||
|
{
|
||||||
|
delete cl;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (res != 0)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (cl->zerocopy_notification_msg.msg_flags & MSG_CTRUNC)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "zero-copy send notification truncated on client socket %d\n", cl->peer_fd);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (struct cmsghdr *cm = CMSG_FIRSTHDR(&cl->zerocopy_notification_msg); cm; cm = CMSG_NXTHDR(&cl->zerocopy_notification_msg, cm))
|
||||||
|
{
|
||||||
|
if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR)
|
||||||
|
{
|
||||||
|
struct sock_extended_err *serr = (struct sock_extended_err*)CMSG_DATA(cm);
|
||||||
|
if (serr->ee_errno == 0 && serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
|
||||||
|
{
|
||||||
|
// completed sends numbered serr->ee_info .. serr->ee_data
|
||||||
|
int start = 0;
|
||||||
|
while (start < cl->zerocopy_sent.size() && cl->zerocopy_sent[start].nsend < serr->ee_info)
|
||||||
|
start++;
|
||||||
|
int end = start;
|
||||||
|
if (serr->ee_data < serr->ee_info)
|
||||||
|
{
|
||||||
|
// counter has wrapped around
|
||||||
|
while (end < cl->zerocopy_sent.size() && cl->zerocopy_sent[end].nsend >= cl->zerocopy_sent[start].nsend)
|
||||||
|
end++;
|
||||||
|
}
|
||||||
|
while (end < cl->zerocopy_sent.size() && cl->zerocopy_sent[end].nsend <= serr->ee_data)
|
||||||
|
end++;
|
||||||
|
if (end > start)
|
||||||
|
{
|
||||||
|
for (int i = start; i < end; i++)
|
||||||
|
{
|
||||||
|
delete cl->zerocopy_sent[i].op;
|
||||||
|
}
|
||||||
|
cl->zerocopy_sent.erase(
|
||||||
|
cl->zerocopy_sent.begin() + start,
|
||||||
|
cl->zerocopy_sent.begin() + end
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void osd_messenger_t::send_replies()
|
void osd_messenger_t::send_replies()
|
||||||
{
|
{
|
||||||
for (int i = 0; i < write_ready_clients.size(); i++)
|
for (int i = 0; i < write_ready_clients.size(); i++)
|
||||||
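To make the notification flow easier to follow, here is a self-contained sketch (assumptions: Linux 4.14 or newer and a TCP socket with SO_ZEROCOPY enabled; this is not the project's code). Each SO_EE_ORIGIN_ZEROCOPY message on the error queue reports an inclusive range [ee_info, ee_data] of zero-copy sendmsg() calls whose pages the kernel has released; ee_code can also indicate that the kernel fell back to copying.

// Minimal sketch: drain one zero-copy completion notification from MSG_ERRQUEUE.
#include <cstdio>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/errqueue.h>

static void poll_zerocopy_completions(int fd)
{
    char control[256];
    msghdr msg = {};
    msg.msg_control = control;
    msg.msg_controllen = sizeof(control);
    if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
        return; // nothing pending
    for (cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
    {
        if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR)
        {
            sock_extended_err *serr = (sock_extended_err*)CMSG_DATA(cm);
            if (serr->ee_errno == 0 && serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
                printf("sends %u..%u completed%s\n", serr->ee_info, serr->ee_data,
                    serr->ee_code == SO_EE_CODE_ZEROCOPY_COPIED ? " (deferred copy)" : "");
        }
    }
}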
@@ -231,16 +305,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
         }
         return;
     }
-    if (result < 0 && result != -EAGAIN && result != -EINTR)
+    if (result < 0 && result != -EAGAIN && result != -EINTR && result != -ENOBUFS)
     {
         // this is a client socket, so don't panic. just disconnect it
         fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
         stop_client(cl->peer_fd);
         return;
     }
+    bool used_zerocopy = false;
     if (result >= 0)
     {
+        used_zerocopy = (cl->write_msg.msg_flags & MSG_ZEROCOPY) ? true : false;
         int done = 0;
+        int bytes_written = result;
         while (result > 0 && done < cl->send_list.size())
         {
             iovec & iov = cl->send_list[done];
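A hedged side note on the new -ENOBUFS branch (an illustrative sketch, not a prescription from the patch; the helper name is made up): zero-copy sends charge pinned pages against the socket's optmem budget, so sendmsg() can transiently fail with ENOBUFS. Treating that like EAGAIN and retrying, or redoing the send as a plain copy, keeps the connection alive.

// Minimal sketch: recover from an optmem-exhausted zero-copy send.
#include <cerrno>
#include <sys/types.h>
#include <sys/socket.h>

#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
#endif

static ssize_t send_or_fallback(int fd, msghdr *msg)
{
    ssize_t res = sendmsg(fd, msg, MSG_NOSIGNAL | MSG_ZEROCOPY);
    if (res < 0 && errno == ENOBUFS)
        res = sendmsg(fd, msg, MSG_NOSIGNAL); // nothing was queued, so retry as a copying send
    return res;
}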
@@ -249,7 +326,19 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
             if (cl->outbox[done].flags & MSGR_SENDP_FREE)
             {
                 // Reply fully sent
-                delete cl->outbox[done].op;
+                if (!used_zerocopy)
+                {
+                    delete cl->outbox[done].op;
+                }
+                else
+                {
+                    // With zero-copy send the difference is that we must keep the buffer (i.e. the operation)
+                    // allocated until we get send notification from MSG_ERRQUEUE
+                    cl->zerocopy_sent.push_back((msgr_zc_not_t){
+                        .op = cl->outbox[done].op,
+                        .nsend = cl->zerocopy_notification_idx,
+                    });
+                }
             }
             result -= iov.iov_len;
             done++;

@@ -261,6 +350,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
                 break;
             }
         }
+        if (used_zerocopy)
+        {
+            cl->zerocopy_notification_idx++;
+        }
+        cl->outbox_size -= bytes_written;
         if (done > 0)
         {
             cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+done);

@@ -270,8 +364,10 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
         {
             cl->send_list.insert(cl->send_list.end(), cl->next_send_list.begin(), cl->next_send_list.end());
             cl->outbox.insert(cl->outbox.end(), cl->next_outbox.begin(), cl->next_outbox.end());
+            cl->outbox_size += cl->next_outbox_size;
             cl->next_send_list.clear();
             cl->next_outbox.clear();
+            cl->next_outbox_size = 0;
         }
         cl->write_state = cl->outbox.size() > 0 ? CL_WRITE_READY : 0;
 #ifdef WITH_RDMA

@@ -284,14 +380,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
                 fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
             }
             cl->peer_state = PEER_RDMA;
-            tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
-            {
-                // Do not miss the disconnection!
-                if (epoll_events & EPOLLRDHUP)
-                {
-                    handle_peer_epoll(peer_fd, epoll_events);
-                }
-            });
+            tfd->set_fd_handler(cl->peer_fd, false, NULL);
             // Add the initial receive request
             try_recv_rdma(cl);
         }
@@ -301,4 +390,34 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
     {
         write_ready_clients.push_back(cl->peer_fd);
     }
+    if (used_zerocopy && (cl->zerocopy_notification_idx-cl->zerocopy_notification_prev) >= 16 &&
+        cl->zerocopy_sent.size() > 0)
+    {
+        cl->zerocopy_notification_prev = cl->zerocopy_notification_idx;
+        cl->zerocopy_notification_msg = {
+            .msg_control = cl->zerocopy_notification_buf,
+            .msg_controllen = sizeof(cl->zerocopy_notification_buf),
+        };
+        cl->refs++;
+        io_uring_sqe* sqe = NULL;
+        if (ringloop && !use_sync_send_recv)
+        {
+            sqe = ringloop->get_sqe();
+        }
+        if (!sqe)
+        {
+            int res = recvmsg(cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE|MSG_DONTWAIT);
+            if (res < 0)
+            {
+                res = -errno;
+            }
+            handle_zerocopy_notification(cl, res);
+        }
+        else
+        {
+            ring_data_t* data = ((ring_data_t*)sqe->user_data);
+            data->callback = [this, cl](ring_data_t *data) { handle_zerocopy_notification(cl, data->res); };
+            my_uring_prep_recvmsg(sqe, cl->peer_fd, &cl->zerocopy_notification_msg, MSG_ERRQUEUE);
+        }
+    }
 }
@@ -225,7 +225,7 @@ public:
         cfg = obj;
     }
     // Create client
-    ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    ringloop = new ring_loop_t(512);
     epmgr = new epoll_manager_t(ringloop);
     cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
     if (!inode)

@@ -124,7 +124,7 @@ void nfs_proxy_t::run(json11::Json cfg)
         cfg = obj;
     }
     // Create client
-    ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    ringloop = new ring_loop_t(512);
     epmgr = new epoll_manager_t(ringloop);
     cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
     cmd = new cli_tool_t();

@@ -541,15 +541,11 @@ void osd_t::print_slow()
             }
             else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
             {
-                for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
+                for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
                 {
                     obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
                     bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
                 }
-                if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
-                {
-                    bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
-                }
             }
             else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
             {
@@ -19,14 +19,6 @@ static void handle_sigint(int sig)
     exit(0);
 }
 
-static const char* help_text =
-    "Vitastor OSD (block object storage daemon) " VERSION "\n"
-    "(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
-    "\n"
-    "OSDs are usually started by vitastor-disk.\n"
-    "Manual usage: vitastor-osd [--option value] ...\n"
-;
-
 int main(int narg, char *args[])
 {
     setvbuf(stdout, NULL, _IONBF, 0);

@@ -45,20 +37,10 @@ int main(int narg, char *args[])
             char *opt = args[i]+2;
             config[std::string(opt)] = std::string(args[++i]);
         }
-        else if (!strcmp(args[i], "--help"))
-        {
-            printf("%s", help_text);
-            return 0;
-        }
-    }
-    if (!config.size())
-    {
-        printf("%s", help_text);
-        return 1;
     }
     signal(SIGINT, handle_sigint);
     signal(SIGTERM, handle_sigint);
-    ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    ring_loop_t *ringloop = new ring_loop_t(512);
     osd = new osd_t(config, ringloop);
     while (1)
     {
@@ -17,7 +17,7 @@ ring_loop_t::ring_loop_t(int qd)
     {
         throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
     }
-    free_ring_data_ptr = *ring.sq.kring_entries;
+    free_ring_data_ptr = *ring.cq.kring_entries;
    ring_datas = (struct ring_data_t*)calloc(free_ring_data_ptr, sizeof(ring_data_t));
    free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
    if (!ring_datas || !free_ring_data)
@@ -15,8 +15,6 @@
 #include <functional>
 #include <vector>
 
-#define RINGLOOP_DEFAULT_SIZE 1024
-
 static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
 {
     // Prepare a read/write operation without clearing user_data

@@ -141,9 +139,11 @@ public:
         if (free_ring_data_ptr == 0)
             return NULL;
         struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
-        assert(sqe);
-        *sqe = { 0 };
-        io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
+        if (sqe)
+        {
+            *sqe = { 0 };
+            io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
+        }
         return sqe;
     }
     inline void set_immediate(const std::function<void()> cb)
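A hedged usage sketch of the revised behaviour (written against plain liburing rather than the project's wrapper, so the names differ): once get_sqe() can return NULL instead of asserting, callers need an explicit fallback, which is what the zero-copy notification code above does by dropping to a synchronous recvmsg().

// Minimal sketch: fall back to a synchronous call when the submission queue is full.
#include <liburing.h>
#include <sys/socket.h>

static void queue_errqueue_read(io_uring *ring, int fd, msghdr *msg)
{
    io_uring_sqe *sqe = io_uring_get_sqe(ring); // NULL when no SQE is available
    if (sqe)
    {
        io_uring_prep_recvmsg(sqe, fd, msg, MSG_ERRQUEUE);
        io_uring_submit(ring);
    }
    else
    {
        recvmsg(fd, msg, MSG_ERRQUEUE | MSG_DONTWAIT); // degrade gracefully instead of asserting
    }
}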
@@ -30,7 +30,7 @@ void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
 int main(int narg, char *args[])
 {
     ring_consumer_t looper;
-    ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    ring_loop_t *ringloop = new ring_loop_t(512);
     epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
     osd_messenger_t *msgr = new osd_messenger_t();
     msgr->osd_num = 1351;

@@ -11,7 +11,7 @@ int main(int narg, char *args[])
     config["meta_device"] = "./test_meta.bin";
     config["journal_device"] = "./test_journal.bin";
     config["data_device"] = "./test_data.bin";
-    ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    ring_loop_t *ringloop = new ring_loop_t(512);
     epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
     blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);
 

@@ -68,7 +68,7 @@ int main(int narg, char *args[])
         | cfg["inode_id"].uint64_value();
     uint64_t base_ver = 0;
     // Create client
-    auto ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+    auto ringloop = new ring_loop_t(512);
     auto epmgr = new epoll_manager_t(ringloop);
     auto cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
     cli->on_ready([&]()

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
 
 Name: Vitastor
 Description: Vitastor client library
-Version: 1.2.0
+Version: 1.1.0
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}
 

@@ -114,7 +114,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
     ring_loop_t *ringloop = NULL;
     try
     {
-        ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+        ringloop = new ring_loop_t(512);
     }
     catch (std::exception & e)
     {

@@ -136,7 +136,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
     ring_loop_t *ringloop = NULL;
     try
     {
-        ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+        ringloop = new ring_loop_t(512);
     }
     catch (std::exception & e)
     {

@@ -167,7 +167,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
     ring_loop_t *ringloop = NULL;
     try
     {
-        ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
+        ringloop = new ring_loop_t(512);
     }
     catch (std::exception & e)
     {
|
Reference in New Issue
Block a user