
Compare commits


1 commit

Author: Vitaliy Filippov
SHA1: 043ed854f3
Date: 2022-02-02 01:40:29 +03:00

Support RDMA devices without Implicit ODP using mlockall()

UPD: Seems it won't work because ibv_reg_mr() takes a permissions argument
and doesn't allow more permissions than allowed by the kernel for memory
mappings. So the only way to register all memory is probably to iterate
over /proc/PID/maps... :)

Mellanox docs mention that older MLNX_OFED emulated ODP, so maybe it's
still possible to use it, but it's not confirmed.
2 changed files with 35 additions and 19 deletions
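
A note on the commit message above: it speculates that, without Implicit ODP, the only way to cover all memory may be to iterate over /proc/PID/maps and register each mapping separately. Purely as an illustration of that idea (this code is not part of the commit; the function name and the permission filter are assumptions), such a walk over the process's own mappings could look like this:

// Hypothetical sketch: register every private read+write mapping of the current
// process with ibv_reg_mr(). Real code would also need to track mappings created
// later and re-register when the address space changes.
#include <cstdint>
#include <cstdio>
#include <vector>
#include <infiniband/verbs.h>

static std::vector<ibv_mr*> register_all_mappings(ibv_pd *pd)
{
    std::vector<ibv_mr*> mrs;
    FILE *f = fopen("/proc/self/maps", "r");
    if (!f)
        return mrs;
    char line[512];
    while (fgets(line, sizeof(line), f))
    {
        // Lines look like: 7f1c2a000000-7f1c2a021000 rw-p 00000000 00:00 0 [heap]
        unsigned long long start = 0, end = 0;
        char perms[8] = {};
        if (sscanf(line, "%llx-%llx %7s", &start, &end, perms) != 3)
            continue;
        // ibv_reg_mr() can't grant more access than the mapping itself allows,
        // so only register regions that are at least readable and writable
        if (perms[0] != 'r' || perms[1] != 'w')
            continue;
        ibv_mr *mr = ibv_reg_mr(pd, (void*)(uintptr_t)start, (size_t)(end-start), IBV_ACCESS_LOCAL_WRITE);
        if (mr)
            mrs.push_back(mr);
    }
    fclose(f);
    return mrs;
}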


@@ -48,19 +48,28 @@
   type: string
   info: |
     RDMA device name to use for Vitastor OSD communications (for example,
-    "rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
-    Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
-    to work. For example, Mellanox ConnectX-3 and older adapters don't have
-    Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
-    root to list available RDMA devices and their features.
+    "rocep5s0f0"). Please note that if your RDMA device doesn't support
+    Implicit ODP (Implicit On-Demand Paging) then all Vitastor OSDs and clients
+    will have to use mlockall() to lock all application memory to use RDMA.
+    In case of the native Vitastor QEMU driver with RDMA, all virtual machine
+    memory will be locked if your RDMA device doesn't support Implicit ODP.
+    Notably, Mellanox ConnectX-3 and older adapters don't support Implicit ODP,
+    while ConnectX-4 and newer do. Run `ibv_devinfo -v` as root to list
+    available RDMA devices and their features.
   info_ru: |
     Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
-    Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
-    Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
-    адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
-    потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
-    суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
-    параметры и возможности.
+    Имейте в виду, что если ваше устройство не поддерживает Implicit ODP
+    (Implicit On-Demand Paging), то все OSD и клиенты Vitastor будут вынуждены
+    блокировать всю память приложения с помощью mlockall(), чтобы задействовать
+    RDMA. В случае нативного QEMU-драйвера это будет означать, что при
+    использовании RDMA на устройстве без поддержки Implicit ODP блокироваться
+    от выгрузки будет вся память виртуальных машин.
+    В случае с адаптерами Mellanox Implicit ODP поддерживается начиная с
+    ConnectX-4. ConnectX-3 и более старые адаптеры не поддерживают Implicit ODP.
+    Чтобы посмотреть список своих RDMA-устройств и их возможностей, запустите
+    `ibv_devinfo -v` от имени суперпользователя.
 - name: rdma_port_num
   type: int
   default: 1
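
For reference, the documentation above points users at `ibv_devinfo -v`; the same Implicit ODP capability bits can also be checked programmatically with libibverbs. This is a standalone sketch, not part of the commit: it only inspects the first device and skips port selection and detailed error reporting.

// Minimal check for Implicit ODP support on the first RDMA device
#include <cstdio>
#include <infiniband/verbs.h>

int main()
{
    int num = 0;
    ibv_device **devs = ibv_get_device_list(&num);
    if (!devs || !num)
    {
        fprintf(stderr, "No RDMA devices found\n");
        return 1;
    }
    int ret = 1;
    ibv_context *ctx = ibv_open_device(devs[0]);
    ibv_device_attr_ex attrx = {};
    if (ctx && ibv_query_device_ex(ctx, NULL, &attrx) == 0)
    {
        // Same capability bits the OSD code below tests, plus the explicit
        // "implicit ODP" flag from libibverbs
        bool implicit_odp =
            (attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) &&
            (attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) &&
            (attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) &&
            (attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV);
        printf("%s: Implicit ODP %s\n", ibv_get_device_name(devs[0]),
            implicit_odp ? "supported" : "NOT supported, mlockall() fallback would be used");
        ret = 0;
    }
    if (ctx)
        ibv_close_device(ctx);
    ibv_free_device_list(devs);
    return ret;
}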


@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/mman.h>
 #include "msgr_rdma.h"
 #include "messenger.h"
@@ -54,6 +55,7 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
 msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
 {
     int res;
+    bool odp = true;
     ibv_device **dev_list = NULL;
     msgr_rdma_context_t *ctx = new msgr_rdma_context_t();
     ctx->mtu = mtu;
@@ -117,9 +119,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
         fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
         goto cleanup;
     }
-    if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
+    if ((res = ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid)) != 0)
     {
-        fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
+        fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(ctx->dev), gid_index, strerror(res));
         goto cleanup;
     }
@@ -131,9 +133,9 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
     }
     {
-        if (ibv_query_device_ex(ctx->context, NULL, &ctx->attrx))
+        if ((res = ibv_query_device_ex(ctx->context, NULL, &ctx->attrx)) != 0)
         {
-            fprintf(stderr, "Couldn't query RDMA device for its features\n");
+            fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(ctx->dev), strerror(res));
             goto cleanup;
         }
         if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
@@ -141,15 +143,20 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
             !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
             !(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
         {
-            fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
-            goto cleanup;
+            fprintf(stderr, "Warning: RDMA device isn't implicit ODP (On-Demand Paging) capable, trying to lock all application memory\n");
+            if (mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT) != 0)
+            {
+                fprintf(stderr, "mlockall() failed: %s\n", strerror(errno));
+                goto cleanup;
+            }
+            odp = false;
         }
     }
-    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
+    ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | (odp ? IBV_ACCESS_ON_DEMAND : 0));
     if (!ctx->mr)
     {
-        fprintf(stderr, "Couldn't register RDMA memory region\n");
+        fprintf(stderr, "Couldn't register RDMA memory region: %s\n", strerror(errno));
         goto cleanup;
     }