forked from vitalif/vitastor
Implement RDMA receive with memory copying (send remains zero-copy)
This is the simplest and, as usual, the best implementation :) A 100% zero-copy implementation is also possible (see the rdma-zerocopy branch), but it requires creating A LOT of queues (~128 per client) to use the QPN as a 'tag' because of the lack of receive tags, and the server may simply run out of queues. The hardware limit is 262144 on Mellanox ConnectX-4, which amounts to only 2048 'connections' per host. And even with that number of queues it's still less optimal than the non-zero-copy one. In fact, the newest hardware like Mellanox ConnectX-5 does have Tag Matching support, but it's still unsuitable for us because it doesn't support scatter/gather (tm_caps.max_sge=1).
allow-etcd-address-option
parent
9e6cbc6ebc
commit
971aa4ae4f
|
@ -49,6 +49,7 @@ Vitastor на данный момент находится в статусе п
|
||||||
- Именование инодов через хранение их метаданных в etcd
|
- Именование инодов через хранение их метаданных в etcd
|
||||||
- Снапшоты и copy-on-write клоны
|
- Снапшоты и copy-on-write клоны
|
||||||
- Сглаживание производительности случайной записи в SSD+HDD конфигурациях
|
- Сглаживание производительности случайной записи в SSD+HDD конфигурациях
|
||||||
|
- Поддержка RDMA/RoCEv2 через libibverbs
|
||||||
|
|
||||||
## Планы развития
|
## Планы развития
|
||||||
|
|
||||||
|
@ -60,7 +61,7 @@ Vitastor на данный момент находится в статусе п
|
||||||
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
- Фоновая проверка целостности без контрольных сумм (сверка реплик)
|
||||||
- Контрольные суммы
|
- Контрольные суммы
|
||||||
- Поддержка SSD-кэширования (tiered storage)
|
- Поддержка SSD-кэширования (tiered storage)
|
||||||
- Поддержка RDMA и NVDIMM
|
- Поддержка NVDIMM
|
||||||
- Web-интерфейс
|
- Web-интерфейс
|
||||||
- Возможно, сжатие
|
- Возможно, сжатие
|
||||||
- Возможно, поддержка кэширования данных через системный page cache
|
- Возможно, поддержка кэширования данных через системный page cache
|
||||||
|
|
|
@ -43,6 +43,7 @@ breaking changes in the future. However, the following is implemented:
|
||||||
- Inode metadata storage in etcd
|
- Inode metadata storage in etcd
|
||||||
- Snapshots and copy-on-write image clones
|
- Snapshots and copy-on-write image clones
|
||||||
- Write throttling to smooth random write workloads in SSD+HDD configurations
|
- Write throttling to smooth random write workloads in SSD+HDD configurations
|
||||||
|
- RDMA/RoCEv2 support via libibverbs
|
||||||
|
|
||||||
## Roadmap
|
## Roadmap
|
||||||
|
|
||||||
|
@ -54,7 +55,7 @@ breaking changes in the future. However, the following is implemented:
|
||||||
- Scrubbing without checksums (verification of replicas)
|
- Scrubbing without checksums (verification of replicas)
|
||||||
- Checksums
|
- Checksums
|
||||||
- Tiered storage
|
- Tiered storage
|
||||||
- RDMA and NVDIMM support
|
- NVDIMM support
|
||||||
- Web GUI
|
- Web GUI
|
||||||
- Compression (possibly)
|
- Compression (possibly)
|
||||||
- Read caching using system page cache (possibly)
|
- Read caching using system page cache (possibly)
|
||||||
|
|
|
@ -139,9 +139,6 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||||
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
|
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
|
||||||
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
|
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
|
||||||
#endif
|
#endif
|
||||||
this->bs_bitmap_granularity = strtoull(config["bitmap_granularity"].string_value().c_str(), NULL, 10);
|
|
||||||
if (!this->bs_bitmap_granularity)
|
|
||||||
this->bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
|
|
||||||
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
this->use_sync_send_recv = config["use_sync_send_recv"].bool_value() ||
|
||||||
config["use_sync_send_recv"].uint64_value();
|
config["use_sync_send_recv"].uint64_value();
|
||||||
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
this->peer_connect_interval = config["peer_connect_interval"].uint64_value();
|
||||||
|
|
|
@ -128,7 +128,6 @@ protected:
|
||||||
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
|
int peer_connect_timeout = DEFAULT_PEER_CONNECT_TIMEOUT;
|
||||||
int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
int osd_idle_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
||||||
int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
int osd_ping_timeout = DEFAULT_OSD_PING_TIMEOUT;
|
||||||
uint32_t bs_bitmap_granularity = 0;
|
|
||||||
int log_level = 0;
|
int log_level = 0;
|
||||||
bool use_sync_send_recv = false;
|
bool use_sync_send_recv = false;
|
||||||
|
|
||||||
|
@ -137,6 +136,7 @@ protected:
|
||||||
std::string rdma_device;
|
std::string rdma_device;
|
||||||
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
|
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
|
||||||
msgr_rdma_context_t *rdma_context = NULL;
|
msgr_rdma_context_t *rdma_context = NULL;
|
||||||
|
// FIXME: Allow to configure these options
|
||||||
uint64_t rdma_max_sge = 128, rdma_max_send = 32, rdma_max_recv = 32;
|
uint64_t rdma_max_sge = 128, rdma_max_send = 32, rdma_max_recv = 32;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -189,6 +189,7 @@ protected:
|
||||||
void handle_send(int result, osd_client_t *cl);
|
void handle_send(int result, osd_client_t *cl);
|
||||||
|
|
||||||
bool handle_read(int result, osd_client_t *cl);
|
bool handle_read(int result, osd_client_t *cl);
|
||||||
|
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
||||||
bool handle_finished_read(osd_client_t *cl);
|
bool handle_finished_read(osd_client_t *cl);
|
||||||
void handle_op_hdr(osd_client_t *cl);
|
void handle_op_hdr(osd_client_t *cl);
|
||||||
bool handle_reply_hdr(osd_client_t *cl);
|
bool handle_reply_hdr(osd_client_t *cl);
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include "msgr_rdma.h"
|
#include "msgr_rdma.h"
|
||||||
|
@ -355,45 +358,23 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||||
// Only send one batch at a time
|
// Only send one batch at a time
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
int op_size = 0, op_sge = 0, op_max = rc->max_sge*bs_bitmap_granularity;
|
uint64_t op_size = 0, op_sge = 0;
|
||||||
ibv_sge sge[rc->max_sge];
|
ibv_sge sge[rc->max_sge];
|
||||||
while (rc->send_pos < cl->send_list.size())
|
while (rc->send_pos < cl->send_list.size())
|
||||||
{
|
{
|
||||||
iovec & iov = cl->send_list[rc->send_pos];
|
iovec & iov = cl->send_list[rc->send_pos];
|
||||||
if (cl->outbox[rc->send_pos].flags & MSGR_SENDP_HDR)
|
if (op_size >= RDMA_MAX_MSG || op_sge >= rc->max_sge)
|
||||||
{
|
|
||||||
if (op_sge > 0)
|
|
||||||
{
|
{
|
||||||
try_send_rdma_wr(cl, sge, op_sge);
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
op_sge = 0;
|
op_sge = 0;
|
||||||
op_size = 0;
|
op_size = 0;
|
||||||
if (rc->cur_send >= rc->max_send)
|
if (rc->cur_send >= rc->max_send)
|
||||||
break;
|
|
||||||
}
|
|
||||||
assert(rc->send_buf_pos == 0);
|
|
||||||
sge[0] = {
|
|
||||||
.addr = (uintptr_t)iov.iov_base,
|
|
||||||
.length = (uint32_t)iov.iov_len,
|
|
||||||
.lkey = rc->ctx->mr->lkey,
|
|
||||||
};
|
|
||||||
try_send_rdma_wr(cl, sge, 1);
|
|
||||||
rc->send_pos++;
|
|
||||||
if (rc->cur_send >= rc->max_send)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
if (op_size >= op_max || op_sge >= rc->max_sge)
|
|
||||||
{
|
|
||||||
try_send_rdma_wr(cl, sge, op_sge);
|
|
||||||
op_sge = 0;
|
|
||||||
op_size = 0;
|
|
||||||
if (rc->cur_send >= rc->max_send)
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Fragment all messages into parts no longer than (max_sge*4k) = 120k on ConnectX-4
|
}
|
||||||
// Otherwise the client may not be able to receive them in small parts
|
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < RDMA_MAX_MSG
|
||||||
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < op_max ? iov.iov_len-rc->send_buf_pos : op_max-op_size);
|
? iov.iov_len-rc->send_buf_pos : RDMA_MAX_MSG-op_size);
|
||||||
sge[op_sge++] = {
|
sge[op_sge++] = {
|
||||||
.addr = (uintptr_t)(iov.iov_base+rc->send_buf_pos),
|
.addr = (uintptr_t)(iov.iov_base+rc->send_buf_pos),
|
||||||
.length = len,
|
.length = len,
|
||||||
|
@ -407,7 +388,6 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
|
||||||
rc->send_buf_pos = 0;
|
rc->send_buf_pos = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
if (op_sge > 0)
|
if (op_sge > 0)
|
||||||
{
|
{
|
||||||
try_send_rdma_wr(cl, sge, op_sge);
|
try_send_rdma_wr(cl, sge, op_sge);
|
||||||
|
@ -435,52 +415,16 @@ static void try_recv_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
|
||||||
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
|
||||||
{
|
{
|
||||||
auto rc = cl->rdma_conn;
|
auto rc = cl->rdma_conn;
|
||||||
if (rc->cur_recv > 0)
|
while (rc->cur_recv < rc->max_recv)
|
||||||
{
|
{
|
||||||
return true;
|
void *buf = malloc_or_die(RDMA_MAX_MSG);
|
||||||
}
|
rc->recv_buffers.push_back(buf);
|
||||||
if (!cl->recv_list.get_size())
|
ibv_sge sge = {
|
||||||
{
|
.addr = (uintptr_t)buf,
|
||||||
cl->recv_list.reset();
|
.length = RDMA_MAX_MSG,
|
||||||
cl->read_op = new osd_op_t;
|
|
||||||
cl->read_op->peer_fd = cl->peer_fd;
|
|
||||||
cl->read_op->op_type = OSD_OP_IN;
|
|
||||||
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
|
|
||||||
cl->read_remaining = OSD_PACKET_SIZE;
|
|
||||||
cl->read_state = CL_READ_HDR;
|
|
||||||
}
|
|
||||||
int op_size = 0, op_sge = 0, op_max = rc->max_sge*bs_bitmap_granularity;
|
|
||||||
iovec *segments = cl->recv_list.get_iovec();
|
|
||||||
ibv_sge sge[rc->max_sge];
|
|
||||||
while (rc->recv_pos < cl->recv_list.get_size())
|
|
||||||
{
|
|
||||||
iovec & iov = segments[rc->recv_pos];
|
|
||||||
if (op_size >= op_max || op_sge >= rc->max_sge)
|
|
||||||
{
|
|
||||||
try_recv_rdma_wr(cl, sge, op_sge);
|
|
||||||
op_sge = 0;
|
|
||||||
op_size = 0;
|
|
||||||
if (rc->cur_recv >= rc->max_recv)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// Receive in identical (max_sge*4k) fragments
|
|
||||||
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->recv_buf_pos < op_max ? iov.iov_len-rc->recv_buf_pos : op_max-op_size);
|
|
||||||
sge[op_sge++] = {
|
|
||||||
.addr = (uintptr_t)(iov.iov_base+rc->recv_buf_pos),
|
|
||||||
.length = len,
|
|
||||||
.lkey = rc->ctx->mr->lkey,
|
.lkey = rc->ctx->mr->lkey,
|
||||||
};
|
};
|
||||||
op_size += len;
|
try_recv_rdma_wr(cl, &sge, 1);
|
||||||
rc->recv_buf_pos += len;
|
|
||||||
if (rc->recv_buf_pos >= iov.iov_len)
|
|
||||||
{
|
|
||||||
rc->recv_pos++;
|
|
||||||
rc->recv_buf_pos = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (op_sge > 0)
|
|
||||||
{
|
|
||||||
try_recv_rdma_wr(cl, sge, op_sge);
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -531,25 +475,11 @@ void osd_messenger_t::handle_rdma_events()
|
||||||
if (!is_send)
|
if (!is_send)
|
||||||
{
|
{
|
||||||
cl->rdma_conn->cur_recv--;
|
cl->rdma_conn->cur_recv--;
|
||||||
if (!cl->rdma_conn->cur_recv)
|
handle_read_buffer(cl, cl->rdma_conn->recv_buffers[0], wc[i].byte_len);
|
||||||
{
|
free(cl->rdma_conn->recv_buffers[0]);
|
||||||
cl->recv_list.done += cl->rdma_conn->recv_pos;
|
cl->rdma_conn->recv_buffers.erase(cl->rdma_conn->recv_buffers.begin(), cl->rdma_conn->recv_buffers.begin()+1);
|
||||||
cl->rdma_conn->recv_pos = 0;
|
|
||||||
if (!cl->recv_list.get_size())
|
|
||||||
{
|
|
||||||
cl->read_remaining = 0;
|
|
||||||
if (handle_finished_read(cl))
|
|
||||||
{
|
|
||||||
try_recv_rdma(cl);
|
try_recv_rdma(cl);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Continue to receive data
|
|
||||||
try_recv_rdma(cl);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
cl->rdma_conn->cur_send--;
|
cl->rdma_conn->cur_send--;
|
||||||
|
|
|
@ -1,8 +1,14 @@
|
||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <infiniband/verbs.h>
|
#include <infiniband/verbs.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
// FIXME: Allow to configure this option
|
||||||
|
#define RDMA_MAX_MSG 4194304
|
||||||
|
|
||||||
struct msgr_rdma_address_t
|
struct msgr_rdma_address_t
|
||||||
{
|
{
|
||||||
ibv_gid gid;
|
ibv_gid gid;
|
||||||
|
@ -46,6 +52,7 @@ struct msgr_rdma_connection_t
|
||||||
|
|
||||||
int send_pos = 0, send_buf_pos = 0;
|
int send_pos = 0, send_buf_pos = 0;
|
||||||
int recv_pos = 0, recv_buf_pos = 0;
|
int recv_pos = 0, recv_buf_pos = 0;
|
||||||
|
std::vector<void*> recv_buffers;
|
||||||
|
|
||||||
~msgr_rdma_connection_t();
|
~msgr_rdma_connection_t();
|
||||||
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge);
|
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge);
|
||||||
|
|
|
@ -91,9 +91,38 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||||
{
|
{
|
||||||
if (cl->read_iov.iov_base == cl->in_buf)
|
if (cl->read_iov.iov_base == cl->in_buf)
|
||||||
{
|
{
|
||||||
|
if (!handle_read_buffer(cl, cl->in_buf, result))
|
||||||
|
{
|
||||||
|
goto fin;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Long data
|
||||||
|
cl->read_remaining -= result;
|
||||||
|
cl->recv_list.eat(result);
|
||||||
|
if (cl->recv_list.done >= cl->recv_list.count)
|
||||||
|
{
|
||||||
|
handle_finished_read(cl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (result >= cl->read_iov.iov_len)
|
||||||
|
{
|
||||||
|
ret = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fin:
|
||||||
|
for (auto cb: set_immediate)
|
||||||
|
{
|
||||||
|
cb();
|
||||||
|
}
|
||||||
|
set_immediate.clear();
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
|
||||||
|
{
|
||||||
// Compose operation(s) from the buffer
|
// Compose operation(s) from the buffer
|
||||||
int remain = result;
|
|
||||||
void *curbuf = cl->in_buf;
|
|
||||||
while (remain > 0)
|
while (remain > 0)
|
||||||
{
|
{
|
||||||
if (!cl->read_op)
|
if (!cl->read_op)
|
||||||
|
@ -130,33 +159,11 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||||
{
|
{
|
||||||
if (!handle_finished_read(cl))
|
if (!handle_finished_read(cl))
|
||||||
{
|
{
|
||||||
goto fin;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return true;
|
||||||
else
|
|
||||||
{
|
|
||||||
// Long data
|
|
||||||
cl->read_remaining -= result;
|
|
||||||
cl->recv_list.eat(result);
|
|
||||||
if (cl->recv_list.done >= cl->recv_list.count)
|
|
||||||
{
|
|
||||||
handle_finished_read(cl);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (result >= cl->read_iov.iov_len)
|
|
||||||
{
|
|
||||||
ret = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fin:
|
|
||||||
for (auto cb: set_immediate)
|
|
||||||
{
|
|
||||||
cb();
|
|
||||||
}
|
|
||||||
set_immediate.clear();
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||||
|
|
Loading…
Reference in New Issue