Compare commits

..

3 Commits

Author SHA1 Message Date
Vitaliy Filippov 4f42302d1a Add an alternative RDMA implementation via RDMA-CM
Required for non-RoCE cards: iWARP and, possibly, Infiniband
2025-03-31 16:23:18 +03:00
Vitaliy Filippov a0aacba6cd Support multiple RDMA networks 2025-03-31 16:23:18 +03:00
Vitaliy Filippov 0bbfc2ab25 Support multiple OSD networks and separate OSD cluster network 2025-03-31 16:17:37 +03:00
4 changed files with 37 additions and 22 deletions

View File

@ -124,7 +124,9 @@ void osd_messenger_t::init()
rdmacm_evch = rdma_create_event_channel();
if (!rdmacm_evch)
{
fprintf(stderr, "Failed to initialize RDMA-CM event channel: %s (code %d)\n", strerror(errno), errno);
// ENODEV means that the client doesn't have RDMA devices available
if (errno != ENODEV || log_level > 0)
fprintf(stderr, "Failed to initialize RDMA-CM event channel: %s (code %d)\n", strerror(errno), errno);
}
else
{
@ -417,11 +419,10 @@ void osd_messenger_t::connect_peer(uint64_t peer_osd, json11::Json peer_state)
address_list.push_back(json_addr);
}
}
auto n_cluster = this->osd_num ? cluster_address_list.size() : 0;
if (this->osd_num)
address_list.insert(address_list.begin(), cluster_address_list.begin(), cluster_address_list.end());
// FIXME: Maybe support optional fallback from cluster to public network?
if (this->osd_num && cluster_address_list.size() > 0)
address_list = cluster_address_list;
wanted_peers[peer_osd].address_list = address_list;
wanted_peers[peer_osd].n_cluster_addr = n_cluster;
}
else
wanted_peers[peer_osd].address_list = peer_state["addresses"];

View File

@ -97,7 +97,6 @@ struct osd_wanted_peer_t
json11::Json raw_address_list;
json11::Json address_list;
bool peer_rdmacm = false;
int n_cluster_addr = 0;
int port = 0;
time_t last_connect_attempt = 0;
bool connecting = false, address_changed = false;

View File

@ -6,6 +6,32 @@
#include "msgr_rdma.h"
#include "messenger.h"
// Translate an ibv_mtu enumerator into the matching MTU size in bytes.
// Any value other than the enumerators tested below yields 4096.
static uint32_t ibv_mtu_to_bytes(ibv_mtu mtu)
{
    // Start from the fallback; IBV_MTU_4096 maps to this same value
    uint32_t bytes = 4096;
    if (mtu == IBV_MTU_256)
        bytes = 256;
    else if (mtu == IBV_MTU_512)
        bytes = 512;
    else if (mtu == IBV_MTU_1024)
        bytes = 1024;
    else if (mtu == IBV_MTU_2048)
        bytes = 2048;
    return bytes;
}
// Translate an MTU size in bytes into the matching ibv_mtu enumerator.
// Only exact matches (256, 512, 1024, 2048, 4096) are recognized;
// every other value, including 0, yields IBV_MTU_4096.
static ibv_mtu bytes_to_ibv_mtu(uint32_t mtu)
{
    if (mtu == 256)
        return IBV_MTU_256;
    if (mtu == 512)
        return IBV_MTU_512;
    if (mtu == 1024)
        return IBV_MTU_1024;
    if (mtu == 2048)
        return IBV_MTU_2048;
    // 4096 and all unrecognized sizes end up here
    return IBV_MTU_4096;
}
std::string msgr_rdma_address_t::to_string()
{
char msg[sizeof "0000:00000000:00000000:00000000000000000000000000000000"];
@ -235,7 +261,7 @@ std::vector<msgr_rdma_context_t*> msgr_rdma_context_t::create_all(const std::vec
fprintf(stderr, "RDMA device %s port %d GID %d does not exist\n", ibv_get_device_name(dev), port_num, sel_gid_index);
continue;
}
uint32_t port_mtu = sel_mtu ? sel_mtu : portinfo.active_mtu;
uint32_t port_mtu = sel_mtu ? sel_mtu : ibv_mtu_to_bytes(portinfo.active_mtu);
#ifdef IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
if (sel_gid_index < 0)
{
@ -443,25 +469,12 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
return conn;
}
// Look up the ibv_mtu enumerator for an MTU given in bytes.
// Sizes that are not listed in the table map to IBV_MTU_4096.
static ibv_mtu mtu_to_ibv_mtu(uint32_t mtu)
{
    static const struct
    {
        uint32_t bytes;
        ibv_mtu code;
    } known_sizes[] = {
        { 256, IBV_MTU_256 },
        { 512, IBV_MTU_512 },
        { 1024, IBV_MTU_1024 },
        { 2048, IBV_MTU_2048 },
        { 4096, IBV_MTU_4096 },
    };
    for (const auto & entry: known_sizes)
    {
        if (entry.bytes == mtu)
            return entry.code;
    }
    return IBV_MTU_4096;
}
int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
{
auto conn = this;
ibv_qp_attr attr = {
.qp_state = IBV_QPS_RTR,
.path_mtu = mtu_to_ibv_mtu(conn->ctx->mtu),
.path_mtu = bytes_to_ibv_mtu(conn->ctx->mtu),
.rq_psn = dest->psn,
.sq_psn = conn->addr.psn,
.dest_qp_num = dest->qpn,

View File

@ -397,7 +397,9 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::stri
if (rdma_resolve_addr(cmid, NULL, (sockaddr*)&conn->parsed_addr, conn->timeout_ms) != 0)
{
auto res = -errno;
fprintf(stderr, "Failed to resolve address %s via RDMA-CM: %s (code %d)\n", addr.c_str(), strerror(errno), errno);
// ENODEV means that the client doesn't have an RDMA device for this address
if (res != -ENODEV || log_level > 0)
fprintf(stderr, "Failed to resolve address %s via RDMA-CM: %s (code %d)\n", addr.c_str(), strerror(errno), errno);
rdmacm_on_connect_peer_error(cmid, res);
return;
}