Compare commits
1 Commits
d0a20b3f7a
...
29498c9a9e
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | 29498c9a9e |
|
@ -61,6 +61,10 @@ pkg_check_modules(ISAL libisal)
|
||||||
if (ISAL_LIBRARIES)
|
if (ISAL_LIBRARIES)
|
||||||
add_definitions(-DWITH_ISAL)
|
add_definitions(-DWITH_ISAL)
|
||||||
endif (ISAL_LIBRARIES)
|
endif (ISAL_LIBRARIES)
|
||||||
|
pkg_check_modules(RDMACM librdmacm)
|
||||||
|
if (RDMACM_LIBRARIES)
|
||||||
|
add_definitions(-DWITH_RDMACM)
|
||||||
|
endif (RDMACM_LIBRARIES)
|
||||||
|
|
||||||
add_custom_target(build_tests)
|
add_custom_target(build_tests)
|
||||||
add_custom_target(test
|
add_custom_target(test
|
||||||
|
|
|
@ -0,0 +1,279 @@
|
||||||
|
// Copyright (c) Vitaliy Filippov, 2019+
|
||||||
|
// License: VNPL-1.1 (see README.md for details)
|
||||||
|
//
|
||||||
|
// NFS RDMA support
|
||||||
|
|
||||||
|
#define _XOPEN_SOURCE
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
|
#include <netinet/tcp.h>
|
||||||
|
#include <sys/epoll.h>
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <signal.h>
|
||||||
|
|
||||||
|
#include "proto/nfs.h"
|
||||||
|
#include "proto/rpc.h"
|
||||||
|
#include "proto/portmap.h"
|
||||||
|
|
||||||
|
#include "addr_util.h"
|
||||||
|
#include "str_util.h"
|
||||||
|
#include "json_util.h"
|
||||||
|
#include "nfs_proxy.h"
|
||||||
|
#include "nfs_kv.h"
|
||||||
|
#include "nfs_block.h"
|
||||||
|
#include "nfs_common.h"
|
||||||
|
#include "http_client.h"
|
||||||
|
#include "cli.h"
|
||||||
|
|
||||||
|
#define NFS_RDMACM_PRIVATE_DATA_MAGIC_LE 0x180eabf6
|
||||||
|
|
||||||
|
// RDMA-CM private data exchanged by RPC-over-RDMA version 1 peers during
// connection setup (RFC 8797). The struct mirrors the wire format byte for byte,
// which is why it is packed and why its size is checked below.
struct __attribute__((__packed__)) nfs_rdmacm_private
{
    // Magic. On the wire it is 0xf6ab0e18 in big endian, so a little-endian host
    // reads it as NFS_RDMACM_PRIVATE_DATA_MAGIC_LE (0x180eabf6)
    uint32_t format_identifier;
    // Format version, 1
    uint8_t version;
    // Flags byte. Only the lowest bit is defined: remote invalidation support (1 or 0).
    // The other bits are reserved, so readers should mask them off
    uint8_t remote_invalidate;
    // Maximum RDMA Send operation size / 1024 - 1 (i.e. 0 is 1 KB, 255 is 256 KB)
    uint8_t max_send_size;
    // Maximum RDMA Receive operation size / 1024 - 1 (i.e. 0 is 1 KB, 255 is 256 KB)
    uint8_t max_recv_size;
};

static_assert(sizeof(nfs_rdmacm_private) == 8, "nfs_rdmacm_private must match the 8-byte RFC 8797 wire format");
|
||||||
|
|
||||||
|
struct nfs_rdma_context_t
|
||||||
|
{
|
||||||
|
int max_send = 8, max_recv = 8;
|
||||||
|
int max_cqe = 0, used_max_cqe = 0;
|
||||||
|
rdma_event_channel *evch = NULL;
|
||||||
|
rdma_cm_id *cmid = NULL;
|
||||||
|
struct sockaddr_storage addr;
|
||||||
|
ibv_comp_channel *channel = NULL;
|
||||||
|
ibv_cq *cq = NULL;
|
||||||
|
};
|
||||||
|
|
||||||
|
// State of a single accepted RDMA connection.
struct nfs_rdma_conn_t
{
    int max_send = 8, max_recv = 8;
    // RDMA-CM ID of the connection; it's also the key in rdma_connections
    struct rdma_cm_id *id = nullptr;
    // Set when RDMA_CM_EVENT_ESTABLISHED is received for this connection
    bool established = false;
    // Values taken from the peer's RDMA-CM private data (RFC 8797).
    // 1 KB is the default RPC-over-RDMA inline threshold (RFC 8166) which applies
    // when the peer sends no private data
    bool remote_invalidate = false;
    int remote_max_send = 1024, remote_max_recv = 1024;
};
|
||||||
|
|
||||||
|
void nfs_proxy_t::init_rdma()
|
||||||
|
{
|
||||||
|
evch = rdma_create_event_channel();
|
||||||
|
if (!evch)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to initialize RDMA-CM event channel: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
fcntl(evch->fd, F_SETFL, fcntl(evch->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
|
epmgr->tfd->set_fd_handler(evch->fd, false, [this](int rdmacm_eventfd, int epoll_events)
|
||||||
|
{
|
||||||
|
handle_rdmacm_events();
|
||||||
|
});
|
||||||
|
int r = rdma_create_id(evch, &cmid, NULL, RDMA_PS_TCP);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to create RDMA-CM ID: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (!string_to_addr(bind_address, 0, rdma_port, &addr))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Server address: %s is not valid\n", bind_address.c_str());
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
r = rdma_bind_addr(cmid, (sockaddr*)&addr);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to bind RDMA-CM to %s:%d: %s (code %d)\n", bind_address.c_str(), rdma_port, strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
r = rdma_listen(cmid, 128);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to listen RDMA-CM: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
ctx->ibv_channel = ibv_create_comp_channel(ctx->cmid->ibv_context);
|
||||||
|
if (!ctx->ibv_channel)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't create RDMA completion channel\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
fcntl(ctx->ibv_channel->fd, F_SETFL, fcntl(ctx->ibv_channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||||
|
ctx->max_cqe = 4096;
|
||||||
|
ctx->ibv_cq = ibv_create_cq(ctx->cmid->ibv_context, ctx->max_cqe, NULL, ctx->ibv_channel, 0);
|
||||||
|
if (!ctx->ibv_cq)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Couldn't create RDMA completion queue\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void nfs_proxy_t::destroy_rdma()
|
||||||
|
{
|
||||||
|
int r = rdma_destroy_id(cmid);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to destroy RDMA-CM ID: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
}
|
||||||
|
epmgr->tfd->set_fd_handler(evch->fd, false, NULL);
|
||||||
|
rdma_destroy_event_channel(evch);
|
||||||
|
if (cq)
|
||||||
|
ibv_destroy_cq(cq);
|
||||||
|
if (channel)
|
||||||
|
ibv_destroy_comp_channel(channel);
|
||||||
|
}
|
||||||
|
|
||||||
|
void nfs_proxy_t::handle_rdmacm_events()
|
||||||
|
{
|
||||||
|
rdma_cm_event *ev = NULL;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
int r = rdma_get_cm_event(evch, &ev);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
if (errno == EAGAIN || errno == EINTR)
|
||||||
|
break;
|
||||||
|
fprintf(stderr, "Failed to get RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
|
||||||
|
{
|
||||||
|
rdmacm_accept();
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
|
||||||
|
ev->event == RDMA_CM_EVENT_REJECTED ||
|
||||||
|
ev->event == RDMA_CM_EVENT_DISCONNECTED ||
|
||||||
|
ev->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
|
||||||
|
{
|
||||||
|
auto event_type_name = ev->event == RDMA_CM_EVENT_CONNECT_ERROR ? "RDMA_CM_EVENT_CONNECT_ERROR" : (
|
||||||
|
ev->event == RDMA_CM_EVENT_REJECTED ? "RDMA_CM_EVENT_REJECTED" : (
|
||||||
|
ev->event == RDMA_CM_EVENT_DISCONNECTED ? "RDMA_CM_EVENT_DISCONNECTED" : "RDMA_CM_EVENT_DEVICE_REMOVAL"));
|
||||||
|
auto conn_it = rdma_connections.find(ev->id);
|
||||||
|
if (conn_it == rdma_connections.end())
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Received %s event for an unknown connection 0x%lx - ignoring\n",
|
||||||
|
event_type_name, (uint64_t)ev->id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Received %s event for connection 0x%lx - closing it\n",
|
||||||
|
event_type_name, (uint64_t)ev->id);
|
||||||
|
auto conn = conn_it->second;
|
||||||
|
delete conn;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_ESTABLISHED)
|
||||||
|
{
|
||||||
|
rdmacm_established(ev);
|
||||||
|
}
|
||||||
|
else if (ev->event == RDMA_CM_EVENT_ADDR_CHANGE || ev->event == RDMA_CM_EVENT_TIMEWAIT_EXIT)
|
||||||
|
{
|
||||||
|
// Do nothing
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Other events are unexpected
|
||||||
|
fprintf(stderr, "Unexpected RDMA-CM event type: %d\n", ev->event);
|
||||||
|
}
|
||||||
|
r = rdma_ack_cm_event(ev);
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Failed to ack (free) RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encode a size in bytes as an RDMA-CM private data size field (size / 1024 - 1).
// Sizes outside of the representable 1 KB .. 256 KB range are clamped.
static uint8_t nfs_rdmacm_encode_size(int bytes)
{
    if (bytes < 1024)
        return 0;
    if (bytes >= 256*1024)
        return 255;
    return (uint8_t)(bytes/1024 - 1);
}

// Handle RDMA_CM_EVENT_CONNECT_REQUEST: reserve completion queue entries for the
// new connection (growing the CQ if required), create a queue pair on ev->id,
// accept the connection with NFS private data and register it in rdma_connections.
// On any failure only this connection is dropped (rejected and destroyed) and
// the reserved CQ entries are returned.
void nfs_rdma_context_t::rdmacm_accept(rdma_cm_event *ev)
{
    // Completion queue entries reserved for this connection
    const int conn_cqe = max_send+max_recv;
    if (used_max_cqe+conn_cqe > max_cqe)
    {
        // Resize CQ. Start from at least 1, otherwise doubling 0 never terminates
        int new_max_cqe = max_cqe > 0 ? max_cqe : 1;
        while (used_max_cqe+conn_cqe > new_max_cqe)
        {
            new_max_cqe *= 2;
        }
        if (ibv_resize_cq(cq, new_max_cqe) != 0)
        {
            fprintf(stderr, "Couldn't resize RDMA completion queue to %d entries\n", new_max_cqe);
            rdma_reject(ev->id, NULL, 0);
            rdma_destroy_id(ev->id);
            return;
        }
        max_cqe = new_max_cqe;
    }
    used_max_cqe += conn_cqe;
    ibv_qp_init_attr init_attr = {};
    init_attr.send_cq = cq;
    init_attr.recv_cq = cq;
    // FIXME(review): it's an open question how many work requests are really needed here
    init_attr.cap.max_send_wr = max_send;
    init_attr.cap.max_recv_wr = max_recv;
    init_attr.cap.max_send_sge = max_sge;
    init_attr.cap.max_recv_sge = max_sge;
    init_attr.qp_type = IBV_QPT_RC;
    int r = rdma_create_qp(ev->id, NULL, &init_attr);
    if (r != 0)
    {
        fprintf(stderr, "Failed to create a queue pair via RDMA-CM: %s (code %d)\n", strerror(errno), errno);
        rdma_reject(ev->id, NULL, 0);
        rdma_destroy_id(ev->id);
        used_max_cqe -= conn_cqe;
        return;
    }
    nfs_rdmacm_private private_data = {};
    private_data.format_identifier = NFS_RDMACM_PRIVATE_DATA_MAGIC_LE;
    private_data.version = 1;
    // Remote invalidation is not advertised
    private_data.remote_invalidate = 0;
    // FIXME(review): max_send/max_recv are used as work request counts above, but
    // these fields are sizes in bytes - presumably a separate buffer size should be
    // advertised here, confirm
    private_data.max_send_size = nfs_rdmacm_encode_size(max_send);
    private_data.max_recv_size = nfs_rdmacm_encode_size(max_recv);
    rdma_conn_param conn_params = {};
    conn_params.private_data = &private_data;
    conn_params.private_data_len = sizeof(private_data);
    //conn_params.responder_resources = max_qp_rd_atom of the device?
    //conn_params.initiator_depth = max_qp_init_rd_atom of the device?
    conn_params.rnr_retry_count = 7;
    //conn_params.qp_num = manually created QP number?
    r = rdma_accept(ev->id, &conn_params);
    if (r != 0)
    {
        fprintf(stderr, "Failed to accept RDMA-CM connection: %s (code %d)\n", strerror(errno), errno);
        rdma_destroy_qp(ev->id);
        rdma_destroy_id(ev->id);
        used_max_cqe -= conn_cqe;
        return;
    }
    // Register the connection under the ID of the new connection (not of the listener),
    // because this is the ID that subsequent events for the connection refer to
    auto conn = new nfs_rdma_conn_t();
    conn->id = ev->id;
    rdma_connections[ev->id] = conn;
}
|
||||||
|
|
||||||
|
// Handle RDMA_CM_EVENT_ESTABLISHED: mark the connection as established, apply
// the peer's NFS private data if the event carries a valid one and post the
// initial receive requests. Events for unknown connections are logged and ignored.
void nfs_rdma_context_t::rdmacm_established(rdma_cm_event *ev)
{
    auto conn_it = rdma_connections.find(ev->id);
    if (conn_it == rdma_connections.end())
    {
        fprintf(stderr, "Received RDMA_CM_EVENT_ESTABLISHED event for an unknown connection 0x%lx - ignoring\n", (uint64_t)ev->id);
        return;
    }
    fprintf(stderr, "Received RDMA_CM_EVENT_ESTABLISHED event for connection 0x%lx - connection established\n", (uint64_t)ev->id);
    auto conn = conn_it->second;
    conn->established = true;
    // Handle NFS private_data. Connection parameters of an event live in ev->param.conn
    // NOTE(review): rdma_get_cm_event(3) mentions private data only for CONNECT_REQUEST
    // and CONNECT_RESPONSE events, so the client's data presumably has to be read in
    // rdmacm_accept() instead - confirm
    const rdma_conn_param & params = ev->param.conn;
    if (params.private_data && params.private_data_len >= sizeof(nfs_rdmacm_private))
    {
        const nfs_rdmacm_private *private_data = (const nfs_rdmacm_private *)params.private_data;
        if (private_data->format_identifier == NFS_RDMACM_PRIVATE_DATA_MAGIC_LE &&
            private_data->version == 1)
        {
            // Only the lowest bit of the flags byte is defined, other bits are reserved
            conn->remote_invalidate = (private_data->remote_invalidate & 1) != 0;
            conn->remote_max_send = (private_data->max_send_size+1) * 1024;
            conn->remote_max_recv = (private_data->max_recv_size+1) * 1024;
            // Never send more than the peer is able to receive
            if (conn->remote_max_recv < conn->max_send)
                conn->max_send = conn->remote_max_recv;
        }
    }
    // Post initial receive requests
    conn->post_initial_receives();
}
|
|
@ -168,7 +168,7 @@ struct WRITE3args {
|
||||||
offset3 offset;
|
offset3 offset;
|
||||||
count3 count;
|
count3 count;
|
||||||
stable_how stable;
|
stable_how stable;
|
||||||
opaque data<>;
|
opaque data<>; /* RDMA DDP-eligible */
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef opaque writeverf3[NFS3_WRITEVERFSIZE];
|
typedef opaque writeverf3[NFS3_WRITEVERFSIZE];
|
||||||
|
@ -409,7 +409,7 @@ struct READ3resok {
|
||||||
post_op_attr file_attributes;
|
post_op_attr file_attributes;
|
||||||
count3 count;
|
count3 count;
|
||||||
bool eof;
|
bool eof;
|
||||||
opaque data<>;
|
opaque data<>; /* RDMA DDP-eligible */
|
||||||
};
|
};
|
||||||
|
|
||||||
struct READ3resfail {
|
struct READ3resfail {
|
||||||
|
@ -514,7 +514,7 @@ typedef string nfspath3<>;
|
||||||
|
|
||||||
struct symlinkdata3 {
|
struct symlinkdata3 {
|
||||||
sattr3 symlink_attributes;
|
sattr3 symlink_attributes;
|
||||||
nfspath3 symlink_data;
|
nfspath3 symlink_data; /* RDMA DDP-eligible */
|
||||||
};
|
};
|
||||||
|
|
||||||
struct SYMLINK3args {
|
struct SYMLINK3args {
|
||||||
|
@ -546,7 +546,7 @@ struct READLINK3args {
|
||||||
|
|
||||||
struct READLINK3resok {
|
struct READLINK3resok {
|
||||||
post_op_attr symlink_attributes;
|
post_op_attr symlink_attributes;
|
||||||
nfspath3 data;
|
nfspath3 data; /* RDMA DDP-eligible */
|
||||||
};
|
};
|
||||||
|
|
||||||
struct READLINK3resfail {
|
struct READLINK3resfail {
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
/*
|
||||||
|
* Please do not edit this file.
|
||||||
|
* It was generated using rpcgen.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _RPC_RDMA_H_RPCGEN
|
||||||
|
#define _RPC_RDMA_H_RPCGEN
|
||||||
|
|
||||||
|
#include "xdr_impl.h"
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
struct xdr_rdma_segment {
|
||||||
|
uint32 handle;
|
||||||
|
uint32 length;
|
||||||
|
uint64 offset;
|
||||||
|
};
|
||||||
|
typedef struct xdr_rdma_segment xdr_rdma_segment;
|
||||||
|
|
||||||
|
struct xdr_read_chunk {
|
||||||
|
uint32 position;
|
||||||
|
struct xdr_rdma_segment target;
|
||||||
|
};
|
||||||
|
typedef struct xdr_read_chunk xdr_read_chunk;
|
||||||
|
|
||||||
|
struct xdr_read_list {
|
||||||
|
struct xdr_read_chunk entry;
|
||||||
|
struct xdr_read_list *next;
|
||||||
|
};
|
||||||
|
typedef struct xdr_read_list xdr_read_list;
|
||||||
|
|
||||||
|
struct xdr_write_chunk {
|
||||||
|
struct {
|
||||||
|
u_int target_len;
|
||||||
|
struct xdr_rdma_segment *target_val;
|
||||||
|
} target;
|
||||||
|
};
|
||||||
|
typedef struct xdr_write_chunk xdr_write_chunk;
|
||||||
|
|
||||||
|
struct xdr_write_list {
|
||||||
|
struct xdr_write_chunk entry;
|
||||||
|
struct xdr_write_list *next;
|
||||||
|
};
|
||||||
|
typedef struct xdr_write_list xdr_write_list;
|
||||||
|
|
||||||
|
struct rpc_rdma_header {
|
||||||
|
struct xdr_read_list *rdma_reads;
|
||||||
|
struct xdr_write_list *rdma_writes;
|
||||||
|
struct xdr_write_chunk *rdma_reply;
|
||||||
|
};
|
||||||
|
typedef struct rpc_rdma_header rpc_rdma_header;
|
||||||
|
|
||||||
|
struct rpc_rdma_header_nomsg {
|
||||||
|
struct xdr_read_list *rdma_reads;
|
||||||
|
struct xdr_write_list *rdma_writes;
|
||||||
|
struct xdr_write_chunk *rdma_reply;
|
||||||
|
};
|
||||||
|
typedef struct rpc_rdma_header_nomsg rpc_rdma_header_nomsg;
|
||||||
|
|
||||||
|
struct rpc_rdma_header_padded {
|
||||||
|
uint32 rdma_align;
|
||||||
|
uint32 rdma_thresh;
|
||||||
|
struct xdr_read_list *rdma_reads;
|
||||||
|
struct xdr_write_list *rdma_writes;
|
||||||
|
struct xdr_write_chunk *rdma_reply;
|
||||||
|
};
|
||||||
|
typedef struct rpc_rdma_header_padded rpc_rdma_header_padded;
|
||||||
|
|
||||||
|
enum rpc_rdma_errcode {
|
||||||
|
ERR_VERS = 1,
|
||||||
|
ERR_CHUNK = 2,
|
||||||
|
};
|
||||||
|
typedef enum rpc_rdma_errcode rpc_rdma_errcode;
|
||||||
|
|
||||||
|
struct rpc_rdma_errvers {
|
||||||
|
uint32 rdma_vers_low;
|
||||||
|
uint32 rdma_vers_high;
|
||||||
|
};
|
||||||
|
typedef struct rpc_rdma_errvers rpc_rdma_errvers;
|
||||||
|
|
||||||
|
struct rpc_rdma_error {
|
||||||
|
rpc_rdma_errcode err;
|
||||||
|
union {
|
||||||
|
rpc_rdma_errvers range;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
typedef struct rpc_rdma_error rpc_rdma_error;
|
||||||
|
|
||||||
|
enum rdma_proc {
|
||||||
|
RDMA_MSG = 0,
|
||||||
|
RDMA_NOMSG = 1,
|
||||||
|
RDMA_MSGP = 2,
|
||||||
|
RDMA_DONE = 3,
|
||||||
|
RDMA_ERROR = 4,
|
||||||
|
};
|
||||||
|
typedef enum rdma_proc rdma_proc;
|
||||||
|
|
||||||
|
struct rdma_body {
|
||||||
|
rdma_proc proc;
|
||||||
|
union {
|
||||||
|
rpc_rdma_header rdma_msg;
|
||||||
|
rpc_rdma_header_nomsg rdma_nomsg;
|
||||||
|
rpc_rdma_header_padded rdma_msgp;
|
||||||
|
rpc_rdma_error rdma_error;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
typedef struct rdma_body rdma_body;
|
||||||
|
|
||||||
|
struct rdma_msg {
|
||||||
|
uint32 rdma_xid;
|
||||||
|
uint32 rdma_vers;
|
||||||
|
uint32 rdma_credit;
|
||||||
|
rdma_body rdma_body;
|
||||||
|
};
|
||||||
|
typedef struct rdma_msg rdma_msg;
|
||||||
|
|
||||||
|
/* the xdr functions */
|
||||||
|
|
||||||
|
|
||||||
|
extern bool_t xdr_xdr_rdma_segment (XDR *, xdr_rdma_segment*);
|
||||||
|
extern bool_t xdr_xdr_read_chunk (XDR *, xdr_read_chunk*);
|
||||||
|
extern bool_t xdr_xdr_read_list (XDR *, xdr_read_list*);
|
||||||
|
extern bool_t xdr_xdr_write_chunk (XDR *, xdr_write_chunk*);
|
||||||
|
extern bool_t xdr_xdr_write_list (XDR *, xdr_write_list*);
|
||||||
|
extern bool_t xdr_rpc_rdma_header (XDR *, rpc_rdma_header*);
|
||||||
|
extern bool_t xdr_rpc_rdma_header_nomsg (XDR *, rpc_rdma_header_nomsg*);
|
||||||
|
extern bool_t xdr_rpc_rdma_header_padded (XDR *, rpc_rdma_header_padded*);
|
||||||
|
extern bool_t xdr_rpc_rdma_errcode (XDR *, rpc_rdma_errcode*);
|
||||||
|
extern bool_t xdr_rpc_rdma_errvers (XDR *, rpc_rdma_errvers*);
|
||||||
|
extern bool_t xdr_rpc_rdma_error (XDR *, rpc_rdma_error*);
|
||||||
|
extern bool_t xdr_rdma_proc (XDR *, rdma_proc*);
|
||||||
|
extern bool_t xdr_rdma_body (XDR *, rdma_body*);
|
||||||
|
extern bool_t xdr_rdma_msg (XDR *, rdma_msg*);
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* !_RPC_RDMA_H_RPCGEN */
|
|
@ -0,0 +1,166 @@
|
||||||
|
/* RFC 8166 - Remote Direct Memory Access Transport for Remote Procedure Call Version 1 */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010-2017 IETF Trust and the persons
|
||||||
|
* identified as authors of the code. All rights reserved.
|
||||||
|
*
|
||||||
|
* The authors of the code are:
|
||||||
|
* B. Callaghan, T. Talpey, and C. Lever
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with
|
||||||
|
* or without modification, are permitted provided that the
|
||||||
|
* following conditions are met:
|
||||||
|
*
|
||||||
|
* - Redistributions of source code must retain the above
|
||||||
|
* copyright notice, this list of conditions and the
|
||||||
|
* following disclaimer.
|
||||||
|
*
|
||||||
|
* - Redistributions in binary form must reproduce the above
|
||||||
|
* copyright notice, this list of conditions and the
|
||||||
|
* following disclaimer in the documentation and/or other
|
||||||
|
* materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* - Neither the name of Internet Society, IETF or IETF
|
||||||
|
* Trust, nor the names of specific contributors, may be
|
||||||
|
* used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS
|
||||||
|
* AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||||
|
* EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||||
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||||
|
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||||
|
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Plain RDMA segment (Section 3.4.3)
|
||||||
|
*/
|
||||||
|
struct xdr_rdma_segment {
|
||||||
|
uint32 handle; /* Registered memory handle */
|
||||||
|
uint32 length; /* Length of the chunk in bytes */
|
||||||
|
uint64 offset; /* Chunk virtual address or offset */
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RDMA read segment (Section 3.4.5)
|
||||||
|
*/
|
||||||
|
struct xdr_read_chunk {
|
||||||
|
uint32 position; /* Position in XDR stream */
|
||||||
|
struct xdr_rdma_segment target;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Read list (Section 4.3.1)
|
||||||
|
*/
|
||||||
|
struct xdr_read_list {
|
||||||
|
struct xdr_read_chunk entry;
|
||||||
|
struct xdr_read_list *next;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Write chunk (Section 3.4.6)
|
||||||
|
*/
|
||||||
|
struct xdr_write_chunk {
|
||||||
|
struct xdr_rdma_segment target<>;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Write list (Section 4.3.2)
|
||||||
|
*/
|
||||||
|
struct xdr_write_list {
|
||||||
|
struct xdr_write_chunk entry;
|
||||||
|
struct xdr_write_list *next;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Chunk lists (Section 4.3)
|
||||||
|
*/
|
||||||
|
struct rpc_rdma_header {
|
||||||
|
struct xdr_read_list *rdma_reads;
|
||||||
|
struct xdr_write_list *rdma_writes;
|
||||||
|
struct xdr_write_chunk *rdma_reply;
|
||||||
|
/* rpc body follows */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct rpc_rdma_header_nomsg {
|
||||||
|
struct xdr_read_list *rdma_reads;
|
||||||
|
struct xdr_write_list *rdma_writes;
|
||||||
|
struct xdr_write_chunk *rdma_reply;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Not to be used */
|
||||||
|
struct rpc_rdma_header_padded {
|
||||||
|
uint32 rdma_align;
|
||||||
|
uint32 rdma_thresh;
|
||||||
|
struct xdr_read_list *rdma_reads;
|
||||||
|
struct xdr_write_list *rdma_writes;
|
||||||
|
struct xdr_write_chunk *rdma_reply;
|
||||||
|
/* rpc body follows */
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Error handling (Section 4.5)
|
||||||
|
*/
|
||||||
|
enum rpc_rdma_errcode {
|
||||||
|
ERR_VERS = 1, /* Value fixed for all versions */
|
||||||
|
ERR_CHUNK = 2
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Structure fixed for all versions */
|
||||||
|
struct rpc_rdma_errvers {
|
||||||
|
uint32 rdma_vers_low;
|
||||||
|
uint32 rdma_vers_high;
|
||||||
|
};
|
||||||
|
|
||||||
|
union rpc_rdma_error switch (rpc_rdma_errcode err) {
|
||||||
|
case ERR_VERS:
|
||||||
|
rpc_rdma_errvers range;
|
||||||
|
case ERR_CHUNK:
|
||||||
|
void;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Procedures (Section 4.2.4)
|
||||||
|
*/
|
||||||
|
enum rdma_proc {
|
||||||
|
RDMA_MSG = 0, /* Value fixed for all versions */
|
||||||
|
RDMA_NOMSG = 1, /* Value fixed for all versions */
|
||||||
|
RDMA_MSGP = 2, /* Not to be used */
|
||||||
|
RDMA_DONE = 3, /* Not to be used */
|
||||||
|
RDMA_ERROR = 4 /* Value fixed for all versions */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* The position of the proc discriminator field is
|
||||||
|
* fixed for all versions */
|
||||||
|
union rdma_body switch (rdma_proc proc) {
|
||||||
|
case RDMA_MSG:
|
||||||
|
rpc_rdma_header rdma_msg;
|
||||||
|
case RDMA_NOMSG:
|
||||||
|
rpc_rdma_header_nomsg rdma_nomsg;
|
||||||
|
case RDMA_MSGP: /* Not to be used */
|
||||||
|
rpc_rdma_header_padded rdma_msgp;
|
||||||
|
case RDMA_DONE: /* Not to be used */
|
||||||
|
void;
|
||||||
|
case RDMA_ERROR:
|
||||||
|
rpc_rdma_error rdma_error;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fixed header fields (Section 4.2)
|
||||||
|
*/
|
||||||
|
struct rdma_msg {
|
||||||
|
uint32 rdma_xid; /* Position fixed for all versions */
|
||||||
|
uint32 rdma_vers; /* Position fixed for all versions */
|
||||||
|
uint32 rdma_credit; /* Position fixed for all versions */
|
||||||
|
rdma_body rdma_body;
|
||||||
|
};
|
|
@ -0,0 +1,200 @@
|
||||||
|
/*
|
||||||
|
* Please do not edit this file.
|
||||||
|
* It was generated using rpcgen.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "rpc_rdma.h"
|
||||||
|
#include "xdr_impl_inline.h"
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_xdr_rdma_segment (XDR *xdrs, xdr_rdma_segment *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->handle))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->length))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_uint64 (xdrs, &objp->offset))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_xdr_read_chunk (XDR *xdrs, xdr_read_chunk *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->position))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_xdr_rdma_segment (xdrs, &objp->target))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_xdr_read_list (XDR *xdrs, xdr_read_list *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_xdr_read_chunk (xdrs, &objp->entry))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->next, sizeof (xdr_read_list), (xdrproc_t) xdr_xdr_read_list))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_xdr_write_chunk (XDR *xdrs, xdr_write_chunk *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_array (xdrs, (char **)&objp->target.target_val, (u_int *) &objp->target.target_len, ~0,
|
||||||
|
sizeof (xdr_rdma_segment), (xdrproc_t) xdr_xdr_rdma_segment))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_xdr_write_list (XDR *xdrs, xdr_write_list *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_xdr_write_chunk (xdrs, &objp->entry))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->next, sizeof (xdr_write_list), (xdrproc_t) xdr_xdr_write_list))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rpc_rdma_header (XDR *xdrs, rpc_rdma_header *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_reads, sizeof (xdr_read_list), (xdrproc_t) xdr_xdr_read_list))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_writes, sizeof (xdr_write_list), (xdrproc_t) xdr_xdr_write_list))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_reply, sizeof (xdr_write_chunk), (xdrproc_t) xdr_xdr_write_chunk))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rpc_rdma_header_nomsg (XDR *xdrs, rpc_rdma_header_nomsg *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_reads, sizeof (xdr_read_list), (xdrproc_t) xdr_xdr_read_list))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_writes, sizeof (xdr_write_list), (xdrproc_t) xdr_xdr_write_list))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_reply, sizeof (xdr_write_chunk), (xdrproc_t) xdr_xdr_write_chunk))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rpc_rdma_header_padded (XDR *xdrs, rpc_rdma_header_padded *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->rdma_align))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->rdma_thresh))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_reads, sizeof (xdr_read_list), (xdrproc_t) xdr_xdr_read_list))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_writes, sizeof (xdr_write_list), (xdrproc_t) xdr_xdr_write_list))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_pointer (xdrs, (char **)&objp->rdma_reply, sizeof (xdr_write_chunk), (xdrproc_t) xdr_xdr_write_chunk))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rpc_rdma_errcode (XDR *xdrs, rpc_rdma_errcode *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rpc_rdma_errvers (XDR *xdrs, rpc_rdma_errvers *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->rdma_vers_low))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->rdma_vers_high))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rpc_rdma_error (XDR *xdrs, rpc_rdma_error *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_rpc_rdma_errcode (xdrs, &objp->err))
|
||||||
|
return FALSE;
|
||||||
|
switch (objp->err) {
|
||||||
|
case ERR_VERS:
|
||||||
|
if (!xdr_rpc_rdma_errvers (xdrs, &objp->range))
|
||||||
|
return FALSE;
|
||||||
|
break;
|
||||||
|
case ERR_CHUNK:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rdma_proc (XDR *xdrs, rdma_proc *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_enum (xdrs, (enum_t *) objp))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rdma_body (XDR *xdrs, rdma_body *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_rdma_proc (xdrs, &objp->proc))
|
||||||
|
return FALSE;
|
||||||
|
switch (objp->proc) {
|
||||||
|
case RDMA_MSG:
|
||||||
|
if (!xdr_rpc_rdma_header (xdrs, &objp->rdma_msg))
|
||||||
|
return FALSE;
|
||||||
|
break;
|
||||||
|
case RDMA_NOMSG:
|
||||||
|
if (!xdr_rpc_rdma_header_nomsg (xdrs, &objp->rdma_nomsg))
|
||||||
|
return FALSE;
|
||||||
|
break;
|
||||||
|
case RDMA_MSGP:
|
||||||
|
if (!xdr_rpc_rdma_header_padded (xdrs, &objp->rdma_msgp))
|
||||||
|
return FALSE;
|
||||||
|
break;
|
||||||
|
case RDMA_DONE:
|
||||||
|
break;
|
||||||
|
case RDMA_ERROR:
|
||||||
|
if (!xdr_rpc_rdma_error (xdrs, &objp->rdma_error))
|
||||||
|
return FALSE;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool_t
|
||||||
|
xdr_rdma_msg (XDR *xdrs, rdma_msg *objp)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->rdma_xid))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->rdma_vers))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_uint32 (xdrs, &objp->rdma_credit))
|
||||||
|
return FALSE;
|
||||||
|
if (!xdr_rdma_body (xdrs, &objp->rdma_body))
|
||||||
|
return FALSE;
|
||||||
|
return TRUE;
|
||||||
|
}
|
|
@ -46,3 +46,4 @@ run_rpcgen() {
|
||||||
run_rpcgen nfs
|
run_rpcgen nfs
|
||||||
run_rpcgen rpc
|
run_rpcgen rpc
|
||||||
run_rpcgen portmap
|
run_rpcgen portmap
|
||||||
|
run_rpcgen rpc_rdma
|
||||||
|
|
|
@ -28,6 +28,19 @@
|
||||||
// RPC over TCP:
|
// RPC over TCP:
|
||||||
//
|
//
|
||||||
// BE 32bit length, then rpc_msg, then the procedure message itself
|
// BE 32bit length, then rpc_msg, then the procedure message itself
|
||||||
|
//
|
||||||
|
// RPC over RDMA:
|
||||||
|
// RFC 8166 - Remote Direct Memory Access Transport for Remote Procedure Call Version 1
|
||||||
|
// RFC 8267 - Network File System (NFS) Upper-Layer Binding to RPC-over-RDMA Version 1
|
||||||
|
// RFC 8797 - Remote Direct Memory Access - Connection Manager (RDMA-CM) Private Data for RPC-over-RDMA Version 1
|
||||||
|
// message is received in an RDMA Receive operation
|
||||||
|
// message: list of read chunks, list of write chunks, optional reply write chunk, then actual RPC body if present
|
||||||
|
// read chunk: BE 32bit position, BE 32bit registered memory key, BE 32bit length, BE 64bit offset
|
||||||
|
// write chunk: BE 32bit registered memory key, BE 32bit length, BE 64bit offset
|
||||||
|
// in reality for NFS 3.0: only 1 read chunk in write3 and symlink3, only 1 write chunk in read3 and readlink3
|
||||||
|
// read chunk is read by the server using RDMA Read from the client memory after receiving RPC request
|
||||||
|
// write chunk is pushed by the server using RDMA Write to the client memory before sending RPC reply
|
||||||
|
// connection is established using RDMA-CM at default port 20049
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue