qperf/src/ib.c

2232 lines
54 KiB
C

/*
* qperf - handle RDMA tests.
*
* Copyright (c) 2002-2007 Johann George. All rights reserved.
* Copyright (c) 2006-2007 QLogic Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <infiniband/verbs.h>
#include "qperf.h"
/*
* RDMA parameters.
*/
#define QKEY 0x11111111 /* Q_Key */
#define NCQE 1024 /* Number of CQ entries */
#define GRH_SIZE 40 /* IB GRH size */
#define MTU_SIZE 2048 /* Default MTU Size */
#define RETRY_CNT 7 /* RC/UC retry count */
#define RNR_RETRY 7 /* RC/UC RNR retry count */
#define RNR_TIMER 12 /* RC/UC RNR timeout */
#define TIMEOUT 14 /* RC/UC timeout */
/*
* Work request IDs.
*/
#define WRID_SEND 1 /* Send */
#define WRID_RECV 2 /* Receive */
#define WRID_RDMA 3 /* RDMA */
/*
* Constants.
*/
#define K2 (2*1024)
#define K64 (64*1024)
/*
* For convenience.
*/
typedef enum ibv_wr_opcode OPCODE;
/*
* Atomics.
*/
typedef enum ATOMIC {
FETCH_ADD, /* Fetch and add */
COMPARE_SWAP /* Compare and swap */
} ATOMIC;
/*
* IO Mode.
*/
typedef enum IOMODE {
IO_SR, /* Send/Receive */
IO_RDMA /* RDMA */
} IOMODE;
/*
* RDMA connection context.
*/
typedef struct IBCON {
uint32_t lid; /* Local ID */
uint32_t qpn; /* Queue pair number */
uint32_t psn; /* Packet sequence number */
uint32_t rkey; /* Remote key */
uint64_t vaddr; /* Virtual address */
} IBCON;
/*
* RDMA descriptor.
*/
typedef struct IBDEV {
IBCON lcon; /* Local context */
IBCON rcon; /* Remote context */
int mtu; /* MTU */
int port; /* Port */
int rate; /* Rate */
int trans; /* QP transport */
int maxinline; /* Maximum amount of inline data */
char *buffer; /* Buffer */
struct ibv_device **devlist; /* Device list */
struct ibv_context *context; /* Context */
struct ibv_comp_channel *channel; /* Channel */
struct ibv_pd *pd; /* Protection domain */
struct ibv_mr *mr; /* Memory region */
struct ibv_cq *cq; /* Completion queue */
struct ibv_qp *qp; /* QPair */
struct ibv_ah *ah; /* Address handle */
} IBDEV;
/*
* Names associated with a value.
*/
typedef struct NAMES {
int value; /* Value */
char *name; /* Name */
} NAMES;
/*
* RDMA speeds and names.
*/
typedef struct RATES {
const char *name; /* Name */
uint32_t rate; /* Rate */
} RATES;
/*
* Function prototypes.
*/
static void cq_error(int status);
static void dec_ibcon(IBCON *host);
static int do_error(int status, uint64_t *errors);
static void enc_ibcon(IBCON *host);
static void ib_bi_bw(int transport);
static void ib_client_atomic(ATOMIC atomic);
static void ib_client_bw(int transport);
static void ib_client_rdma_bw(int transport, OPCODE opcode);
static void ib_client_rdma_read_lat(int transport);
static void ib_close(IBDEV *ibdev);
static void ib_debug_info(IBDEV *ibdev);
static int ib_init(IBDEV *ibdev);
static int ib_mralloc(IBDEV *ibdev, int size);
static int ib_open(IBDEV *ibdev, int trans, int maxSendWR, int maxRecvWR);
static void ib_params_atomics(void);
static void ib_params_msgs(long msgSize, int use_poll_mode);
static int ib_poll(IBDEV *ibdev, struct ibv_wc *wc, int nwc);
static int ib_post_rdma(IBDEV *ibdev, OPCODE opcode, int n);
static int ib_post_compare_swap(IBDEV *ibdev,
int wrid, int offset, uint64_t compare, uint64_t swap);
static int ib_post_fetch_add(IBDEV *ibdev,
int wrid, int offset, uint64_t add);
static int ib_post_recv(IBDEV *ibdev, int n);
static int ib_post_send(IBDEV *ibdev, int n);
static void ib_pp_lat(int transport, IOMODE iomode);
static void ib_pp_lat_loop(IBDEV *ibdev, IOMODE iomode);
static int ib_prepare(IBDEV *ibdev);
static void ib_rdma_write_poll_lat(int transport);
static void ib_server_def(int transport);
static void ib_server_nop(int transport);
static char *opcode_name(int opcode);
/*
* List of errors we can get from a CQE.
*/
NAMES CQErrors[] ={
{ IBV_WC_SUCCESS, "Success" },
{ IBV_WC_LOC_LEN_ERR, "Local length error" },
{ IBV_WC_LOC_QP_OP_ERR, "Local QP operation failure" },
{ IBV_WC_LOC_EEC_OP_ERR, "Local EEC operation failure" },
{ IBV_WC_LOC_PROT_ERR, "Local protection error" },
{ IBV_WC_WR_FLUSH_ERR, "WR flush failure" },
{ IBV_WC_MW_BIND_ERR, "Memory window bind failure" },
{ IBV_WC_BAD_RESP_ERR, "Bad response" },
{ IBV_WC_LOC_ACCESS_ERR, "Local access failure" },
{ IBV_WC_REM_INV_REQ_ERR, "Remote invalid request" },
{ IBV_WC_REM_ACCESS_ERR, "Remote access failure" },
{ IBV_WC_REM_OP_ERR, "Remote operation failure" },
{ IBV_WC_RETRY_EXC_ERR, "Retries exceeded" },
{ IBV_WC_RNR_RETRY_EXC_ERR, "RNR retry exceeded" },
{ IBV_WC_LOC_RDD_VIOL_ERR, "Local RDD violation" },
{ IBV_WC_REM_INV_RD_REQ_ERR, "Remote invalid read request" },
{ IBV_WC_REM_ABORT_ERR, "Remote abort" },
{ IBV_WC_INV_EECN_ERR, "Invalid EECN" },
{ IBV_WC_INV_EEC_STATE_ERR, "Invalid EEC state" },
{ IBV_WC_FATAL_ERR, "Fatal error" },
{ IBV_WC_RESP_TIMEOUT_ERR, "Responder timeout" },
{ IBV_WC_GENERAL_ERR, "General error" },
};
/*
* Opcodes.
*/
NAMES Opcodes[] ={
{ IBV_WR_ATOMIC_CMP_AND_SWP, "compare and swap" },
{ IBV_WR_ATOMIC_FETCH_AND_ADD, "fetch and add" },
{ IBV_WR_RDMA_READ, "rdma read" },
{ IBV_WR_RDMA_WRITE, "rdma write" },
{ IBV_WR_RDMA_WRITE_WITH_IMM, "rdma write with immediate" },
{ IBV_WR_SEND, "send" },
{ IBV_WR_SEND_WITH_IMM, "send with immediate" },
};
/*
* Opcodes.
*/
RATES Rates[] ={
{ "", IBV_RATE_MAX },
{ "max", IBV_RATE_MAX },
{ "1xSDR", IBV_RATE_2_5_GBPS },
{ "1xDDR", IBV_RATE_5_GBPS },
{ "1xQDR", IBV_RATE_10_GBPS },
{ "4xSDR", IBV_RATE_10_GBPS },
{ "4xDDR", IBV_RATE_20_GBPS },
{ "4xQDR", IBV_RATE_40_GBPS },
{ "8xSDR", IBV_RATE_20_GBPS },
{ "8xDDR", IBV_RATE_40_GBPS },
{ "8xQDR", IBV_RATE_80_GBPS },
{ "2.5", IBV_RATE_2_5_GBPS },
{ "5", IBV_RATE_5_GBPS },
{ "10", IBV_RATE_10_GBPS },
{ "20", IBV_RATE_20_GBPS },
{ "30", IBV_RATE_30_GBPS },
{ "40", IBV_RATE_40_GBPS },
{ "60", IBV_RATE_60_GBPS },
{ "80", IBV_RATE_80_GBPS },
{ "120", IBV_RATE_120_GBPS },
};
/*
* Experimental (client side).
*/
void
run_client_experimental(void)
{
IBDEV ibdev;
ib_params_msgs(K64, 1);
if (!ib_open(&ibdev, IBV_QPT_UC, 1, 0))
goto err;
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1))
goto err;
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Experimental (server side).
*/
void
run_server_experimental(void)
{
IBDEV ibdev;
int found = 0;
if (!ib_open(&ibdev, IBV_QPT_UC, 0, 1))
return;
if (!ib_init(&ibdev))
goto err;
if (!ib_post_recv(&ibdev, 1))
goto err;
if (!synchronize())
goto err;
while (!Finished) {
struct ibv_wc wc[NCQE];
int n = ib_poll(&ibdev, wc, cardof(wc));
if (n < 0)
goto err;
if (n) {
found = 1;
break;
}
}
if (found)
printf("Received immediate data\n");
else
printf("Failed to received immediate data\n");
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Measure RC bi-directional bandwidth (client side).
*/
void
run_client_rc_bi_bw(void)
{
par_use(L_ACCESS_RECV);
par_use(R_ACCESS_RECV);
ib_params_msgs(K64, 1);
ib_bi_bw(IBV_QPT_RC);
show_results(BANDWIDTH);
}
/*
* Measure RC bi-directional bandwidth (server side).
*/
void
run_server_rc_bi_bw(void)
{
ib_bi_bw(IBV_QPT_RC);
}
/*
* Measure RC bandwidth (client side).
*/
void
run_client_rc_bw(void)
{
par_use(L_ACCESS_RECV);
par_use(R_ACCESS_RECV);
par_use(L_NO_MSGS);
par_use(R_NO_MSGS);
ib_params_msgs(K64, 1);
ib_client_bw(IBV_QPT_RC);
show_results(BANDWIDTH);
}
/*
* Measure RC bandwidth (server side).
*/
void
run_server_rc_bw(void)
{
ib_server_def(IBV_QPT_RC);
}
/*
* Measure RC compare and swap messaging rate (client side).
*/
void
run_client_rc_compare_swap_mr(void)
{
ib_client_atomic(COMPARE_SWAP);
}
/*
* Measure RC compare and swap messaging rate (server side).
*/
void
run_server_rc_compare_swap_mr(void)
{
ib_server_nop(IBV_QPT_RC);
}
/*
* Measure RC fetch and add messaging rate (client side).
*/
void
run_client_rc_fetch_add_mr(void)
{
ib_client_atomic(FETCH_ADD);
}
/*
* Measure RC fetch and add messaging rate (server side).
*/
void
run_server_rc_fetch_add_mr(void)
{
ib_server_nop(IBV_QPT_RC);
}
/*
* Measure RC latency (client side).
*/
void
run_client_rc_lat(void)
{
ib_params_msgs(1, 1);
ib_pp_lat(IBV_QPT_RC, IO_SR);
}
/*
* Measure RC latency (server side).
*/
void
run_server_rc_lat(void)
{
ib_pp_lat(IBV_QPT_RC, IO_SR);
}
/*
* Measure RC RDMA read bandwidth (client side).
*/
void
run_client_rc_rdma_read_bw(void)
{
par_use(L_RD_ATOMIC);
par_use(R_RD_ATOMIC);
ib_params_msgs(K64, 1);
ib_client_rdma_bw(IBV_QPT_RC, IBV_WR_RDMA_READ);
show_results(BANDWIDTH);
}
/*
* Measure RC RDMA read bandwidth (server side).
*/
void
run_server_rc_rdma_read_bw(void)
{
ib_server_nop(IBV_QPT_RC);
}
/*
* Measure RC RDMA read latency (client side).
*/
void
run_client_rc_rdma_read_lat(void)
{
ib_params_msgs(1, 1);
ib_client_rdma_read_lat(IBV_QPT_RC);
}
/*
* Measure RC RDMA read latency (server side).
*/
void
run_server_rc_rdma_read_lat(void)
{
ib_server_nop(IBV_QPT_RC);
}
/*
* Measure RC RDMA write bandwidth (client side).
*/
void
run_client_rc_rdma_write_bw(void)
{
ib_params_msgs(K64, 1);
ib_client_rdma_bw(IBV_QPT_RC, IBV_WR_RDMA_WRITE_WITH_IMM);
show_results(BANDWIDTH);
}
/*
* Measure RC RDMA write bandwidth (server side).
*/
void
run_server_rc_rdma_write_bw(void)
{
ib_server_def(IBV_QPT_RC);
}
/*
* Measure RC RDMA write latency (client side).
*/
void
run_client_rc_rdma_write_lat(void)
{
ib_params_msgs(1, 1);
ib_pp_lat(IBV_QPT_RC, IO_RDMA);
}
/*
* Measure RC RDMA write latency (server side).
*/
void
run_server_rc_rdma_write_lat(void)
{
ib_pp_lat(IBV_QPT_RC, IO_RDMA);
}
/*
* Measure RC RDMA write polling latency (client side).
*/
void
run_client_rc_rdma_write_poll_lat(void)
{
ib_params_msgs(1, 0);
ib_rdma_write_poll_lat(IBV_QPT_RC);
show_results(LATENCY);
}
/*
* Measure RC RDMA write polling latency (server side).
*/
void
run_server_rc_rdma_write_poll_lat(void)
{
ib_rdma_write_poll_lat(IBV_QPT_RC);
}
/*
* Measure UC bi-directional bandwidth (client side).
*/
void
run_client_uc_bi_bw(void)
{
par_use(L_ACCESS_RECV);
par_use(R_ACCESS_RECV);
ib_params_msgs(K64, 1);
ib_bi_bw(IBV_QPT_UC);
show_results(BANDWIDTH_SR);
}
/*
* Measure UC bi-directional bandwidth (server side).
*/
void
run_server_uc_bi_bw(void)
{
ib_bi_bw(IBV_QPT_UC);
}
/*
* Measure UC bandwidth (client side).
*/
void
run_client_uc_bw(void)
{
par_use(L_ACCESS_RECV);
par_use(R_ACCESS_RECV);
par_use(L_NO_MSGS);
par_use(R_NO_MSGS);
ib_params_msgs(K64, 1);
ib_client_bw(IBV_QPT_UC);
show_results(BANDWIDTH_SR);
}
/*
* Measure UC bandwidth (server side).
*/
void
run_server_uc_bw(void)
{
ib_server_def(IBV_QPT_UC);
}
/*
* Measure UC latency (client side).
*/
void
run_client_uc_lat(void)
{
ib_params_msgs(1, 1);
ib_pp_lat(IBV_QPT_UC, IO_SR);
}
/*
* Measure UC latency (server side).
*/
void
run_server_uc_lat(void)
{
ib_pp_lat(IBV_QPT_UC, IO_SR);
}
/*
* Measure UC RDMA write bandwidth (client side).
*/
void
run_client_uc_rdma_write_bw(void)
{
ib_params_msgs(K64, 1);
ib_client_rdma_bw(IBV_QPT_UC, IBV_WR_RDMA_WRITE_WITH_IMM);
show_results(BANDWIDTH_SR);
}
/*
* Measure UC RDMA write bandwidth (server side).
*/
void
run_server_uc_rdma_write_bw(void)
{
ib_server_def(IBV_QPT_UC);
}
/*
* Measure UC RDMA write latency (client side).
*/
void
run_client_uc_rdma_write_lat(void)
{
ib_params_msgs(1, 1);
ib_pp_lat(IBV_QPT_UC, IO_RDMA);
}
/*
* Measure UC RDMA write latency (server side).
*/
void
run_server_uc_rdma_write_lat(void)
{
ib_pp_lat(IBV_QPT_UC, IO_RDMA);
}
/*
* Measure UC RDMA write polling latency (client side).
*/
void
run_client_uc_rdma_write_poll_lat(void)
{
ib_params_msgs(1, 1);
ib_rdma_write_poll_lat(IBV_QPT_UC);
show_results(LATENCY);
}
/*
* Measure UC RDMA write polling latency (server side).
*/
void
run_server_uc_rdma_write_poll_lat(void)
{
ib_rdma_write_poll_lat(IBV_QPT_UC);
}
/*
* Measure UD bi-directional bandwidth (client side).
*/
void
run_client_ud_bi_bw(void)
{
par_use(L_ACCESS_RECV);
par_use(R_ACCESS_RECV);
ib_params_msgs(K2, 1);
ib_bi_bw(IBV_QPT_UD);
show_results(BANDWIDTH_SR);
}
/*
* Measure UD bi-directional bandwidth (server side).
*/
void
run_server_ud_bi_bw(void)
{
ib_bi_bw(IBV_QPT_UD);
}
/*
* Measure UD bandwidth (client side).
*/
void
run_client_ud_bw(void)
{
par_use(L_ACCESS_RECV);
par_use(R_ACCESS_RECV);
par_use(L_NO_MSGS);
par_use(R_NO_MSGS);
ib_params_msgs(K2, 1);
ib_client_bw(IBV_QPT_UD);
show_results(BANDWIDTH_SR);
}
/*
* Measure UD bandwidth (server side).
*/
void
run_server_ud_bw(void)
{
ib_server_def(IBV_QPT_UD);
}
/*
* Measure UD latency (client side).
*/
void
run_client_ud_lat(void)
{
ib_params_msgs(1, 1);
ib_pp_lat(IBV_QPT_UD, IO_SR);
}
/*
* Measure UD latency (server side).
*/
void
run_server_ud_lat(void)
{
ib_pp_lat(IBV_QPT_UD, IO_SR);
}
/*
* Verify RC compare and swap (client side).
*/
void
run_client_ver_rc_compare_swap(void)
{
IBDEV ibdev;
uint64_t *result;
uint64_t last = 0;
uint64_t cur = 0;
uint64_t next = 0x0123456789abcdefULL;
int i;
int size;
ib_params_atomics();
if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0))
goto err;
size = Req.rd_atomic * sizeof(uint64_t);
setv_u32(L_MSG_SIZE, size);
setv_u32(R_MSG_SIZE, size);
ib_mralloc(&ibdev, size);
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
for (i = 0; i < Req.rd_atomic; ++i) {
if (!ib_post_compare_swap(&ibdev, i, i*sizeof(uint64_t), cur, next))
goto err;
cur = next;
next = cur + 1;
}
result = (uint64_t *) ibdev.buffer;
while (!Finished) {
struct ibv_wc wc[NCQE];
int n = ib_poll(&ibdev, wc, cardof(wc));
uint64_t res;
if (Finished)
break;
if (n < 0)
goto err;
if (n > LStat.max_cqes)
LStat.max_cqes = n;
for (i = 0; i < n; ++i) {
int x = wc[i].wr_id;
int status = wc[i].status;
if (status == IBV_WC_SUCCESS) {
LStat.rem_r.no_bytes += sizeof(uint64_t);
LStat.rem_r.no_msgs++;
} else if (!do_error(status, &LStat.s.no_errs))
goto err;
res = result[x];
if (last != res) {
error("compare and swap doesn't match (expected %llx vs. %llx)",
(long long)last, (long long)res);
goto err;
}
if (last)
last++;
else
last = 0x0123456789abcdefULL;
next = cur + 1;
if (!ib_post_compare_swap(&ibdev, x, x*sizeof(uint64_t),
cur, next))
goto err;
cur = next;
}
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
show_results(MSG_RATE);
}
/*
* Verify RC compare and swap (server side).
*/
void
run_server_ver_rc_compare_swap(void)
{
ib_server_nop(IBV_QPT_RC);
}
/*
* Verify RC fetch and add (client side).
*/
void
run_client_ver_rc_fetch_add(void)
{
IBDEV ibdev;
uint64_t *result;
uint64_t last = 0;
int i;
int size;
ib_params_atomics();
if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0))
goto err;
size = Req.rd_atomic * sizeof(uint64_t);
setv_u32(L_MSG_SIZE, size);
setv_u32(R_MSG_SIZE, size);
ib_mralloc(&ibdev, size);
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
for (i = 0; i < Req.rd_atomic; ++i) {
if (!ib_post_fetch_add(&ibdev, i, i*sizeof(uint64_t), 1))
goto err;
}
result = (uint64_t *) ibdev.buffer;
while (!Finished) {
struct ibv_wc wc[NCQE];
int n = ib_poll(&ibdev, wc, cardof(wc));
uint64_t res;
if (Finished)
break;
if (n < 0)
goto err;
if (n > LStat.max_cqes)
LStat.max_cqes = n;
for (i = 0; i < n; ++i) {
int x = wc[i].wr_id;
int status = wc[i].status;
if (status == IBV_WC_SUCCESS) {
LStat.rem_r.no_bytes += sizeof(uint64_t);
LStat.rem_r.no_msgs++;
} else if (!do_error(status, &LStat.s.no_errs))
goto err;
res = result[x];
if (last != res) {
error("fetch and add doesn't match (expected %llx vs. %llx)",
(long long)last, (long long)res);
goto err;
}
last++;
if (!ib_post_fetch_add(&ibdev, x, x*sizeof(uint64_t), 1))
goto err;
}
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
show_results(MSG_RATE);
}
/*
* Verify RC fetch and add (server side).
*/
void
run_server_ver_rc_fetch_add(void)
{
ib_server_nop(IBV_QPT_RC);
}
/*
* Measure messaging rate for an atomic operation.
*/
static void
ib_client_atomic(ATOMIC atomic)
{
int i;
int r;
IBDEV ibdev;
ib_params_atomics();
if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0))
goto err;
setv_u32(L_MSG_SIZE, sizeof(uint64_t));
setv_u32(R_MSG_SIZE, sizeof(uint64_t));
ib_mralloc(&ibdev, sizeof(uint64_t));
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
for (i = 0; i < Req.rd_atomic; ++i) {
r = (atomic == FETCH_ADD)
? ib_post_fetch_add(&ibdev, 0, 0, 0)
: ib_post_compare_swap(&ibdev, 0, 0, 0, 0);
if (!r)
goto err;
}
while (!Finished) {
struct ibv_wc wc[NCQE];
int n = ib_poll(&ibdev, wc, cardof(wc));
if (Finished)
break;
if (n < 0)
goto err;
if (n > LStat.max_cqes)
LStat.max_cqes = n;
for (i = 0; i < n; ++i) {
int status = wc[i].status;
if (status == IBV_WC_SUCCESS) {
LStat.rem_r.no_bytes += sizeof(uint64_t);
LStat.rem_r.no_msgs++;
} else if (!do_error(status, &LStat.s.no_errs))
goto err;
r = (atomic == FETCH_ADD)
? ib_post_fetch_add(&ibdev, 0, 0, 0)
: ib_post_compare_swap(&ibdev, 0, 0, 0, 0);
if (!r)
goto err;
}
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
show_results(MSG_RATE);
}
/*
* Measure IB bandwidth (client side).
*/
static void
ib_client_bw(int transport)
{
IBDEV ibdev;
long sent = 0;
if (!ib_open(&ibdev, transport, NCQE, 0))
goto err;
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
if (!ib_post_send(&ibdev, left_to_send(&sent, NCQE)))
goto err;
sent = NCQE;
while (!Finished) {
int i;
struct ibv_wc wc[NCQE];
int n = ib_poll(&ibdev, wc, cardof(wc));
if (n > LStat.max_cqes)
LStat.max_cqes = n;
if (n < 0)
goto err;
if (Finished)
break;
for (i = 0; i < n; ++i) {
int id = wc[i].wr_id;
int status = wc[i].status;
if (id != WRID_SEND)
debug("bad WR ID %d", id);
else if (status != IBV_WC_SUCCESS)
if (!do_error(status, &LStat.s.no_errs))
goto err;
}
if (Req.no_msgs) {
if (LStat.s.no_msgs + LStat.s.no_errs >= Req.no_msgs)
break;
n = left_to_send(&sent, n);
}
if (!ib_post_send(&ibdev, n))
goto err;
sent += n;
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Default action for the server is to post receive buffers and whenever it
* gets a completion entry, compute statistics and post more buffers.
*/
static void
ib_server_def(int transport)
{
IBDEV ibdev;
if (!ib_open(&ibdev, transport, 0, NCQE))
return;
if (!ib_init(&ibdev))
goto err;
if (!ib_post_recv(&ibdev, NCQE))
goto err;
if (!synchronize())
goto err;
while (!Finished) {
int i;
struct ibv_wc wc[NCQE];
int n = ib_poll(&ibdev, wc, cardof(wc));
if (Finished)
break;
if (n > LStat.max_cqes)
LStat.max_cqes = n;
if (n < 0)
goto err;
for (i = 0; i < n; ++i) {
int status = wc[i].status;
if (status == IBV_WC_SUCCESS) {
LStat.r.no_bytes += Req.msg_size;
LStat.r.no_msgs++;
if (Req.access_recv)
touch_data(ibdev.buffer, Req.msg_size);
} else if (!do_error(status, &LStat.r.no_errs))
goto err;
}
if (Req.no_msgs)
if (LStat.r.no_msgs + LStat.r.no_errs >= Req.no_msgs)
break;
if (!ib_post_recv(&ibdev, n))
goto err;
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Measure bi-directional IB bandwidth.
*/
static void
ib_bi_bw(int transport)
{
IBDEV ibdev;
if (!ib_open(&ibdev, transport, NCQE, NCQE))
goto err;
if (!ib_init(&ibdev))
goto err;
if (!ib_post_recv(&ibdev, NCQE))
goto err;
if (!synchronize())
goto err;
if (!ib_post_send(&ibdev, NCQE))
goto err;
while (!Finished) {
int i;
struct ibv_wc wc[NCQE];
int noSend = 0;
int noRecv = 0;
int n = ib_poll(&ibdev, wc, cardof(wc));
if (Finished)
break;
if (n > LStat.max_cqes)
LStat.max_cqes = n;
if (n < 0)
goto err;
for (i = 0; i < n; ++i) {
int id = wc[i].wr_id;
int status = wc[i].status;
switch (id) {
case WRID_SEND:
if (status != IBV_WC_SUCCESS)
if (!do_error(status, &LStat.s.no_errs))
goto err;
++noSend;
break;
case WRID_RECV:
if (status == IBV_WC_SUCCESS) {
LStat.r.no_bytes += Req.msg_size;
LStat.r.no_msgs++;
if (Req.access_recv)
touch_data(ibdev.buffer, Req.msg_size);
} else if (!do_error(status, &LStat.r.no_errs))
goto err;
++noRecv;
break;
default:
debug("bad WR ID %d", id);
}
}
if (noRecv)
if (!ib_post_recv(&ibdev, noRecv))
goto err;
if (noSend)
if (!ib_post_send(&ibdev, noSend))
goto err;
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Measure ping-pong latency (client and server side).
*/
static void
ib_pp_lat(int transport, IOMODE iomode)
{
IBDEV ibdev;
if (!ib_open(&ibdev, transport, 1, 1))
goto err;
if (!ib_init(&ibdev))
goto err;
ib_pp_lat_loop(&ibdev, iomode);
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
if (is_client())
show_results(LATENCY);
}
/*
* Loop sending packets back and forth to measure ping-pong latency.
*/
static void
ib_pp_lat_loop(IBDEV *ibdev, IOMODE iomode)
{
int done = 1;
if (!ib_post_recv(ibdev, 1))
return;
if (!synchronize())
return;
if (is_client()) {
if (iomode == IO_SR) {
if (!ib_post_send(ibdev, 1))
return;
} else {
if (!ib_post_rdma(ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1))
return;
}
done = 0;
}
while (!Finished) {
int i;
struct ibv_wc wc[2];
int n = ib_poll(ibdev, wc, cardof(wc));
if (Finished)
break;
if (n < 0)
return;
for (i = 0; i < n; ++i) {
int id = wc[i].wr_id;
int status = wc[i].status;
switch (id) {
case WRID_SEND:
case WRID_RDMA:
if (status != IBV_WC_SUCCESS)
if (!do_error(status, &LStat.s.no_errs))
return;
done |= 1;
continue;
case WRID_RECV:
if (status == IBV_WC_SUCCESS) {
LStat.r.no_bytes += Req.msg_size;
LStat.r.no_msgs++;
if (!ib_post_recv(ibdev, 1))
return;
} else if (!do_error(status, &LStat.r.no_errs))
return;
done |= 2;
continue;
default:
debug("bad WR ID %d", id);
continue;
}
break;
}
if (done == 3) {
if (iomode == IO_SR) {
if (!ib_post_send(ibdev, 1))
return;
} else {
if (!ib_post_rdma(ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1))
return;
}
done = 0;
}
}
Successful = 1;
}
/*
* Loop sending packets back and forth using RDMA Write and polling to measure
* latency. Note that if we increase the number of entries of wc to be NCQE,
* on the PS HCA, the latency is much longer.
*/
static void
ib_rdma_write_poll_lat(int transport)
{
IBDEV ibdev;
volatile char *p;
volatile char *q;
int send = is_client() ? 1 : 0;
int locID = send;
int remID = !locID;
if (!ib_open(&ibdev, transport, NCQE, 0))
goto err;
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
p = &ibdev.buffer[0];
q = &ibdev.buffer[Req.msg_size-1];
while (!Finished) {
*p = locID;
*q = locID;
if (send) {
int i;
int n;
struct ibv_wc wc[2];
if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_WRITE, 1))
goto err;
if (Finished)
break;
n = ibv_poll_cq(ibdev.cq, cardof(wc), wc);
if (n < 0) {
syserror("CQ poll failed");
goto err;
}
for (i = 0; i < n; ++i) {
int id = wc[i].wr_id;
int status = wc[i].status;
if (id != WRID_RDMA)
debug("bad WR ID %d", id);
else if (status != IBV_WC_SUCCESS) {
if (!do_error(status, &LStat.s.no_errs))
goto err;
}
}
}
while (!Finished)
if (*p == remID && *q == remID)
break;
LStat.r.no_bytes += Req.msg_size;
LStat.r.no_msgs++;
send = 1;
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Measure RDMA Read latency (client side).
*/
static void
ib_client_rdma_read_lat(int transport)
{
IBDEV ibdev;
if (!ib_open(&ibdev, transport, 1, 0))
goto err;
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_READ, 1))
goto err;
while (!Finished) {
struct ibv_wc wc;
int n = ib_poll(&ibdev, &wc, 1);
if (n < 0)
goto err;
if (n == 0)
continue;
if (Finished)
break;
if (wc.wr_id != WRID_RDMA) {
debug("bad WR ID %d", (int)wc.wr_id);
continue;
}
if (wc.status == IBV_WC_SUCCESS) {
LStat.r.no_bytes += Req.msg_size;
LStat.r.no_msgs++;
LStat.rem_s.no_bytes += Req.msg_size;
LStat.rem_s.no_msgs++;
} else if (!do_error(wc.status, &LStat.s.no_errs))
goto err;
if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_READ, 1))
goto err;
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
show_results(LATENCY);
}
/*
* Measure RDMA bandwidth (client side).
*/
static void
ib_client_rdma_bw(int transport, OPCODE opcode)
{
IBDEV ibdev;
if (!ib_open(&ibdev, transport, NCQE, 0))
goto err;
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
if (!ib_post_rdma(&ibdev, opcode, NCQE))
goto err;
while (!Finished) {
int i;
struct ibv_wc wc[NCQE];
int n = ib_poll(&ibdev, wc, cardof(wc));
if (Finished)
break;
if (n < 0)
goto err;
if (n > LStat.max_cqes)
LStat.max_cqes = n;
for (i = 0; i < n; ++i) {
int status = wc[i].status;
if (status == IBV_WC_SUCCESS) {
if (opcode == IBV_WR_RDMA_READ) {
LStat.r.no_bytes += Req.msg_size;
LStat.r.no_msgs++;
LStat.rem_s.no_bytes += Req.msg_size;
LStat.rem_s.no_msgs++;
}
} else if (!do_error(status, &LStat.s.no_errs))
goto err;
}
if (!ib_post_rdma(&ibdev, opcode, n))
goto err;
}
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Server just waits and lets driver take care of any requests.
*/
static void
ib_server_nop(int transport)
{
IBDEV ibdev;
/* workaround: Size of RQ should be 0; bug in Mellanox driver */
if (!ib_open(&ibdev, transport, 0, 1))
goto err;
if (!ib_init(&ibdev))
goto err;
if (!synchronize())
goto err;
while (!Finished)
pause();
Successful = 1;
err:
stop_timing();
exchange_results();
ib_close(&ibdev);
}
/*
* Set default IB parameters for tests that use messages.
*/
static void
ib_params_msgs(long msgSize, int use_poll_mode)
{
setp_u32(0, L_MSG_SIZE, msgSize);
setp_u32(0, R_MSG_SIZE, msgSize);
setp_u32(0, L_MTU_SIZE, MTU_SIZE);
setp_u32(0, R_MTU_SIZE, MTU_SIZE);
par_use(L_ID);
par_use(R_ID);
par_use(L_MTU_SIZE);
par_use(R_MTU_SIZE);
par_use(L_RATE);
par_use(R_RATE);
if (use_poll_mode) {
par_use(L_POLL_MODE);
par_use(R_POLL_MODE);
}
opt_check();
}
/*
* Set default IB parameters for tests that use atomics.
*/
static void
ib_params_atomics(void)
{
setp_u32(0, L_MTU_SIZE, MTU_SIZE);
setp_u32(0, R_MTU_SIZE, MTU_SIZE);
par_use(L_ID);
par_use(R_ID);
par_use(L_POLL_MODE);
par_use(R_POLL_MODE);
par_use(L_RATE);
par_use(R_RATE);
par_use(L_RD_ATOMIC);
par_use(R_RD_ATOMIC);
opt_check();
setv_u32(L_MSG_SIZE, 0);
}
/*
* IB initialization.
*/
static int
ib_init(IBDEV *ibdev)
{
IBCON ibcon;
if (is_client()) {
client_send_request();
enc_init(&ibcon);
enc_ibcon(&ibdev->lcon);
if (!send_mesg(&ibcon, sizeof(ibcon), "IB connection"))
return 0;
if (!recv_mesg(&ibcon, sizeof(ibcon), "IB connection"))
return 0;
dec_init(&ibcon);
dec_ibcon(&ibdev->rcon);
} else {
if (!recv_mesg(&ibcon, sizeof(ibcon), "IB connection"))
return 0;
dec_init(&ibcon);
dec_ibcon(&ibdev->rcon);
enc_init(&ibcon);
enc_ibcon(&ibdev->lcon);
if (!send_mesg(&ibcon, sizeof(ibcon), "IB connection"))
return 0;
}
if (!ib_prepare(ibdev))
return 0;
ib_debug_info(ibdev);
return 1;
}
/*
* Show debugging information.
*/
static void
ib_debug_info(IBDEV *ibdev)
{
debug("L: lid=%04x qpn=%06x psn=%06x rkey=%08x vaddr=%010x",
ibdev->lcon.lid, ibdev->lcon.qpn, ibdev->lcon.psn,
ibdev->lcon.rkey, ibdev->lcon.vaddr);
debug("R: lid=%04x qpn=%06x psn=%06x rkey=%08x vaddr=%010x",
ibdev->rcon.lid, ibdev->rcon.qpn, ibdev->rcon.psn,
ibdev->rcon.rkey, ibdev->rcon.vaddr);
}
/*
* Open a RDMA device.
*/
static int
ib_open(IBDEV *ibdev, int trans, int maxSendWR, int maxRecvWR)
{
/* Clear structure */
memset(ibdev, 0, sizeof(*ibdev));
/* Check and set MTU */
{
int mtu = Req.mtu_size;
if (mtu == 256)
ibdev->mtu = IBV_MTU_256;
else if (mtu == 512)
ibdev->mtu = IBV_MTU_512;
else if (mtu == 1024)
ibdev->mtu = IBV_MTU_1024;
else if (mtu == 2048)
ibdev->mtu = IBV_MTU_2048;
else if (mtu == 4096)
ibdev->mtu = IBV_MTU_4096;
else
error_die("Bad MTU: %d; must be 256/512/1K/2K/4K", mtu);
}
/* Set transport type */
ibdev->trans = trans;
/* Set port */
{
int port = 1;
char *p = index(Req.id, ':');
if (p) {
*p++ = '\0';
port = atoi(p);
if (port < 1)
error_die("Bad IB port: %d; must be at least 1", port);
}
ibdev->port = port;
}
/* Set rate */
{
RATES *q = Rates;
RATES *r = q + cardof(Rates);
for (;; ++q) {
if (q >= r) {
syserror("Bad rate: %s", Req.rate);
goto err;
}
if (streq(Req.rate, q->name)) {
ibdev->rate = q->rate;
break;
}
}
}
/* Determine device and open */
{
struct ibv_device *device;
char *name = Req.id[0] ? Req.id : 0;
ibdev->devlist = ibv_get_device_list(0);
if (!ibdev->devlist) {
syserror("Failed to find any IB devices");
goto err;
}
if (!name)
device = *ibdev->devlist;
else {
struct ibv_device **d = ibdev->devlist;
while ((device = *d++))
if (streq(ibv_get_device_name(device), name))
break;
}
if (!device) {
syserror("Failed to find IB device");
goto err;
}
ibdev->context = ibv_open_device(device);
if (!ibdev->context) {
syserror("Failed to open device %s", ibv_get_device_name(device));
goto err;
}
}
/* Allocate completion channel */
ibdev->channel = ibv_create_comp_channel(ibdev->context);
if (!ibdev->channel) {
syserror("Failed to create completion channel");
goto err;
}
/* Allocate protection domain */
ibdev->pd = ibv_alloc_pd(ibdev->context);
if (!ibdev->pd) {
syserror("Failed to allocate protection domain");
goto err;
}
/* Allocate message buffer and memory region */
{
int bufSize = Req.msg_size;
int pageSize = sysconf(_SC_PAGESIZE);
if (trans == IBV_QPT_UD)
bufSize += GRH_SIZE;
if (bufSize == 0)
bufSize = 1;
if (posix_memalign((void **)&ibdev->buffer, pageSize, bufSize) != 0) {
syserror("Failed to allocate memory");
goto err;
}
memset(ibdev->buffer, 0, bufSize);
int flags = IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_ATOMIC;
ibdev->mr = ibv_reg_mr(ibdev->pd, ibdev->buffer, bufSize, flags);
if (!ibdev->mr) {
syserror("Failed to allocate memory region");
goto err;
}
}
/* Create completion queue */
ibdev->cq = ibv_create_cq(ibdev->context,
maxSendWR+maxRecvWR, 0, ibdev->channel, 0);
if (!ibdev->cq) {
syserror("Failed to create completion queue");
goto err;
}
/* Create queue pair */
{
struct ibv_qp_init_attr attr ={
.send_cq = ibdev->cq,
.recv_cq = ibdev->cq,
.cap ={
.max_send_wr = maxSendWR,
.max_recv_wr = maxRecvWR,
.max_send_sge = 1,
.max_recv_sge = 1,
.max_inline_data = 0,
},
.qp_type = ibdev->trans,
};
ibdev->qp = ibv_create_qp(ibdev->pd, &attr);
if (!ibdev->qp) {
syserror("Failed to create QP");
goto err;
}
}
/* Modify queue pair to INIT state */
{
struct ibv_qp_attr attr ={
.qp_state = IBV_QPS_INIT,
.pkey_index = 0,
.port_num = ibdev->port
};
int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
if (ibdev->trans == IBV_QPT_UD) {
flags |= IBV_QP_QKEY;
attr.qkey = QKEY;
} else if (ibdev->trans == IBV_QPT_RC) {
flags |= IBV_QP_ACCESS_FLAGS;
attr.qp_access_flags =
IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_ATOMIC;
} else if (ibdev->trans == IBV_QPT_UC) {
flags |= IBV_QP_ACCESS_FLAGS;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
}
if (ibv_modify_qp(ibdev->qp, &attr, flags) != SUCCESS0) {
syserror("Failed to modify QP to INIT state");
goto err;
}
}
/* Get QP attributes */
{
struct ibv_qp_attr qp_attr;
struct ibv_qp_init_attr qp_init_attr;
if (ibv_query_qp(ibdev->qp, &qp_attr, 0, &qp_init_attr) != SUCCESS0) {
syserror("Query QP failed");
goto err;
}
ibdev->maxinline = qp_attr.cap.max_inline_data;
}
/* Get device properties */
{
struct ibv_device_attr dev_attr;
if (ibv_query_device(ibdev->context, &dev_attr) != SUCCESS0) {
syserror("Query device failed");
goto err;
}
if (Req.rd_atomic == 0)
Req.rd_atomic = dev_attr.max_qp_rd_atom;
else if (Req.rd_atomic > dev_attr.max_qp_rd_atom)
error("Device only supports %d (< %d) RDMA reads or atomic ops",
dev_attr.max_qp_rd_atom, Req.rd_atomic);
}
/* Set up local context */
{
struct ibv_port_attr port_attr;
int stat = ibv_query_port(ibdev->context, ibdev->port, &port_attr);
if (stat != SUCCESS0) {
syserror("Query port failed");
goto err;
}
srand48(getpid()*time(0));
ibdev->lcon.lid = port_attr.lid;
ibdev->lcon.qpn = ibdev->qp->qp_num;
ibdev->lcon.psn = lrand48() & 0xffffff;
ibdev->lcon.rkey = 0;
ibdev->lcon.vaddr = 0;
}
/* Allocate memory region */
if (!ib_mralloc(ibdev, Req.msg_size))
goto err;
return 1;
err:
ib_close(ibdev);
return 0;
}
/*
* Allocate a memory region.
*/
static int
ib_mralloc(IBDEV *ibdev, int size)
{
int pageSize;
if (size == 0)
return 1;
if (ibdev->trans == IBV_QPT_UD)
size += GRH_SIZE;
pageSize = sysconf(_SC_PAGESIZE);
if (posix_memalign((void **)&ibdev->buffer, pageSize, size) != 0) {
syserror("Failed to allocate memory");
goto err;
}
memset(ibdev->buffer, 0, size);
int flags = IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_ATOMIC;
ibdev->mr = ibv_reg_mr(ibdev->pd, ibdev->buffer, size, flags);
if (!ibdev->mr) {
syserror("Failed to allocate memory region");
goto err;
}
ibdev->lcon.rkey = ibdev->mr->rkey;
ibdev->lcon.vaddr = (unsigned long)ibdev->buffer;
return 1;
err:
if (ibdev->buffer) {
free(ibdev->buffer);
ibdev->buffer = 0;
}
return 0;
}
/*
* Prepare the IB device for receiving and sending.
*/
static int
ib_prepare(IBDEV *ibdev)
{
int flags;
struct ibv_qp_attr rtr_attr ={
.qp_state = IBV_QPS_RTR,
.path_mtu = ibdev->mtu,
.dest_qp_num = ibdev->rcon.qpn,
.rq_psn = ibdev->rcon.psn,
.min_rnr_timer = RNR_TIMER,
.max_dest_rd_atomic = Req.rd_atomic,
.ah_attr = {
.dlid = ibdev->rcon.lid,
.port_num = ibdev->port,
.static_rate = ibdev->rate
}
};
struct ibv_qp_attr rts_attr ={
.qp_state = IBV_QPS_RTS,
.timeout = TIMEOUT,
.retry_cnt = RETRY_CNT,
.rnr_retry = RNR_RETRY,
.sq_psn = ibdev->lcon.psn,
.max_rd_atomic = Req.rd_atomic
};
struct ibv_ah_attr ah_attr ={
.dlid = ibdev->rcon.lid,
.port_num = ibdev->port,
.static_rate = ibdev->rate
};
if (ibdev->trans == IBV_QPT_UD) {
/* Modify queue pair to RTR */
flags = IBV_QP_STATE;
if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0)
return syserror("Failed to modify QP to RTR");
/* Modify queue pair to RTS */
flags = IBV_QP_STATE | IBV_QP_SQ_PSN;
if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0)
return syserror("Failed to modify QP to RTS");
/* Create address handle */
ibdev->ah = ibv_create_ah(ibdev->pd, &ah_attr);
if (!ibdev->ah)
return syserror("Failed to create address handle");
} else if (ibdev->trans == IBV_QPT_RC) {
/* Modify queue pair to RTR */
flags = IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER;
if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0)
return syserror("Failed to modify QP to RTR");
/* Modify queue pair to RTS */
flags = IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC;
if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0)
return syserror("Failed to modify QP to RTS");
} else if (ibdev->trans == IBV_QPT_UC) {
/* Modify queue pair to RTR */
flags = IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN;
if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0)
return syserror("Failed to modify QP to RTR");
/* Modify queue pair to RTS */
flags = IBV_QP_STATE |
IBV_QP_SQ_PSN;
if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0)
return syserror("Failed to modify QP to RTS");
}
if (!Req.poll_mode) {
if (ibv_req_notify_cq(ibdev->cq, 0) != SUCCESS0)
return syserror("Failed to request CQ notification");
}
return 1;
}
/*
* Close a RDMA device. We ust destroy the CQ before the QP otherwise the
* ibv_destroy_qp call might hang.
*/
static void
ib_close(IBDEV *ibdev)
{
if (ibdev->ah)
ibv_destroy_ah(ibdev->ah);
if (ibdev->cq)
ibv_destroy_cq(ibdev->cq);
if (ibdev->qp)
ibv_destroy_qp(ibdev->qp);
if (ibdev->mr)
ibv_dereg_mr(ibdev->mr);
if (ibdev->pd)
ibv_dealloc_pd(ibdev->pd);
if (ibdev->channel)
ibv_destroy_comp_channel(ibdev->channel);
if (ibdev->context)
ibv_close_device(ibdev->context);
if (ibdev->buffer)
free(ibdev->buffer);
if (ibdev->devlist)
free(ibdev->devlist);
memset(ibdev, 0, sizeof(*ibdev));
}
/*
* Post a compare and swap request.
*/
static int
ib_post_compare_swap(IBDEV *ibdev,
int wrid, int offset, uint64_t compare, uint64_t swap)
{
struct ibv_sge sge ={
.addr = (uintptr_t)ibdev->buffer + offset,
.length = sizeof(uint64_t),
.lkey = ibdev->mr->lkey
};
struct ibv_send_wr wr ={
.wr_id = wrid,
.sg_list = &sge,
.num_sge = 1,
.opcode = IBV_WR_ATOMIC_CMP_AND_SWP,
.send_flags = IBV_SEND_SIGNALED,
.wr = {
.atomic = {
.remote_addr = ibdev->rcon.vaddr,
.rkey = ibdev->rcon.rkey,
.compare_add = compare,
.swap = swap
}
}
};
struct ibv_send_wr *badWR;
errno = 0;
if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) {
if (Finished && errno == EINTR)
return 1;
return syserror("Failed to post compare and swap");
}
LStat.s.no_bytes += sizeof(uint64_t);
LStat.s.no_msgs++;
return 1;
}
/*
* Post a fetch and add request.
*/
static int
ib_post_fetch_add(IBDEV *ibdev, int wrid, int offset, uint64_t add)
{
struct ibv_sge sge ={
.addr = (uintptr_t) ibdev->buffer + offset,
.length = sizeof(uint64_t),
.lkey = ibdev->mr->lkey
};
struct ibv_send_wr wr ={
.wr_id = wrid,
.sg_list = &sge,
.num_sge = 1,
.opcode = IBV_WR_ATOMIC_FETCH_AND_ADD,
.send_flags = IBV_SEND_SIGNALED,
.wr = {
.atomic = {
.remote_addr = ibdev->rcon.vaddr,
.rkey = ibdev->rcon.rkey,
.compare_add = add
}
}
};
struct ibv_send_wr *badWR;
errno = 0;
if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) {
if (Finished && errno == EINTR)
return 1;
return syserror("Failed to post fetch and add");
}
LStat.s.no_bytes += sizeof(uint64_t);
LStat.s.no_msgs++;
return 1;
}
/*
* Post n sends.
*/
static int
ib_post_send(IBDEV *ibdev, int n)
{
struct ibv_sge sge ={
.addr = (uintptr_t) ibdev->buffer,
.length = Req.msg_size,
.lkey = ibdev->mr->lkey
};
struct ibv_send_wr wr ={
.wr_id = WRID_SEND,
.sg_list = &sge,
.num_sge = 1,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
};
struct ibv_send_wr *badWR;
if (ibdev->trans == IBV_QPT_UD) {
wr.wr.ud.ah = ibdev->ah;
wr.wr.ud.remote_qpn = ibdev->rcon.qpn;
wr.wr.ud.remote_qkey = QKEY;
}
if (Req.msg_size <= ibdev->maxinline)
wr.send_flags |= IBV_SEND_INLINE;
errno = 0;
while (n-- > 0) {
if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) {
if (Finished && errno == EINTR)
return 1;
return syserror("Failed to post send");
}
LStat.s.no_bytes += Req.msg_size;
LStat.s.no_msgs++;
}
return 1;
}
/*
* Post n receives.
*/
static int
ib_post_recv(IBDEV *ibdev, int n)
{
struct ibv_sge sge ={
.addr = (uintptr_t) ibdev->buffer,
.length = Req.msg_size,
.lkey = ibdev->mr->lkey
};
struct ibv_recv_wr wr ={
.wr_id = WRID_RECV,
.sg_list = &sge,
.num_sge = 1,
};
struct ibv_recv_wr *badWR;
if (ibdev->trans == IBV_QPT_UD)
sge.length += GRH_SIZE;
errno = 0;
while (n-- > 0) {
if (ibv_post_recv(ibdev->qp, &wr, &badWR) != SUCCESS0) {
if (Finished && errno == EINTR)
return 1;
return syserror("Failed to post receive");
}
}
return 1;
}
/*
* Post n RDMA requests.
*/
static int
ib_post_rdma(IBDEV *ibdev, OPCODE opcode, int n)
{
struct ibv_sge sge ={
.addr = (uintptr_t) ibdev->buffer,
.length = Req.msg_size,
.lkey = ibdev->mr->lkey
};
struct ibv_send_wr wr ={
.wr_id = WRID_RDMA,
.sg_list = &sge,
.num_sge = 1,
.opcode = opcode,
.send_flags = IBV_SEND_SIGNALED,
.wr = {
.rdma = {
.remote_addr = ibdev->rcon.vaddr,
.rkey = ibdev->rcon.rkey
}
}
};
struct ibv_send_wr *badWR;
if (opcode != IBV_WR_RDMA_READ && Req.msg_size <= ibdev->maxinline)
wr.send_flags |= IBV_SEND_INLINE;
errno = 0;
while (n--) {
if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) {
if (Finished && errno == EINTR)
return 1;
return syserror("Failed to post %s", opcode_name(wr.opcode));
}
if (opcode != IBV_WR_RDMA_READ) {
LStat.s.no_bytes += Req.msg_size;
LStat.s.no_msgs++;
}
}
return 1;
}
/*
* Poll the completion queue.
*/
static int
ib_poll(IBDEV *ibdev, struct ibv_wc *wc, int nwc)
{
int n;
char *msg;
if (!Req.poll_mode && !Finished) {
void *ectx;
struct ibv_cq *ecq;
if (ibv_get_cq_event(ibdev->channel, &ecq, &ectx) != SUCCESS0)
{msg = "failed to get CQ event"; goto err;}
if (ecq != ibdev->cq)
{msg = "CQ event for unknown CQ"; goto err;}
if (ibv_req_notify_cq(ibdev->cq, 0) != SUCCESS0)
{msg = "failed to request CQ notification"; goto err;}
}
n = ibv_poll_cq(ibdev->cq, nwc, wc);
if (n < 0)
{msg = "CQ poll failed"; goto err;}
return n;
err:
if (Finished && errno == EINTR)
return 0;
syserror(msg);
return -1;
}
/*
* Encode a IBCON structure into a data stream.
*/
static void
enc_ibcon(IBCON *host)
{
enc_int(host->lid, sizeof(host->lid));
enc_int(host->qpn, sizeof(host->qpn));
enc_int(host->psn, sizeof(host->psn));
enc_int(host->rkey, sizeof(host->rkey));
enc_int(host->vaddr, sizeof(host->vaddr));
}
/*
* Decode a IBCON structure from a data stream.
*/
static void
dec_ibcon(IBCON *host)
{
host->lid = dec_int(sizeof(host->lid));
host->qpn = dec_int(sizeof(host->qpn));
host->psn = dec_int(sizeof(host->psn));
host->rkey = dec_int(sizeof(host->rkey));
host->vaddr = dec_int(sizeof(host->vaddr));
}
/*
* Handle a CQ error and return true if it is recoverable.
*/
static int
do_error(int status, uint64_t *errors)
{
++*errors;
cq_error(status);
return 0;
}
/*
* Print out a CQ error given a status.
*/
static void
cq_error(int status)
{
int i;
for (i = 0; i < cardof(CQErrors); ++i) {
if (CQErrors[i].value == status) {
error("%s failed: %s", TestName, CQErrors[i].name);
return;
}
}
error("%s failed: CQ error %d", TestName, status);
}
/*
* Return the name of an opcode.
*/
static char *
opcode_name(int opcode)
{
int i;
for (i = 0; i < cardof(Opcodes); ++i)
if (Opcodes[i].value == opcode)
return Opcodes[i].name;
return "unknown operation";
}