mtcp/mtcp/src/dpdk_module.c

/* for io_module_func def'ns */
#include "io_module.h"
#ifndef DISABLE_DPDK
/* for mtcp related def'ns */
#include "mtcp.h"
/* for errno */
#include <errno.h>
/* for logging */
#include "debug.h"
/* for num_devices_* */
#include "config.h"
/* for rte_max_eth_ports */
#include <rte_common.h>
/* for rte_eth_rxconf */
#include <rte_ethdev.h>
/* for delay funcs */
#include <rte_cycles.h>
#include <rte_errno.h>
#define ENABLE_STATS_IOCTL 1
#ifdef ENABLE_STATS_IOCTL
/* for close */
#include <unistd.h>
/* for open */
#include <fcntl.h>
/* for ioctl */
#include <sys/ioctl.h>
#endif /* !ENABLE_STATS_IOCTL */
/* for ip pseudo-chksum */
#include <rte_ip.h>
//#define IP_DEFRAG 1
#ifdef IP_DEFRAG
/* for ip defragging */
#include <rte_ip_frag.h>
#endif
/* for ioctl funcs */
#include <dpdk_iface_common.h>
/* for retrieving rte version(s) */
#include <rte_version.h>
/*----------------------------------------------------------------------------*/
/* Essential macros */
#define MAX_RX_QUEUE_PER_LCORE MAX_CPUS
#define MAX_TX_QUEUE_PER_PORT MAX_CPUS
#ifdef ENABLELRO
#define BUF_SIZE 16384
#else
#define BUF_SIZE 2048
#endif /* !ENABLELRO */
#define MBUF_SIZE (BUF_SIZE + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define NB_MBUF 8192
#define MEMPOOL_CACHE_SIZE 256
#ifdef ENFORCE_RX_IDLE
#define RX_IDLE_ENABLE 1
#define RX_IDLE_TIMEOUT 1 /* in micro-seconds */
#endif
/*
* RX and TX Prefetch, Host, and Write-back threshold values should be
* carefully set for optimal performance. Consult the network
* controller's datasheet and supporting DPDK documentation for guidance
* on how these parameters should be set.
*/
#define RX_PTHRESH 8 /**< Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /**< Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /**< Default values of RX write-back threshold reg. */
/*
* These default values are optimized for use with the Intel(R) 82599 10 GbE
* Controller and the DPDK ixgbe PMD. Consider using other values for other
* network controllers and/or network drivers.
*/
#define TX_PTHRESH 36 /**< Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0 /**< Default values of TX host threshold reg. */
#define TX_WTHRESH 0 /**< Default values of TX write-back threshold reg. */
#define MAX_PKT_BURST 64
/*
* Configurable number of RX/TX ring descriptors
*/
#define RTE_TEST_RX_DESC_DEFAULT 128
#define RTE_TEST_TX_DESC_DEFAULT 128
/*
* Ethernet frame overhead
*/
#define ETHER_IFG 12
#define ETHER_PREAMBLE 8
#define ETHER_OVR (ETHER_CRC_LEN + ETHER_PREAMBLE + ETHER_IFG)
static const uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static const uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
/*----------------------------------------------------------------------------*/
/* packet memory pools for storing packet bufs */
static struct rte_mempool *pktmbuf_pool[MAX_CPUS] = {NULL};
//#define DEBUG 1
#ifdef DEBUG
/* ethernet addresses of ports */
static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
#endif
static struct rte_eth_dev_info dev_info[RTE_MAX_ETHPORTS];
static struct rte_eth_conf port_conf = {
.rxmode = {
.mq_mode = ETH_MQ_RX_RSS,
.max_rx_pkt_len = ETHER_MAX_LEN,
.offloads = (
#if RTE_VERSION < RTE_VERSION_NUM(18, 5, 0, 0) /* DPDK 18.02 or earlier */
DEV_RX_OFFLOAD_CRC_STRIP |
#endif
DEV_RX_OFFLOAD_CHECKSUM
#ifdef ENABLELRO
| DEV_RX_OFFLOAD_TCP_LRO
#endif
),
.split_hdr_size = 0,
#if RTE_VERSION < RTE_VERSION_NUM(18, 5, 0, 0) /* DPDK 18.02 or earlier */
.header_split = 0, /**< Header Split disabled */
.hw_ip_checksum = 1, /**< IP checksum offload enabled */
.hw_vlan_filter = 0, /**< VLAN filtering disabled */
.jumbo_frame = 0, /**< Jumbo Frame Support disabled */
.hw_strip_crc = 1, /**< CRC stripped by hardware */
#endif
#ifdef ENABLELRO
.enable_lro = 1, /**< Enable LRO */
#endif
},
.rx_adv_conf = {
.rss_conf = {
.rss_key = NULL,
.rss_hf = ETH_RSS_TCP | ETH_RSS_UDP |
ETH_RSS_IP | ETH_RSS_L2_PAYLOAD
},
},
.txmode = {
.mq_mode = ETH_MQ_TX_NONE,
#if RTE_VERSION >= RTE_VERSION_NUM(18, 2, 0, 0) /* DPDK 18.02 or later */
.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
DEV_TX_OFFLOAD_UDP_CKSUM |
DEV_TX_OFFLOAD_TCP_CKSUM)
#endif
},
};
static const struct rte_eth_rxconf rx_conf = {
.rx_thresh = {
.pthresh = RX_PTHRESH, /* RX prefetch threshold reg */
.hthresh = RX_HTHRESH, /* RX host threshold reg */
.wthresh = RX_WTHRESH, /* RX write-back threshold reg */
},
.rx_free_thresh = 32,
};
static const struct rte_eth_txconf tx_conf = {
.tx_thresh = {
.pthresh = TX_PTHRESH, /* TX prefetch threshold reg */
.hthresh = TX_HTHRESH, /* TX host threshold reg */
.wthresh = TX_WTHRESH, /* TX write-back threshold reg */
},
.tx_free_thresh = 0, /* Use PMD default values */
.tx_rs_thresh = 0, /* Use PMD default values */
#if RTE_VERSION < RTE_VERSION_NUM(18, 5, 0, 0) /* DPDK 18.02 or earlier */
/*
* As the example won't handle multi-segment and offload cases,
* set the flag by default.
*/
.txq_flags = 0x0,
#endif
};
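/* table of mbufs staged by a single thread for one port: either packets
* queued for TX or packets held from the last RX burst */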
struct mbuf_table {
uint16_t len; /* length of queued packets */
struct rte_mbuf *m_table[MAX_PKT_BURST];
};
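/* per-thread I/O context: RX/TX staging tables per port, the thread's
* mbuf pool, and optional defrag/LRO/stats-ioctl state */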
struct dpdk_private_context {
struct mbuf_table rmbufs[RTE_MAX_ETHPORTS];
struct mbuf_table wmbufs[RTE_MAX_ETHPORTS];
struct rte_mempool *pktmbuf_pool;
struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
#ifdef RX_IDLE_ENABLE
uint8_t rx_idle;
#endif
#ifdef IP_DEFRAG
struct rte_ip_frag_tbl *frag_tbl;
struct rte_ip_frag_death_row death_row;
#endif
#ifdef ENABLELRO
struct rte_mbuf *cur_rx_m;
#endif
#ifdef ENABLE_STATS_IOCTL
int fd;
uint32_t cur_ts;
#endif /* !ENABLE_STATS_IOCTL */
} __rte_cache_aligned;
#ifdef ENABLE_STATS_IOCTL
/**
* stats struct passed on from user space to the driver
*/
struct stats_struct {
uint64_t tx_bytes;
uint64_t tx_pkts;
uint64_t rx_bytes;
uint64_t rx_pkts;
uint64_t rmiss;
uint64_t rerr;
uint64_t terr;
uint8_t qid;
uint8_t dev;
};
#endif /* !ENABLE_STATS_IOCTL */
#ifdef IP_DEFRAG
/* Should be power of two. */
#define IP_FRAG_TBL_BUCKET_ENTRIES 16
#define RTE_LOGTYPE_IP_RSMBL RTE_LOGTYPE_USER1
#define MAX_FRAG_NUM RTE_LIBRTE_IP_FRAG_MAX_FRAG
#endif /* !IP_DEFRAG */
/*----------------------------------------------------------------------------*/
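/* Set up the per-thread context: pre-allocate a full TX burst of mbufs
* for every attached port, and optionally create the IP fragment table
* and open the stats ioctl device. */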
void
dpdk_init_handle(struct mtcp_thread_context *ctxt)
{
struct dpdk_private_context *dpc;
int i, j;
/* create and initialize private I/O module context */
ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context));
if (ctxt->io_private_context == NULL) {
TRACE_ERROR("Failed to initialize ctxt->io_private_context: "
"Can't allocate memory\n");
exit(EXIT_FAILURE);
}
dpc = (struct dpdk_private_context *)ctxt->io_private_context;
dpc->pktmbuf_pool = pktmbuf_pool[ctxt->cpu];
/* set wmbufs correctly */
for (j = 0; j < num_devices_attached; j++) {
/* Allocate wmbufs for each registered port */
for (i = 0; i < MAX_PKT_BURST; i++) {
dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
if (dpc->wmbufs[j].m_table[i] == NULL) {
TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
ctxt->cpu, i, j);
exit(EXIT_FAILURE);
}
}
/* set mbufs queue length to 0 to begin with */
dpc->wmbufs[j].len = 0;
}
#ifdef IP_DEFRAG
int max_flows;
int socket;
uint64_t frag_cycles;
max_flows = CONFIG.max_concurrency / CONFIG.num_cores;
frag_cycles = (rte_get_tsc_hz() + MS_PER_S - 1)
/ MS_PER_S * max_flows;
socket = rte_lcore_to_socket_id(ctxt->cpu);
if ((dpc->frag_tbl = rte_ip_frag_table_create(max_flows,
IP_FRAG_TBL_BUCKET_ENTRIES,
max_flows,
frag_cycles,
socket)) == NULL) {
RTE_LOG(ERR, IP_RSMBL, "ip_frag_tbl_create(%u) on "
"lcore: %u for queue: %u failed\n",
max_flows, ctxt->cpu, ctxt->cpu);
exit(EXIT_FAILURE);
}
#endif /* !IP_DEFRAG */
#ifdef ENABLE_STATS_IOCTL
dpc->fd = open(DEV_PATH, O_RDWR);
if (dpc->fd == -1) {
TRACE_ERROR("Can't open " DEV_PATH " for context->cpu: %d! "
"Are you using mlx4/mlx5 driver?\n",
ctxt->cpu);
}
#endif /* !ENABLE_STATS_IOCTL */
}
/*----------------------------------------------------------------------------*/
int
dpdk_link_devices(struct mtcp_thread_context *ctxt)
{
/* linking takes place during mtcp_init() */
return 0;
}
/*----------------------------------------------------------------------------*/
void
dpdk_release_pkt(struct mtcp_thread_context *ctxt, int ifidx, unsigned char *pkt_data, int len)
{
/*
* do nothing over here - memory reclamation
* will take place in dpdk_recv_pkts
*/
}
/*----------------------------------------------------------------------------*/
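/* Flush the staged TX mbufs of ifidx out to the wire, looping until the
* PMD has accepted the whole burst, then re-allocate fresh mbufs for the
* queue. Returns the packet count of the last rte_eth_tx_burst() call. */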
int
dpdk_send_pkts(struct mtcp_thread_context *ctxt, int ifidx)
{
struct dpdk_private_context *dpc;
#ifdef NETSTAT
mtcp_manager_t mtcp;
#endif
int ret, i, portid = CONFIG.eths[ifidx].ifindex;
dpc = (struct dpdk_private_context *)ctxt->io_private_context;
#ifdef NETSTAT
mtcp = ctxt->mtcp_manager;
#endif
ret = 0;
/* if there are packets in the queue... flush them out to the wire */
if (dpc->wmbufs[ifidx].len > 0) {
struct rte_mbuf **pkts;
#ifdef ENABLE_STATS_IOCTL
#ifdef NETSTAT
struct rte_eth_stats stats;
struct stats_struct ss;
#endif
#endif /* !ENABLE_STATS_IOCTL */
int cnt = dpc->wmbufs[ifidx].len;
pkts = dpc->wmbufs[ifidx].m_table;
#ifdef NETSTAT
mtcp->nstat.tx_packets[ifidx] += cnt;
#ifdef ENABLE_STATS_IOCTL
/* only pass stats after >= 1 sec interval */
if ((mtcp->cur_ts - dpc->cur_ts) >= 1000 &&
likely(dpc->fd >= 0)) {
/* rte_get_stats is global func, use only for 1 core */
if (ctxt->cpu == 0) {
rte_eth_stats_get(portid, &stats);
ss.rmiss = stats.imissed;
ss.rerr = stats.ierrors;
ss.terr = stats.oerrors;
} else
ss.rmiss = ss.rerr = ss.terr = 0;
ss.tx_pkts = mtcp->nstat.tx_packets[ifidx];
ss.tx_bytes = mtcp->nstat.tx_bytes[ifidx];
ss.rx_pkts = mtcp->nstat.rx_packets[ifidx];
ss.rx_bytes = mtcp->nstat.rx_bytes[ifidx];
ss.qid = ctxt->cpu;
ss.dev = portid;
/* pass the info now */
if (ioctl(dpc->fd, SEND_STATS, &ss) == -1)
TRACE_ERROR("Can't update iface stats!\n");
dpc->cur_ts = mtcp->cur_ts;
if (ctxt->cpu == 0)
rte_eth_stats_reset(portid);
}
#endif /* !ENABLE_STATS_IOCTL */
#endif
do {
/* tx cnt # of packets */
ret = rte_eth_tx_burst(portid, ctxt->cpu,
pkts, cnt);
pkts += ret;
cnt -= ret;
/* if not all pkts were sent... then repeat the cycle */
} while (cnt > 0);
/* time to allocate fresh mbufs for the queue */
for (i = 0; i < dpc->wmbufs[ifidx].len; i++) {
dpc->wmbufs[ifidx].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
/* error checking */
if (unlikely(dpc->wmbufs[ifidx].m_table[i] == NULL)) {
TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
ctxt->cpu, i, ifidx);
exit(EXIT_FAILURE);
}
}
/* reset the len of mbufs var after flushing of packets */
dpc->wmbufs[ifidx].len = 0;
}
return ret;
}
/*----------------------------------------------------------------------------*/
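/* Reserve the next staged TX mbuf of ifidx and return a pointer to its
* data area (NULL if a full burst is already queued). */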
uint8_t *
dpdk_get_wptr(struct mtcp_thread_context *ctxt, int ifidx, uint16_t pktsize)
{
struct dpdk_private_context *dpc;
#ifdef NETSTAT
mtcp_manager_t mtcp;
#endif
struct rte_mbuf *m;
uint8_t *ptr;
int len_of_mbuf;
dpc = (struct dpdk_private_context *) ctxt->io_private_context;
#ifdef NETSTAT
mtcp = ctxt->mtcp_manager;
#endif
/* sanity check */
if (unlikely(dpc->wmbufs[ifidx].len == MAX_PKT_BURST))
return NULL;
len_of_mbuf = dpc->wmbufs[ifidx].len;
m = dpc->wmbufs[ifidx].m_table[len_of_mbuf];
/* retrieve the right write offset */
ptr = rte_pktmbuf_mtod(m, uint8_t *);
m->pkt_len = m->data_len = pktsize;
m->nb_segs = 1;
m->next = NULL;
#ifdef NETSTAT
mtcp->nstat.tx_bytes[ifidx] += pktsize + ETHER_OVR;
#endif
/* increment the len_of_mbuf var */
dpc->wmbufs[ifidx].len = len_of_mbuf + 1;
return (uint8_t *)ptr;
}
/*----------------------------------------------------------------------------*/
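/* Free len mbufs from mtable. */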
static inline void
free_pkts(struct rte_mbuf **mtable, unsigned len)
{
unsigned i;
/* free the mbufs, prefetching the next entry to speed up the free */
for (i = 0; i < len; i++) {
rte_pktmbuf_free(mtable[i]);
if (i + 1 < len)
RTE_MBUF_PREFETCH_TO_FREE(mtable[i+1]);
}
}
/*----------------------------------------------------------------------------*/
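/* Free the mbufs held from the previous burst, then receive a new burst
* of up to MAX_PKT_BURST packets; returns the number received. */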
int32_t
dpdk_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx)
{
struct dpdk_private_context *dpc;
int ret;
dpc = (struct dpdk_private_context *) ctxt->io_private_context;
if (dpc->rmbufs[ifidx].len != 0) {
free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len);
dpc->rmbufs[ifidx].len = 0;
}
int portid = CONFIG.eths[ifidx].ifindex;
ret = rte_eth_rx_burst(portid, ctxt->cpu,
dpc->pkts_burst, MAX_PKT_BURST);
#ifdef RX_IDLE_ENABLE
dpc->rx_idle = (likely(ret != 0)) ? 0 : dpc->rx_idle + 1;
#endif
dpc->rmbufs[ifidx].len = ret;
return ret;
}
/*----------------------------------------------------------------------------*/
#ifdef IP_DEFRAG
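/* Feed an IPv4 fragment into the reassembly table. Returns the
* reassembled packet, the original mbuf if it needs no reassembly, or
* NULL if the fragment was absorbed and no packet is complete yet. */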
struct rte_mbuf *
ip_reassemble(struct dpdk_private_context *dpc, struct rte_mbuf *m)
{
struct ether_hdr *eth_hdr;
struct rte_ip_frag_tbl *tbl;
struct rte_ip_frag_death_row *dr;
/* if packet is IPv4 */
if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) {
struct ipv4_hdr *ip_hdr;
eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
ip_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
/* if it is a fragmented packet, then try to reassemble. */
if (rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
struct rte_mbuf *mo;
tbl = dpc->frag_tbl;
dr = &dpc->death_row;
/* prepare mbuf: setup l2_len/l3_len. */
m->l2_len = sizeof(*eth_hdr);
m->l3_len = sizeof(*ip_hdr);
/* process this fragment. */
mo = rte_ipv4_frag_reassemble_packet(tbl, dr, m, rte_rdtsc(), ip_hdr);
if (mo == NULL)
/* no packet to send out. */
return NULL;
/* we have our packet reassembled. */
if (mo != m)
m = mo;
}
}
/* if packet isn't IPv4, just accept it! */
return m;
}
#endif
/*----------------------------------------------------------------------------*/
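/* Return a pointer to (and the length of) the index-th packet of the
* last RX burst, or NULL if the NIC flagged a bad IP/L4 checksum. */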
uint8_t *
dpdk_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len)
{
struct dpdk_private_context *dpc;
struct rte_mbuf *m;
uint8_t *pktbuf;
dpc = (struct dpdk_private_context *) ctxt->io_private_context;
m = dpc->pkts_burst[index];
#ifdef IP_DEFRAG
m = ip_reassemble(dpc, m);
/* an incomplete fragment is absorbed into the table; nothing to deliver yet */
if (m == NULL) {
dpc->rmbufs[ifidx].m_table[index] = NULL;
*len = 0;
return NULL;
}
#endif
*len = m->pkt_len;
pktbuf = rte_pktmbuf_mtod(m, uint8_t *);
/* enqueue the pkt ptr in mbuf */
dpc->rmbufs[ifidx].m_table[index] = m;
/* verify checksum values from ol_flags */
if ((m->ol_flags & (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD)) != 0) {
TRACE_ERROR("%s(%p, %d, %d): mbuf with invalid checksum: "
"%p(%lu);\n",
__func__, ctxt, ifidx, index, m, m->ol_flags);
pktbuf = NULL;
}
#ifdef ENABLELRO
dpc->cur_rx_m = m;
#endif /* ENABLELRO */
return pktbuf;
}
/*----------------------------------------------------------------------------*/
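/* With RX_IDLE_ENABLE, back off with a short usleep() after
* RX_IDLE_THRESH consecutive empty polls. */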
int32_t
dpdk_select(struct mtcp_thread_context *ctxt)
{
#ifdef RX_IDLE_ENABLE
struct dpdk_private_context *dpc;
dpc = (struct dpdk_private_context *) ctxt->io_private_context;
if (dpc->rx_idle > RX_IDLE_THRESH) {
dpc->rx_idle = 0;
usleep(RX_IDLE_TIMEOUT);
}
#endif
return 0;
}
/*----------------------------------------------------------------------------*/
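/* Tear down everything dpdk_init_handle() set up for this thread. */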
void
dpdk_destroy_handle(struct mtcp_thread_context *ctxt)
{
struct dpdk_private_context *dpc;
int i;
dpc = (struct dpdk_private_context *) ctxt->io_private_context;
/* free wmbufs */
for (i = 0; i < num_devices_attached; i++)
free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST);
#ifdef ENABLE_STATS_IOCTL
/* free fd */
if (dpc->fd >= 0)
close(dpc->fd);
#endif /* !ENABLE_STATS_IOCTL */
/* free it all up */
free(dpc);
}
/*----------------------------------------------------------------------------*/
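/* Poll the link state of every port in port_mask (adapted from the DPDK
* examples): print per-port speed/duplex once all links are up, or give
* up after ~9 seconds. */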
static void
check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */
uint8_t portid, count, all_ports_up, print_flag = 0;
struct rte_eth_link link;
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
rte_eth_link_get_nowait(portid, &link);
/* print link status if flag set */
if (print_flag == 1) {
if (link.link_status)
printf("Port %d Link Up - speed %u "
"Mbps - %s\n", (uint8_t)portid,
(unsigned)link.link_speed,
(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
("full-duplex") : ("half-duplex\n"));
else
printf("Port %d Link Down\n",
(uint8_t)portid);
continue;
}
/* clear all_ports_up flag if any link down */
if (link.link_status == 0) {
all_ports_up = 0;
break;
}
}
/* after finally printing all link status, get out */
if (print_flag == 1)
break;
if (all_ports_up == 0) {
printf(".");
fflush(stdout);
rte_delay_ms(CHECK_INTERVAL);
}
/* set the print_flag if all ports up or timeout */
if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
print_flag = 1;
printf("done\n");
}
}
}
/*----------------------------------------------------------------------------*/
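/* One-time module init: create per-core mbuf pools, configure one RX
* and one TX queue per core on every attached port, start the ports
* with flow control disabled, and wait for links to come up. Secondary
* multi-process instances only look up the master's mempools. */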
void
dpdk_load_module(void)
{
int portid, rxlcore_id, ret;
/* for Ethernet flow control settings */
struct rte_eth_fc_conf fc_conf;
/* setting the rss key */
static uint8_t key[] = {
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 10 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 20 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 30 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 40 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 50 */
0x05, 0x05 /* 52 */
};
port_conf.rx_adv_conf.rss_conf.rss_key = (uint8_t *)key;
port_conf.rx_adv_conf.rss_conf.rss_key_len = sizeof(key);
if (!CONFIG.multi_process || CONFIG.multi_process_is_master) {
for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) {
char name[RTE_MEMPOOL_NAMESIZE];
uint32_t nb_mbuf;
sprintf(name, "mbuf_pool-%d", rxlcore_id);
nb_mbuf = NB_MBUF;
#ifdef IP_DEFRAG
int max_flows;
max_flows = CONFIG.max_concurrency / CONFIG.num_cores;
/*
* At any given moment up to <max_flows * (MAX_FRAG_NUM)>
* mbufs could be stored in the fragment table.
* Plus, each TX queue can hold up to <max_flows> packets.
*/
nb_mbuf = RTE_MAX(max_flows, 2UL * MAX_PKT_BURST) * MAX_FRAG_NUM;
nb_mbuf *= (port_conf.rxmode.max_rx_pkt_len + BUF_SIZE - 1) / BUF_SIZE;
nb_mbuf += RTE_TEST_RX_DESC_DEFAULT + RTE_TEST_TX_DESC_DEFAULT;
nb_mbuf = RTE_MAX(nb_mbuf, (uint32_t)NB_MBUF);
#endif
/* create the mbuf pools */
pktmbuf_pool[rxlcore_id] =
rte_mempool_create(name, nb_mbuf,
MBUF_SIZE, MEMPOOL_CACHE_SIZE,
sizeof(struct rte_pktmbuf_pool_private),
rte_pktmbuf_pool_init, NULL,
rte_pktmbuf_init, NULL,
rte_socket_id(), MEMPOOL_F_SP_PUT |
MEMPOOL_F_SC_GET);
if (pktmbuf_pool[rxlcore_id] == NULL)
rte_exit(EXIT_FAILURE, "Cannot init mbuf pool, errno: %d\n",
rte_errno);
}
/* Initialise each port */
int i;
for (i = 0; i < num_devices_attached; ++i) {
/* get portid from the index of attached devices */
portid = devices_attached[i];
/* check port capabilities */
rte_eth_dev_info_get(portid, &dev_info[portid]);
#if RTE_VERSION >= RTE_VERSION_NUM(18, 2, 0, 0) /* DPDK 18.02 or later */
/* re-adjust rss_hf */
port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info[portid].flow_type_rss_offloads;
#endif
/* init port */
printf("Initializing port %u... ", (unsigned) portid);
fflush(stdout);
ret = rte_eth_dev_configure(portid, CONFIG.num_cores, CONFIG.num_cores, &port_conf);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%u, cores: %d\n",
ret, (unsigned) portid, CONFIG.num_cores);
/* init one RX queue per CPU */
fflush(stdout);
#ifdef DEBUG
rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
#endif
for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) {
ret = rte_eth_rx_queue_setup(portid, rxlcore_id, nb_rxd,
rte_eth_dev_socket_id(portid), &rx_conf,
pktmbuf_pool[rxlcore_id]);
if (ret < 0)
rte_exit(EXIT_FAILURE,
"rte_eth_rx_queue_setup:err=%d, port=%u, queueid: %d\n",
ret, (unsigned) portid, rxlcore_id);
}
/* init one TX queue on each port per CPU (this is redundant for this app) */
fflush(stdout);
for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) {
ret = rte_eth_tx_queue_setup(portid, rxlcore_id, nb_txd,
rte_eth_dev_socket_id(portid), &tx_conf);
if (ret < 0)
rte_exit(EXIT_FAILURE,
"rte_eth_tx_queue_setup:err=%d, port=%u, queueid: %d\n",
ret, (unsigned) portid, rxlcore_id);
}
/* Start device */
ret = rte_eth_dev_start(portid);
if (ret < 0)
rte_exit(EXIT_FAILURE, "rte_eth_dev_start:err=%d, port=%u\n",
ret, (unsigned) portid);
printf("done: \n");
rte_eth_promiscuous_enable(portid);
/* retrieve current flow control settings per port */
memset(&fc_conf, 0, sizeof(fc_conf));
ret = rte_eth_dev_flow_ctrl_get(portid, &fc_conf);
if (ret != 0)
TRACE_INFO("Failed to get flow control info!\n");
/* and just disable the rx/tx flow control */
fc_conf.mode = RTE_FC_NONE;
ret = rte_eth_dev_flow_ctrl_set(portid, &fc_conf);
if (ret != 0)
TRACE_INFO("Failed to set flow control info!: errno: %d\n",
ret);
#ifdef DEBUG
printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
(unsigned) portid,
ports_eth_addr[portid].addr_bytes[0],
ports_eth_addr[portid].addr_bytes[1],
ports_eth_addr[portid].addr_bytes[2],
ports_eth_addr[portid].addr_bytes[3],
ports_eth_addr[portid].addr_bytes[4],
ports_eth_addr[portid].addr_bytes[5]);
#endif
}
/* only check for link status if the thread is master */
check_all_ports_link_status(num_devices_attached, 0xFFFFFFFF);
} else { /* CONFIG.multi_process && !CONFIG.multi_process_is_master */
for (rxlcore_id = 0; rxlcore_id < CONFIG.num_cores; rxlcore_id++) {
char name[RTE_MEMPOOL_NAMESIZE];
sprintf(name, "mbuf_pool-%d", rxlcore_id);
/* initialize the mbuf pools */
pktmbuf_pool[rxlcore_id] =
rte_mempool_lookup(name);
if (pktmbuf_pool[rxlcore_id] == NULL)
rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
}
int i;
/* initializing dev_info struct */
for (i = 0; i < num_devices_attached; i++) {
/* get portid from the index of attached devices */
portid = devices_attached[i];
/* check port capabilities */
rte_eth_dev_info_get(portid, &dev_info[portid]);
}
}
}
/*----------------------------------------------------------------------------*/
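/* Per-device ioctls: report the driver name, arm IP/TCP checksum
* offload flags on the most recently staged TX mbuf (pre-seeding the
* TCP checksum with the IPv4 pseudo-header sum), probe offload
* capabilities, and, with LRO, linearize a multi-segment payload into
* the caller's buffer. */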
int32_t
dpdk_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp)
{
struct dpdk_private_context *dpc;
struct rte_mbuf *m;
int len_of_mbuf;
struct iphdr *iph;
struct tcphdr *tcph;
void **argpptr = (void **)argp;
#ifdef ENABLELRO
uint8_t *payload, *to;
int seg_off;
#endif
if (cmd == DRV_NAME) {
*argpptr = (void *)dev_info[nif].driver_name;
return 0;
}
int eidx = CONFIG.nif_to_eidx[nif];
iph = (struct iphdr *)argp;
dpc = (struct dpdk_private_context *)ctx->io_private_context;
len_of_mbuf = dpc->wmbufs[eidx].len;
switch (cmd) {
case PKT_TX_IP_CSUM:
if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0)
goto dev_ioctl_err;
m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1];
m->ol_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4;
m->l2_len = sizeof(struct ether_hdr);
m->l3_len = (iph->ihl<<2);
break;
case PKT_TX_TCP_CSUM:
if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0)
goto dev_ioctl_err;
m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1];
tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl<<2));
m->ol_flags |= PKT_TX_TCP_CKSUM;
tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags);
break;
#ifdef ENABLELRO
case PKT_RX_TCP_LROSEG:
m = dpc->cur_rx_m;
//if (m->next != NULL)
// rte_prefetch0(rte_pktmbuf_mtod(m->next, void *));
iph = rte_pktmbuf_mtod_offset(m, struct iphdr *, sizeof(struct ether_hdr));
tcph = (struct tcphdr *)((u_char *)iph + (iph->ihl << 2));
payload = (uint8_t *)tcph + (tcph->doff << 2);
seg_off = m->data_len -
sizeof(struct ether_hdr) - (iph->ihl << 2) -
(tcph->doff << 2);
to = (uint8_t *) argp;
m = m->next;
memcpy(to, payload, seg_off);
while (m != NULL) {
//if (m->next != NULL)
// rte_prefetch0(rte_pktmbuf_mtod(m->next, void *));
memcpy(to + seg_off,
rte_pktmbuf_mtod(m, uint8_t *),
m->data_len);
seg_off += m->data_len;
m = m->next;
}
break;
#endif
case PKT_TX_TCPIP_CSUM:
if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0)
goto dev_ioctl_err;
if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0)
goto dev_ioctl_err;
m = dpc->wmbufs[eidx].m_table[len_of_mbuf - 1];
iph = rte_pktmbuf_mtod_offset(m, struct iphdr *, sizeof(struct ether_hdr));
tcph = (struct tcphdr *)((uint8_t *)iph + (iph->ihl<<2));
m->l2_len = sizeof(struct ether_hdr);
m->l3_len = (iph->ihl<<2);
m->l4_len = (tcph->doff<<2);
m->ol_flags = PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4;
tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags);
break;
case PKT_RX_IP_CSUM:
if ((dev_info[nif].rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) == 0)
goto dev_ioctl_err;
break;
case PKT_RX_TCP_CSUM:
if ((dev_info[nif].rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM) == 0)
goto dev_ioctl_err;
break;
case PKT_TX_TCPIP_CSUM_PEEK:
if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) == 0)
goto dev_ioctl_err;
if ((dev_info[nif].tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) == 0)
goto dev_ioctl_err;
break;
default:
goto dev_ioctl_err;
}
return 0;
dev_ioctl_err:
return -1;
}
/*----------------------------------------------------------------------------*/
io_module_func dpdk_module_func = {
.load_module = dpdk_load_module,
.init_handle = dpdk_init_handle,
.link_devices = dpdk_link_devices,
.release_pkt = dpdk_release_pkt,
.send_pkts = dpdk_send_pkts,
.get_wptr = dpdk_get_wptr,
.recv_pkts = dpdk_recv_pkts,
.get_rptr = dpdk_get_rptr,
.select = dpdk_select,
.destroy_handle = dpdk_destroy_handle,
.dev_ioctl = dpdk_dev_ioctl
};
/*----------------------------------------------------------------------------*/
#else
io_module_func dpdk_module_func = {
.load_module = NULL,
.init_handle = NULL,
.link_devices = NULL,
.release_pkt = NULL,
.send_pkts = NULL,
.get_wptr = NULL,
.recv_pkts = NULL,
.get_rptr = NULL,
.select = NULL,
.destroy_handle = NULL,
.dev_ioctl = NULL
};
/*----------------------------------------------------------------------------*/
#endif /* !DISABLE_DPDK */