diff --git a/config.ini b/config.ini
index f5f8103b..d030042d 100644
--- a/config.ini
+++ b/config.ini
@@ -9,6 +9,8 @@ channel=4
 nb_ports=1
 promiscuous=1
 numa_on=1
+## TCP segment offload, default: disabled.
+tso=0
 
 ## Port config section
 ## According to dpdk.nb_ports: port0, port1...
@@ -19,8 +21,6 @@ broadcast=192.168.1.255
 gateway=192.168.1.1
 ## Packet capture path, this will hurt performance
 #pcap=./a.pcap
-## Strip vlan tag, such as EC2 etc.
-#vlanstrip=1
 
 ## Kni config: if enabled and method=reject,
 ## all packets that do not belong to the following tcp_port and udp_port
diff --git a/lib/ff_api.symlist b/lib/ff_api.symlist
index a5d3c28b..9545a677 100644
--- a/lib/ff_api.symlist
+++ b/lib/ff_api.symlist
@@ -40,3 +40,4 @@ ff_mbuf_gethdr
 ff_mbuf_get
 ff_mbuf_free
 ff_mbuf_copydata
+ff_mbuf_tx_offload
\ No newline at end of file
diff --git a/lib/ff_config.c b/lib/ff_config.c
index 45080bad..5e66e0bd 100644
--- a/lib/ff_config.c
+++ b/lib/ff_config.c
@@ -154,8 +154,6 @@ port_cfg_handler(struct ff_config *cfg, const char *section,
         cur->gateway = strdup(value);
     } else if (strcmp(name, "pcap") == 0) {
         cur->pcap = strdup(value);
-    } else if (strcmp(name, "vlanstrip") == 0) {
-        cur->vlanstrip = atoi(value);
     }
 
     return 1;
@@ -186,6 +184,8 @@ handler(void* user, const char* section, const char* name,
         pconfig->dpdk.promiscuous = atoi(value);
     } else if (MATCH("dpdk", "numa_on")) {
         pconfig->dpdk.numa_on = atoi(value);
+    } else if (MATCH("dpdk", "tso")) {
+        pconfig->dpdk.tso = atoi(value);
     } else if (MATCH("kni", "enable")) {
         pconfig->kni.enable= atoi(value);
     } else if (MATCH("kni", "method")) {
diff --git a/lib/ff_config.h b/lib/ff_config.h
index 08ad87bb..08cac0d2 100644
--- a/lib/ff_config.h
+++ b/lib/ff_config.h
@@ -34,11 +34,19 @@ extern int dpdk_argc;
 
 extern char *dpdk_argv[DPDK_CONFIG_NUM + 1];
 
+struct ff_hw_features {
+    uint8_t rx_csum;
+    uint8_t rx_lro;
+    uint8_t tx_csum_ip;
+    uint8_t tx_csum_l4;
+    uint8_t tx_tso;
+};
+
 struct ff_port_cfg {
     char *name;
     uint8_t port_id;
     uint8_t mac[6];
-    uint8_t vlanstrip;
+    struct ff_hw_features hw_features;
     char *addr;
     char *netmask;
     char *broadcast;
@@ -72,6 +80,7 @@ struct ff_config {
         int nb_ports;
         int promiscuous;
         int numa_on;
+        int tso;
         struct ff_port_cfg *port_cfgs;
     } dpdk;
 
diff --git a/lib/ff_dpdk_if.c b/lib/ff_dpdk_if.c
index e77d89a4..53690bc5 100644
--- a/lib/ff_dpdk_if.c
+++ b/lib/ff_dpdk_if.c
@@ -45,6 +45,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "ff_dpdk_if.h"
 #include "ff_dpdk_pcap.h"
@@ -154,7 +157,8 @@
 struct ff_dpdk_if_context {
     void *sc;
     void *ifp;
     uint16_t port_id;
-};
+    struct ff_hw_features hw_features;
+} __rte_cache_aligned;
 
 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
@@ -178,6 +182,7 @@ ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
     ctx->sc = sc;
     ctx->ifp = ifp;
     ctx->port_id = cfg->port_id;
+    ctx->hw_features = cfg->hw_features;
 
     return ctx;
 }
@@ -478,10 +483,10 @@ init_arp_ring(void)
         if (arp_ring[i][port_id] == NULL)
             rte_panic("create kni ring::%s failed!\n", name_buf);
-
+
         if (rte_ring_lookup(name_buf) != arp_ring[i][port_id])
             rte_panic("lookup kni ring:%s failed!\n", name_buf);
-
+
         printf("create arp ring:%s success, %u ring entries are now free!\n",
             name_buf, rte_ring_free_count(arp_ring[i][port_id]));
     }
@@ -550,20 +555,97 @@ init_port_start(void)
             addr.addr_bytes[2], addr.addr_bytes[3],
             addr.addr_bytes[4], addr.addr_bytes[5]);
 
-        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[port_id].mac,
+        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
             addr.addr_bytes, ETHER_ADDR_LEN);
 
+        /* Clear txq_flags - we do not need multi-mempool and refcnt */
+        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
+            ETH_TXQ_FLAGS_NOREFCOUNT;
+
+        /* Disable features that are not supported by port's HW */
+        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
+            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
+        }
+
+        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
+            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
+        }
+
+        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
+            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
+        }
+
+        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
+            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
+        }
+
+        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
+            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
+        }
+
+        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
+            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
+            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+        }
+
+        struct rte_eth_conf port_conf = {0};
+
+        /* Set RSS mode */
+        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
+        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
+        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
+
+        /* Set Rx VLAN stripping */
+        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
+            port_conf.rxmode.hw_vlan_strip = 1;
+        }
+
+        /* Enable HW CRC stripping */
+        port_conf.rxmode.hw_strip_crc = 1;
+
+        /* FIXME: Enable TCP LRO ?*/
+        #if 0
+        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
+            printf("LRO is supported\n");
+            port_conf.rxmode.enable_lro = 1;
+            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
+        }
+        #endif
+
+        /* Set Rx checksum checking */
+        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
+            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
+            printf("RX checksum offload supported\n");
+            port_conf.rxmode.hw_ip_checksum = 1;
+            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
+        }
+
+        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
+            printf("TX ip checksum offload supported\n");
+            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
+        }
+
+        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
+            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
+            printf("TX TCP&UDP checksum offload supported\n");
+            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
+        }
+
+        if (ff_global_cfg.dpdk.tso) {
+            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
+                printf("TSO is supported\n");
+                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
+            }
+        } else {
+            printf("TSO is disabled\n");
+        }
+
         if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
             return 0;
         }
 
-        /*
-         * TODO:
-         * Set port conf according to dev's capability.
-         */
-        struct rte_eth_conf port_conf = default_port_conf;
-        port_conf.rxmode.hw_vlan_strip = ff_global_cfg.dpdk.port_cfgs[port_id].vlanstrip;
-
         /* Currently, proc id 1:1 map to queue id per port. */
         int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
         if (ret != 0) {
@@ -573,13 +655,13 @@ init_port_start(void)
         uint16_t q;
         for (q = 0; q < nb_procs; q++) {
             ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
-                socketid, &dev_info.default_txconf);
+                socketid, &dev_info.default_txconf);
             if (ret < 0) {
                 return ret;
             }
 
             ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
-                socketid, &dev_info.default_rxconf, mbuf_pool);
+                socketid, &dev_info.default_rxconf, mbuf_pool);
             if (ret < 0) {
                 return ret;
             }
@@ -602,8 +684,8 @@ init_port_start(void)
         }
 
         /* Enable pcap dump */
-        if (ff_global_cfg.dpdk.port_cfgs[port_id].pcap) {
-            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[port_id].pcap);
+        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
+            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
         }
     }
 
@@ -667,12 +749,24 @@ ff_dpdk_init(int argc, char **argv)
 }
 
 static void
-ff_veth_input(void *ifp, struct rte_mbuf *pkt)
+ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
 {
+    uint8_t rx_csum = ctx->hw_features.rx_csum;
+    if (rx_csum) {
+        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
+            return;
+        }
+    }
+
+    /*
+     * FIXME: should we save pkt->vlan_tci
+     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
+     */
+
     void *data = rte_pktmbuf_mtod(pkt, void*);
     uint16_t len = rte_pktmbuf_data_len(pkt);
 
-    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len);
+    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
     if (hdr == NULL) {
         rte_pktmbuf_free(pkt);
         return;
@@ -693,7 +787,7 @@ ff_veth_input(void *ifp, struct rte_mbuf *pkt)
         prev = mb;
     }
 
-    ff_veth_process_packet(ifp, hdr);
+    ff_veth_process_packet(ctx->ifp, hdr);
 }
 
 static enum FilterReturn
@@ -721,7 +815,7 @@ protocol_filter(const void *data, uint16_t len)
 
 static inline void
 process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
-    uint16_t count, void *ifp, int pkts_from_ring)
+    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
 {
     struct lcore_conf *qconf = &lcore_conf;
 
@@ -764,19 +858,19 @@ process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
                 }
             }
 
-            ff_veth_input(ifp, rtem);
+            ff_veth_input(ctx, rtem);
         } else if (enable_kni &&
             ((filter == FILTER_KNI && kni_accept) ||
             (filter == FILTER_UNKNOWN && !kni_accept)) ) {
             ff_kni_enqueue(port_id, rtem);
         } else {
-            ff_veth_input(ifp, rtem);
+            ff_veth_input(ctx, rtem);
        }
    }
 }
 
 static inline int
 process_arp_ring(uint8_t port_id, uint16_t queue_id,
-    struct rte_mbuf **pkts_burst, void *ifp)
+    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
 {
     /* read packet from ring buf and to process */
     uint16_t nb_tx;
@@ -784,7 +878,7 @@ process_arp_ring(uint8_t port_id, uint16_t queue_id,
         (void **)pkts_burst, MAX_PKT_BURST);
 
     if(nb_tx > 0) {
-        process_packets(port_id, queue_id, pkts_burst, nb_tx, ifp, 1);
+        process_packets(port_id, queue_id, pkts_burst, nb_tx, ctx, 1);
     }
 
     return 0;
@@ -852,12 +946,13 @@ ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
     }
 
     head->pkt_len = total;
+    head->nb_segs = 0;
 
     int off = 0;
     struct rte_mbuf *cur = head, *prev = NULL;
     while(total > 0) {
         if (cur == NULL) {
-            struct rte_mbuf *cur = rte_pktmbuf_alloc(mbuf_pool);
+            cur = rte_pktmbuf_alloc(mbuf_pool);
             if (cur == NULL) {
                 rte_pktmbuf_free(head);
                 ff_mbuf_free(m);
@@ -874,23 +969,46 @@ ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
                 return -1;
             }
 
-        if (prev == NULL) {
-            prev = cur;
-        } else {
+        if (prev != NULL) {
             prev->next = cur;
         }
+        prev = cur;
         cur->data_len = len;
         off += len;
         total -= len;
+        head->nb_segs++;
+        cur = NULL;
     }
 
-    /*
-     * FIXME: set offload flags according to mbuf.pkthdr;
-     */
-    head->ol_flags = 0;
-    head->vlan_tci = 0;
+    struct ff_tx_offload offload = {0};
+    ff_mbuf_tx_offload(m, &offload);
+
+    if (offload.ip_csum) {
+        head->ol_flags |= PKT_TX_IP_CKSUM;
+        head->l2_len = sizeof(struct ether_hdr);
+        head->l3_len = sizeof(struct ipv4_hdr);
+    }
+
+    if (ctx->hw_features.tx_csum_l4) {
+        if (offload.tcp_csum) {
+            head->ol_flags |= PKT_TX_TCP_CKSUM;
+            head->l2_len = sizeof(struct ether_hdr);
+            head->l3_len = sizeof(struct ipv4_hdr);
+        }
+
+        if (offload.tso_seg_size) {
+            head->ol_flags |= PKT_TX_TCP_SEG;
+            head->l4_len = sizeof(struct tcp_hdr);
+            head->tso_segsz = offload.tso_seg_size;
+        }
+
+        if (offload.udp_csum) {
+            head->ol_flags |= PKT_TX_UDP_CKSUM;
+            head->l2_len = sizeof(struct ether_hdr);
+            head->l3_len = sizeof(struct ipv4_hdr);
+        }
+    }
 
     ff_mbuf_free(m);
@@ -910,7 +1028,7 @@ main_loop(void *arg)
     struct lcore_conf *qconf;
     const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
         US_PER_S * BURST_TX_DRAIN_US;
-    void *ifp;
+    struct ff_dpdk_if_context *ctx;
 
     prev_tsc = 0;
@@ -955,13 +1073,13 @@ main_loop(void *arg)
         for (i = 0; i < qconf->nb_rx_queue; ++i) {
             port_id = qconf->rx_queue_list[i].port_id;
             queue_id = qconf->rx_queue_list[i].queue_id;
-            ifp = veth_ctx[port_id]->ifp;
+            ctx = veth_ctx[port_id];
 
             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
             }
 
-            process_arp_ring(port_id, queue_id, pkts_burst, ifp);
+            process_arp_ring(port_id, queue_id, pkts_burst, ctx);
 
             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                 MAX_PKT_BURST);
@@ -978,12 +1096,12 @@ main_loop(void *arg)
             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                     j + PREFETCH_OFFSET], void *));
-                process_packets(port_id, queue_id, &pkts_burst[j], 1, ifp, 0);
+                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
             }
 
             /* Handle remaining prefetched packets */
             for (; j < nb_rx; j++) {
-                process_packets(port_id, queue_id, &pkts_burst[j], 1, ifp, 0);
+                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
             }
         }
diff --git a/lib/ff_dpdk_if.h b/lib/ff_dpdk_if.h
index ccfc7b8e..59c7994e 100644
--- a/lib/ff_dpdk_if.h
+++ b/lib/ff_dpdk_if.h
@@ -43,6 +43,14 @@ void ff_dpdk_run(loop_func_t loop, void *arg);
 struct ff_dpdk_if_context;
 struct ff_port_cfg;
 
+struct ff_tx_offload {
+    uint8_t ip_csum;
+    uint8_t tcp_csum;
+    uint8_t udp_csum;
+    uint8_t sctp_csum;
+    uint16_t tso_seg_size;
+};
+
 struct ff_dpdk_if_context *ff_dpdk_register_if(void *sc, void *ifp,
     struct ff_port_cfg *cfg);
 void ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx);
diff --git a/lib/ff_veth.c b/lib/ff_veth.c
index ac5012f8..6d7e76c0 100644
--- a/lib/ff_veth.c
+++ b/lib/ff_veth.c
@@ -143,6 +143,31 @@ ff_mbuf_copydata(void *m, void *data, int off, int len)
     return 0;
 }
 
+void
+ff_mbuf_tx_offload(void *m, struct ff_tx_offload *offload)
+{
+    struct mbuf *mb = (struct mbuf *)m;
+    if (mb->m_pkthdr.csum_flags & CSUM_IP) {
+        offload->ip_csum = 1;
+    }
+
+    if (mb->m_pkthdr.csum_flags & CSUM_TCP) {
+        offload->tcp_csum = 1;
+    }
+
+    if (mb->m_pkthdr.csum_flags & CSUM_UDP) {
+        offload->udp_csum = 1;
+    }
+
+    if (mb->m_pkthdr.csum_flags & CSUM_SCTP) {
+        offload->sctp_csum = 1;
+    }
+
+    if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
+        offload->tso_seg_size = mb->m_pkthdr.tso_segsz;
+    }
+}
+
 void
 ff_mbuf_free(void *m)
 {
@@ -156,7 +181,8 @@ ff_mbuf_ext_free(struct mbuf *m, void *arg1, void *arg2)
 }
 
 void *
-ff_mbuf_gethdr(void *pkt, uint16_t total, void *data, uint16_t len)
+ff_mbuf_gethdr(void *pkt, uint16_t total, void *data,
+    uint16_t len, uint8_t rx_csum)
 {
     struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
     if (m == NULL) {
@@ -174,6 +200,12 @@ ff_mbuf_gethdr(void *pkt, uint16_t total, void *data, uint16_t len)
     m->m_next = NULL;
     m->m_nextpkt = NULL;
 
+    if (rx_csum) {
+        m->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID |
+            CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+        m->m_pkthdr.csum_data = 0xffff;
+    }
+
     return (void *)m;
 }
 
@@ -295,7 +327,23 @@ ff_veth_setup_interface(struct ff_veth_softc *sc, struct ff_port_cfg *cfg)
     ifp->if_transmit = ff_veth_transmit;
     ifp->if_qflush = ff_veth_qflush;
     ether_ifattach(ifp, sc->mac);
-    ifp->if_capabilities = ifp->if_capenable = 0;
+
+    if (cfg->hw_features.rx_csum) {
+        ifp->if_capabilities |= IFCAP_RXCSUM;
+    }
+    if (cfg->hw_features.tx_csum_ip) {
+        ifp->if_capabilities |= IFCAP_TXCSUM;
+        ifp->if_hwassist |= CSUM_IP;
+    }
+    if (cfg->hw_features.tx_csum_l4) {
+        ifp->if_hwassist |= CSUM_DELAY_DATA;
+    }
+    if (cfg->hw_features.tx_tso) {
+        ifp->if_capabilities |= IFCAP_TSO;
+        ifp->if_hwassist |= CSUM_TSO;
+    }
+
+    ifp->if_capenable = ifp->if_capabilities;
 
     sc->host_ctx = ff_dpdk_register_if((void *)sc, (void *)sc->ifp, cfg);
     if (sc->host_ctx == NULL) {
diff --git a/lib/ff_veth.h b/lib/ff_veth.h
index 5b452fef..920a491c 100644
--- a/lib/ff_veth.h
+++ b/lib/ff_veth.h
@@ -31,12 +31,16 @@ struct ff_port_cfg;
 void *ff_veth_attach(struct ff_port_cfg *cfg);
 int ff_veth_detach(void *arg);
 
-void *ff_mbuf_gethdr(void *pkt, uint16_t total, void *data, uint16_t len);
+void *ff_mbuf_gethdr(void *pkt, uint16_t total, void *data,
+    uint16_t len, uint8_t rx_csum);
 void *ff_mbuf_get(void *m, void *data, uint16_t len);
 void ff_mbuf_free(void *m);
 int ff_mbuf_copydata(void *m, void *data, int off, int len);
 
+struct ff_tx_offload;
+void ff_mbuf_tx_offload(void *m, struct ff_tx_offload *offload);
+
 void ff_veth_process_packet(void *arg, void *m);