From d62644b46a86bce2c069af7122501e3ffd349d3c Mon Sep 17 00:00:00 2001 From: Ed Swierk via Qemu-devel Date: Tue, 14 Nov 2017 15:23:34 -0800 Subject: [PATCH] e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption The device is supposed to maintain two distinct contexts for transmit offloads: one has parameters for both segmentation and checksum offload, the other only for checksum offload. The guest driver can send two context descriptors, one for each context (the TSE flag specifies which). Then the guest can refer to one or the other context in subsequent transmit data descriptors, depending on what offloads it wants applied to each packet. Currently the e1000 device stores just one context, and misinterprets the TSE flags in the context and data descriptors. This is often okay: Linux happens to send a fresh context descriptor before every data descriptor, so forgetting the other context doesn't matter. Windows does rely on separate contexts for TSO vs. non-TSO packets, but for mostly-TCP traffic the two contexts have identical TCP-specific offload parameters so confusing them doesn't matter. One case where this confusion matters is when a Windows guest sets up a TSO context for TCP and a non-TSO context for UDP, and then transmits both TCP and UDP traffic in parallel. The e1000 device sometimes ends up using TCP-specific parameters while doing checksum offload on a UDP datagram: it writes the checksum to offset 16 (the correct location for a TCP checksum), stomping on two bytes of UDP data, and leaving the wrong value in the actual UDP checksum field at offset 6. (Even worse, the host network stack may then recompute the UDP checksum, "correcting" it to match the corrupt data before sending it out a physical interface.) Correct this by tracking the TSO context independently of the non-TSO context, and selecting the appropriate context based on the TSE flag in each transmit data descriptor. Signed-off-by: Ed Swierk Signed-off-by: Jason Wang --- hw/net/e1000.c | 70 ++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/hw/net/e1000.c b/hw/net/e1000.c index 30aef93dad..804ec08721 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -101,6 +101,7 @@ typedef struct E1000State_st { unsigned char sum_needed; bool cptse; e1000x_txd_props props; + e1000x_txd_props tso_props; uint16_t tso_frames; } tx; @@ -541,35 +542,37 @@ xmit_seg(E1000State *s) uint16_t len; unsigned int frames = s->tx.tso_frames, css, sofar; struct e1000_tx *tp = &s->tx; + struct e1000x_txd_props *props = tp->cptse ? &tp->tso_props : &tp->props; - if (tp->props.tse && tp->cptse) { - css = tp->props.ipcss; + if (tp->cptse) { + css = props->ipcss; DBGOUT(TXSUM, "frames %d size %d ipcss %d\n", frames, tp->size, css); - if (tp->props.ip) { /* IPv4 */ + if (props->ip) { /* IPv4 */ stw_be_p(tp->data+css+2, tp->size - css); stw_be_p(tp->data+css+4, lduw_be_p(tp->data + css + 4) + frames); } else { /* IPv6 */ stw_be_p(tp->data+css+4, tp->size - css); } - css = tp->props.tucss; + css = props->tucss; len = tp->size - css; - DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", tp->props.tcp, css, len); - if (tp->props.tcp) { - sofar = frames * tp->props.mss; + DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", props->tcp, css, len); + if (props->tcp) { + sofar = frames * props->mss; stl_be_p(tp->data+css+4, ldl_be_p(tp->data+css+4)+sofar); /* seq */ - if (tp->props.paylen - sofar > tp->props.mss) { + if (props->paylen - sofar > props->mss) { tp->data[css + 13] &= ~9; /* PSH, FIN */ } else if (frames) { e1000x_inc_reg_if_not_full(s->mac_reg, TSCTC); } - } else /* UDP */ + } else { /* UDP */ stw_be_p(tp->data+css+4, len); + } if (tp->sum_needed & E1000_TXD_POPTS_TXSM) { unsigned int phsum; // add pseudo-header length before checksum calculation - void *sp = tp->data + tp->props.tucso; + void *sp = tp->data + props->tucso; phsum = lduw_be_p(sp) + len; phsum = (phsum >> 16) + (phsum & 0xffff); @@ -579,12 +582,10 @@ xmit_seg(E1000State *s) } if (tp->sum_needed & E1000_TXD_POPTS_TXSM) { - putsum(tp->data, tp->size, tp->props.tucso, - tp->props.tucss, tp->props.tucse); + putsum(tp->data, tp->size, props->tucso, props->tucss, props->tucse); } if (tp->sum_needed & E1000_TXD_POPTS_IXSM) { - putsum(tp->data, tp->size, tp->props.ipcso, - tp->props.ipcss, tp->props.ipcse); + putsum(tp->data, tp->size, props->ipcso, props->ipcss, props->ipcse); } if (tp->vlan_needed) { memmove(tp->vlan, tp->data, 4); @@ -616,11 +617,11 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE); if (dtype == E1000_TXD_CMD_DEXT) { /* context descriptor */ - e1000x_read_tx_ctx_descr(xp, &tp->props); - tp->tso_frames = 0; - if (tp->props.tucso == 0) { /* this is probably wrong */ - DBGOUT(TXSUM, "TCP/UDP: cso 0!\n"); - tp->props.tucso = tp->props.tucss + (tp->props.tcp ? 16 : 6); + if (le32_to_cpu(xp->cmd_and_length) & E1000_TXD_CMD_TSE) { + e1000x_read_tx_ctx_descr(xp, &tp->tso_props); + tp->tso_frames = 0; + } else { + e1000x_read_tx_ctx_descr(xp, &tp->props); } return; } else if (dtype == (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)) { @@ -645,8 +646,8 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) } addr = le64_to_cpu(dp->buffer_addr); - if (tp->props.tse && tp->cptse) { - msh = tp->props.hdr_len + tp->props.mss; + if (tp->cptse) { + msh = tp->tso_props.hdr_len + tp->tso_props.mss; do { bytes = split_size; if (tp->size + bytes > msh) @@ -655,21 +656,19 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) bytes = MIN(sizeof(tp->data) - tp->size, bytes); pci_dma_read(d, addr, tp->data + tp->size, bytes); sz = tp->size + bytes; - if (sz >= tp->props.hdr_len && tp->size < tp->props.hdr_len) { - memmove(tp->header, tp->data, tp->props.hdr_len); + if (sz >= tp->tso_props.hdr_len + && tp->size < tp->tso_props.hdr_len) { + memmove(tp->header, tp->data, tp->tso_props.hdr_len); } tp->size = sz; addr += bytes; if (sz == msh) { xmit_seg(s); - memmove(tp->data, tp->header, tp->props.hdr_len); - tp->size = tp->props.hdr_len; + memmove(tp->data, tp->header, tp->tso_props.hdr_len); + tp->size = tp->tso_props.hdr_len; } split_size -= bytes; } while (bytes && split_size); - } else if (!tp->props.tse && tp->cptse) { - // context descriptor TSE is not set, while data descriptor TSE is set - DBGOUT(TXERR, "TCP segmentation error\n"); } else { split_size = MIN(sizeof(tp->data) - tp->size, split_size); pci_dma_read(d, addr, tp->data + tp->size, split_size); @@ -678,7 +677,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) if (!(txd_lower & E1000_TXD_CMD_EOP)) return; - if (!(tp->props.tse && tp->cptse && tp->size < tp->props.hdr_len)) { + if (!(tp->cptse && tp->size < tp->tso_props.hdr_len)) { xmit_seg(s); } tp->tso_frames = 0; @@ -1437,7 +1436,7 @@ static const VMStateDescription vmstate_e1000_full_mac_state = { static const VMStateDescription vmstate_e1000 = { .name = "e1000", - .version_id = 2, + .version_id = 3, .minimum_version_id = 1, .pre_save = e1000_pre_save, .post_load = e1000_post_load, @@ -1510,6 +1509,17 @@ static const VMStateDescription vmstate_e1000 = { VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32), VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128), VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128), + VMSTATE_UINT8_V(tx.tso_props.ipcss, E1000State, 3), + VMSTATE_UINT8_V(tx.tso_props.ipcso, E1000State, 3), + VMSTATE_UINT16_V(tx.tso_props.ipcse, E1000State, 3), + VMSTATE_UINT8_V(tx.tso_props.tucss, E1000State, 3), + VMSTATE_UINT8_V(tx.tso_props.tucso, E1000State, 3), + VMSTATE_UINT16_V(tx.tso_props.tucse, E1000State, 3), + VMSTATE_UINT32_V(tx.tso_props.paylen, E1000State, 3), + VMSTATE_UINT8_V(tx.tso_props.hdr_len, E1000State, 3), + VMSTATE_UINT16_V(tx.tso_props.mss, E1000State, 3), + VMSTATE_INT8_V(tx.tso_props.ip, E1000State, 3), + VMSTATE_INT8_V(tx.tso_props.tcp, E1000State, 3), VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription*[]) {