From 88f632fbb1b3d31d5b6978d28f8735a6ed18b8f5 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Mon, 20 Jan 2020 09:59:21 +1100 Subject: [PATCH 01/23] dp8393x: Mask EOL bit from descriptor addresses The Least Significant bit of a descriptor address register is used as an EOL flag. It has to be masked when the register value is to be used as an actual address for copying memory around. But when the registers are to be updated the EOL bit should not be masked. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 70451934ae..216d44bd46 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -145,6 +145,9 @@ do { printf("sonic ERROR: %s: " fmt, __func__ , ## __VA_ARGS__); } while (0) #define SONIC_ISR_PINT 0x0800 #define SONIC_ISR_LCD 0x1000 +#define SONIC_DESC_EOL 0x0001 +#define SONIC_DESC_ADDR 0xFFFE + #define TYPE_DP8393X "dp8393x" #define DP8393X(obj) OBJECT_CHECK(dp8393xState, (obj), TYPE_DP8393X) @@ -197,7 +200,8 @@ static uint32_t dp8393x_crba(dp8393xState *s) static uint32_t dp8393x_crda(dp8393xState *s) { - return (s->regs[SONIC_URDA] << 16) | s->regs[SONIC_CRDA]; + return (s->regs[SONIC_URDA] << 16) | + (s->regs[SONIC_CRDA] & SONIC_DESC_ADDR); } static uint32_t dp8393x_rbwc(dp8393xState *s) @@ -217,7 +221,8 @@ static uint32_t dp8393x_tsa(dp8393xState *s) static uint32_t dp8393x_ttda(dp8393xState *s) { - return (s->regs[SONIC_UTDA] << 16) | s->regs[SONIC_TTDA]; + return (s->regs[SONIC_UTDA] << 16) | + (s->regs[SONIC_TTDA] & SONIC_DESC_ADDR); } static uint32_t dp8393x_wt(dp8393xState *s) @@ -509,7 +514,7 @@ static void dp8393x_do_transmit_packets(dp8393xState *s) MEMTXATTRS_UNSPECIFIED, s->data, size); s->regs[SONIC_CTDA] = dp8393x_get(s, width, 0) & ~0x1; - if (dp8393x_get(s, width, 0) & 0x1) { + if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) { /* EOL detected */ break; } @@ -765,13 +770,13 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, /* XXX: Check byte ordering */ /* Check for EOL */ - if (s->regs[SONIC_LLFA] & 0x1) { + if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) { /* Are we still in resource exhaustion? */ size = sizeof(uint16_t) * 1 * width; address = dp8393x_crda(s) + sizeof(uint16_t) * 5 * width; address_space_read(&s->as, address, MEMTXATTRS_UNSPECIFIED, s->data, size); - if (dp8393x_get(s, width, 0) & 0x1) { + if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) { /* Still EOL ; stop reception */ return -1; } else { @@ -831,7 +836,7 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, dp8393x_crda(s) + sizeof(uint16_t) * 5 * width, MEMTXATTRS_UNSPECIFIED, s->data, size); s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0); - if (s->regs[SONIC_LLFA] & 0x1) { + if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) { /* EOL detected */ s->regs[SONIC_ISR] |= SONIC_ISR_RDE; } else { From 3fe9a838ec3eae1374ced16b63bf56894b2ffbe6 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 02/23] dp8393x: Always use 32-bit accesses The DP83932 and DP83934 have 32 data lines. The datasheet says, Data Bus: These bidirectional lines are used to transfer data on the system bus. When the SONIC is a bus master, 16-bit data is transferred on D15-D0 and 32-bit data is transferred on D31-D0. When the SONIC is accessed as a slave, register data is driven onto lines D15-D0. D31-D16 are held TRI-STATE if SONIC is in 16-bit mode. If SONIC is in 32-bit mode, they are driven, but invalid. Always use 32-bit accesses both as bus master and bus slave. Force the MSW to zero in bus master mode. This gets the Linux 'jazzsonic' driver working, and avoids the need for prior hacks to make the NetBSD 'sn' driver work. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 216d44bd46..51b71dac1e 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -246,9 +246,19 @@ static void dp8393x_put(dp8393xState *s, int width, int offset, uint16_t val) { if (s->big_endian) { - s->data[offset * width + width - 1] = cpu_to_be16(val); + if (width == 2) { + s->data[offset * 2] = 0; + s->data[offset * 2 + 1] = cpu_to_be16(val); + } else { + s->data[offset] = cpu_to_be16(val); + } } else { - s->data[offset * width] = cpu_to_le16(val); + if (width == 2) { + s->data[offset * 2] = cpu_to_le16(val); + s->data[offset * 2 + 1] = 0; + } else { + s->data[offset] = cpu_to_le16(val); + } } } @@ -590,7 +600,7 @@ static uint64_t dp8393x_read(void *opaque, hwaddr addr, unsigned int size) DPRINTF("read 0x%04x from reg %s\n", val, reg_names[reg]); - return val; + return s->big_endian ? val << 16 : val; } static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, @@ -598,13 +608,14 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, { dp8393xState *s = opaque; int reg = addr >> s->it_shift; + uint32_t val = s->big_endian ? data >> 16 : data; - DPRINTF("write 0x%04x to reg %s\n", (uint16_t)data, reg_names[reg]); + DPRINTF("write 0x%04x to reg %s\n", (uint16_t)val, reg_names[reg]); switch (reg) { /* Command register */ case SONIC_CR: - dp8393x_do_command(s, data); + dp8393x_do_command(s, val); break; /* Prevent write to read-only registers */ case SONIC_CAP2: @@ -617,36 +628,36 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, /* Accept write to some registers only when in reset mode */ case SONIC_DCR: if (s->regs[SONIC_CR] & SONIC_CR_RST) { - s->regs[reg] = data & 0xbfff; + s->regs[reg] = val & 0xbfff; } else { DPRINTF("writing to DCR invalid\n"); } break; case SONIC_DCR2: if (s->regs[SONIC_CR] & SONIC_CR_RST) { - s->regs[reg] = data & 0xf017; + s->regs[reg] = val & 0xf017; } else { DPRINTF("writing to DCR2 invalid\n"); } break; /* 12 lower bytes are Read Only */ case SONIC_TCR: - s->regs[reg] = data & 0xf000; + s->regs[reg] = val & 0xf000; break; /* 9 lower bytes are Read Only */ case SONIC_RCR: - s->regs[reg] = data & 0xffe0; + s->regs[reg] = val & 0xffe0; break; /* Ignore most significant bit */ case SONIC_IMR: - s->regs[reg] = data & 0x7fff; + s->regs[reg] = val & 0x7fff; dp8393x_update_irq(s); break; /* Clear bits by writing 1 to them */ case SONIC_ISR: - data &= s->regs[reg]; - s->regs[reg] &= ~data; - if (data & SONIC_ISR_RBE) { + val &= s->regs[reg]; + s->regs[reg] &= ~val; + if (val & SONIC_ISR_RBE) { dp8393x_do_read_rra(s); } dp8393x_update_irq(s); @@ -659,17 +670,17 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, case SONIC_REA: case SONIC_RRP: case SONIC_RWP: - s->regs[reg] = data & 0xfffe; + s->regs[reg] = val & 0xfffe; break; /* Invert written value for some registers */ case SONIC_CRCT: case SONIC_FAET: case SONIC_MPT: - s->regs[reg] = data ^ 0xffff; + s->regs[reg] = val ^ 0xffff; break; /* All other registers have no special contrainst */ default: - s->regs[reg] = data; + s->regs[reg] = val; } if (reg == SONIC_WT0 || reg == SONIC_WT1) { @@ -680,8 +691,8 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, static const MemoryRegionOps dp8393x_ops = { .read = dp8393x_read, .write = dp8393x_write, - .impl.min_access_size = 2, - .impl.max_access_size = 2, + .impl.min_access_size = 4, + .impl.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, }; From 46ffee9ad43185cbee4182c208bbd534814086ca Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 03/23] dp8393x: Clean up endianness hacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the datasheet, section 3.4.4, "in 32-bit mode ... the SONIC always writes long words". Therefore, use the same technique for the 'in_use' field that is used everywhere else, and write the full long word. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 51b71dac1e..1844482b3d 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -778,8 +778,6 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, return -1; } - /* XXX: Check byte ordering */ - /* Check for EOL */ if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) { /* Are we still in resource exhaustion? */ @@ -851,15 +849,12 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, /* EOL detected */ s->regs[SONIC_ISR] |= SONIC_ISR_RDE; } else { - /* Clear in_use, but it is always 16bit wide */ - int offset = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width; - if (s->big_endian && width == 2) { - /* we need to adjust the offset of the 16bit field */ - offset += sizeof(uint16_t); - } - s->data[0] = 0; - address_space_write(&s->as, offset, MEMTXATTRS_UNSPECIFIED, - s->data, sizeof(uint16_t)); + /* Clear in_use */ + size = sizeof(uint16_t) * width; + address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width; + dp8393x_put(s, width, 0, 0); + address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, + s->data, size); s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX; s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff); From 9e3cd456d85ad45e72bdba99203302342ce29b3b Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 04/23] dp8393x: Have dp8393x_receive() return the packet size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function re-uses its 'size' argument as a scratch variable. Instead, declare a local 'size' variable for that purpose so that the function result doesn't get messed up. Signed-off-by: Finn Thain Reviewed-by: Philippe Mathieu-Daudé Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 1844482b3d..ed57f5f326 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -759,20 +759,21 @@ static int dp8393x_receive_filter(dp8393xState *s, const uint8_t * buf, } static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, - size_t size) + size_t pkt_size) { dp8393xState *s = qemu_get_nic_opaque(nc); int packet_type; uint32_t available, address; - int width, rx_len = size; + int width, rx_len = pkt_size; uint32_t checksum; + int size; width = (s->regs[SONIC_DCR] & SONIC_DCR_DW) ? 2 : 1; s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER | SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC); - packet_type = dp8393x_receive_filter(s, buf, size); + packet_type = dp8393x_receive_filter(s, buf, pkt_size); if (packet_type < 0) { DPRINTF("packet not for netcard\n"); return -1; @@ -868,7 +869,7 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, /* Done */ dp8393x_update_irq(s); - return size; + return pkt_size; } static void dp8393x_reset(DeviceState *dev) From 5b0c98fcb7ac006bd8efe0e0fecba52c43a9d028 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 05/23] dp8393x: Update LLFA and CRDA registers from rx descriptor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow the algorithm given in the National Semiconductor DP83932C datasheet in section 3.4.7: At the next reception, the SONIC re-reads the last RXpkt.link field, and updates its CRDA register to point to the next descriptor. The chip is designed to allow the host to provide a new list of descriptors in this way. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index ed57f5f326..960462397e 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -786,12 +786,13 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, address = dp8393x_crda(s) + sizeof(uint16_t) * 5 * width; address_space_read(&s->as, address, MEMTXATTRS_UNSPECIFIED, s->data, size); - if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) { + s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0); + if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) { /* Still EOL ; stop reception */ return -1; - } else { - s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; } + /* Link has been updated by host */ + s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; } /* Save current position */ @@ -840,7 +841,7 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, MEMTXATTRS_UNSPECIFIED, s->data, size); - /* Move to next descriptor */ + /* Check link field */ size = sizeof(uint16_t) * width; address_space_read(&s->as, dp8393x_crda(s) + sizeof(uint16_t) * 5 * width, @@ -856,6 +857,8 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, dp8393x_put(s, width, 0, 0); address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, s->data, size); + + /* Move to next descriptor */ s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX; s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff); From a3cce2825a0b12bb717a5106daaca245557cc9ae Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 06/23] dp8393x: Clear RRRA command register bit only when appropriate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It doesn't make sense to clear the command register bit unless the command was actually issued. Signed-off-by: Finn Thain Reviewed-by: Philippe Mathieu-Daudé Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 960462397e..b5a9c6ac0f 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -352,9 +352,6 @@ static void dp8393x_do_read_rra(dp8393xState *s) s->regs[SONIC_ISR] |= SONIC_ISR_RBE; dp8393x_update_irq(s); } - - /* Done */ - s->regs[SONIC_CR] &= ~SONIC_CR_RRRA; } static void dp8393x_do_software_reset(dp8393xState *s) @@ -565,8 +562,10 @@ static void dp8393x_do_command(dp8393xState *s, uint16_t command) dp8393x_do_start_timer(s); if (command & SONIC_CR_RST) dp8393x_do_software_reset(s); - if (command & SONIC_CR_RRRA) + if (command & SONIC_CR_RRRA) { dp8393x_do_read_rra(s); + s->regs[SONIC_CR] &= ~SONIC_CR_RRRA; + } if (command & SONIC_CR_LCAM) dp8393x_do_load_cam(s); } From ada74315270d1dcabf4c9d4fece19df7ef5b9577 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 07/23] dp8393x: Implement packet size limit and RBAE interrupt Add a bounds check to prevent a large packet from causing a buffer overflow. This is defensive programming -- I haven't actually tried sending an oversized packet or a jumbo ethernet frame. The SONIC handles packets that are too big for the buffer by raising the RBAE interrupt and dropping them. Linux uses that interrupt to count dropped packets. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index b5a9c6ac0f..911f59e0c9 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -137,6 +137,7 @@ do { printf("sonic ERROR: %s: " fmt, __func__ , ## __VA_ARGS__); } while (0) #define SONIC_TCR_CRCI 0x2000 #define SONIC_TCR_PINT 0x8000 +#define SONIC_ISR_RBAE 0x0010 #define SONIC_ISR_RBE 0x0020 #define SONIC_ISR_RDE 0x0040 #define SONIC_ISR_TC 0x0080 @@ -772,6 +773,14 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER | SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC); + if (pkt_size + 4 > dp8393x_rbwc(s) * 2) { + DPRINTF("oversize packet, pkt_size is %d\n", pkt_size); + s->regs[SONIC_ISR] |= SONIC_ISR_RBAE; + dp8393x_update_irq(s); + dp8393x_do_read_rra(s); + return pkt_size; + } + packet_type = dp8393x_receive_filter(s, buf, pkt_size); if (packet_type < 0) { DPRINTF("packet not for netcard\n"); From bae112b80c9c42cea21ee7623c283668c3451c2e Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 08/23] dp8393x: Don't clobber packet checksum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A received packet consumes pkt_size bytes in the buffer and the frame checksum that's appended to it consumes another 4 bytes. The Receive Buffer Address register takes the former quantity into account but not the latter. So the next packet written to the buffer overwrites the frame checksum. Fix this. Signed-off-by: Finn Thain Reviewed-by: Philippe Mathieu-Daudé Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 911f59e0c9..63293411db 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -818,6 +818,7 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, address += rx_len; address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, &checksum, 4); + address += 4; rx_len += 4; s->regs[SONIC_CRBA1] = address >> 16; s->regs[SONIC_CRBA0] = address & 0xffff; From ea2270279bc2e1635cb6e909e22e17e630198773 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 09/23] dp8393x: Use long-word-aligned RRA pointers in 32-bit mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Section 3.4.1 of the datasheet says, The alignment of the RRA is confined to either word or long word boundaries, depending upon the data width mode. In 16-bit mode, the RRA must be aligned to a word boundary (A0 is always zero) and in 32-bit mode, the RRA is aligned to a long word boundary (A0 and A1 are always zero). This constraint has been implemented for 16-bit mode; implement it for 32-bit mode too. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 63293411db..d8bf248bc6 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -665,12 +665,16 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, qemu_flush_queued_packets(qemu_get_queue(s->nic)); } break; - /* Ignore least significant bit */ + /* The guest is required to store aligned pointers here */ case SONIC_RSA: case SONIC_REA: case SONIC_RRP: case SONIC_RWP: - s->regs[reg] = val & 0xfffe; + if (s->regs[SONIC_DCR] & SONIC_DCR_DW) { + s->regs[reg] = val & 0xfffc; + } else { + s->regs[reg] = val & 0xfffe; + } break; /* Invert written value for some registers */ case SONIC_CRCT: From 350e7d9a77d3b9ac74d240e4b232db1ebe5c05bc Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 10/23] dp8393x: Pad frames to word or long word boundary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing code has a bug where the Remaining Buffer Word Count (RBWC) is calculated with a truncating division, which gives the wrong result for odd-sized packets. Section 1.4.1 of the datasheet says, Once the end of the packet has been reached, the serializer will fill out the last word (16-bit mode) or long word (32-bit mode) if the last byte did not end on a word or long word boundary respectively. The fill byte will be 0FFh. Implement buffer padding so that buffer limits are correctly enforced. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index d8bf248bc6..22b4d362e2 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -768,16 +768,23 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, dp8393xState *s = qemu_get_nic_opaque(nc); int packet_type; uint32_t available, address; - int width, rx_len = pkt_size; + int width, rx_len, padded_len; uint32_t checksum; int size; - width = (s->regs[SONIC_DCR] & SONIC_DCR_DW) ? 2 : 1; - s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER | SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC); - if (pkt_size + 4 > dp8393x_rbwc(s) * 2) { + rx_len = pkt_size + sizeof(checksum); + if (s->regs[SONIC_DCR] & SONIC_DCR_DW) { + width = 2; + padded_len = ((rx_len - 1) | 3) + 1; + } else { + width = 1; + padded_len = ((rx_len - 1) | 1) + 1; + } + + if (padded_len > dp8393x_rbwc(s) * 2) { DPRINTF("oversize packet, pkt_size is %d\n", pkt_size); s->regs[SONIC_ISR] |= SONIC_ISR_RBAE; dp8393x_update_irq(s); @@ -812,22 +819,32 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, s->regs[SONIC_TRBA0] = s->regs[SONIC_CRBA0]; /* Calculate the ethernet checksum */ - checksum = cpu_to_le32(crc32(0, buf, rx_len)); + checksum = cpu_to_le32(crc32(0, buf, pkt_size)); /* Put packet into RBA */ DPRINTF("Receive packet at %08x\n", dp8393x_crba(s)); address = dp8393x_crba(s); address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, - buf, rx_len); - address += rx_len; + buf, pkt_size); + address += pkt_size; + + /* Put frame checksum into RBA */ address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, - &checksum, 4); - address += 4; - rx_len += 4; + &checksum, sizeof(checksum)); + address += sizeof(checksum); + + /* Pad short packets to keep pointers aligned */ + if (rx_len < padded_len) { + size = padded_len - rx_len; + address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED, + (uint8_t *)"\xFF\xFF\xFF", size, 1); + address += size; + } + s->regs[SONIC_CRBA1] = address >> 16; s->regs[SONIC_CRBA0] = address & 0xffff; available = dp8393x_rbwc(s); - available -= rx_len / 2; + available -= padded_len >> 1; s->regs[SONIC_RBWC1] = available >> 16; s->regs[SONIC_RBWC0] = available & 0xffff; From d9fae13196a31716f45dcddcdd958fbb8e59b35a Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 11/23] dp8393x: Clear descriptor in_use field to release packet When the SONIC receives a packet into the last available descriptor, it retains ownership of that descriptor for as long as necessary. Section 3.4.7 of the datasheet says, When the system appends more descriptors, the SONIC releases ownership of the descriptor after writing 0000h to the RXpkt.in_use field. The packet can now be processed by the host, so raise a PKTRX interrupt, just like the normal case. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 22b4d362e2..08194a4968 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -811,7 +811,17 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, return -1; } /* Link has been updated by host */ + + /* Clear in_use */ + size = sizeof(uint16_t) * width; + address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width; + dp8393x_put(s, width, 0, 0); + address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED, + (uint8_t *)s->data, size, 1); + + /* Move to next descriptor */ s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; + s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX; } /* Save current position */ From 80b60673ea598869050c66d95d8339480e4cefd0 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 12/23] dp8393x: Always update RRA pointers and sequence numbers These operations need to take place regardless of whether or not rx descriptors have been used up (that is, EOL flag was observed). The algorithm is now the same for a packet that was withheld as for a packet that was not. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 08194a4968..cfbc2ee552 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -901,12 +901,14 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, /* Move to next descriptor */ s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX; - s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff); + } - if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) { - /* Read next RRA */ - dp8393x_do_read_rra(s); - } + s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | + ((s->regs[SONIC_RSC] + 1) & 0x00ff); + + if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) { + /* Read next RRA */ + dp8393x_do_read_rra(s); } /* Done */ From 083e21bbdde7dbd326baf29d21f49fc3f5614496 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 13/23] dp8393x: Don't reset Silicon Revision register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The jazzsonic driver in Linux uses the Silicon Revision register value to probe the chip. The driver fails unless the SR register contains 4. Unfortunately, reading this register in QEMU usually returns 0 because the s->regs[] array gets wiped after a software reset. Fixes: bd8f1ebce4 ("net/dp8393x: fix hardware reset") Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Finn Thain Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index cfbc2ee552..c2ac2a110e 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -923,6 +923,7 @@ static void dp8393x_reset(DeviceState *dev) timer_del(s->watchdog); memset(s->regs, 0, sizeof(s->regs)); + s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux/mips */ s->regs[SONIC_CR] = SONIC_CR_RST | SONIC_CR_STP | SONIC_CR_RXDIS; s->regs[SONIC_DCR] &= ~(SONIC_DCR_EXBUS | SONIC_DCR_LBR); s->regs[SONIC_RCR] &= ~(SONIC_RCR_LB0 | SONIC_RCR_LB1 | SONIC_RCR_BRD | SONIC_RCR_RNT); @@ -975,7 +976,6 @@ static void dp8393x_realize(DeviceState *dev, Error **errp) qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a); s->watchdog = timer_new_ns(QEMU_CLOCK_VIRTUAL, dp8393x_watchdog, s); - s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux */ memory_region_init_ram(&s->prom, OBJECT(dev), "dp8393x-prom", SONIC_PROM_SIZE, &local_err); From c2279bd0a19b35057f2e4c3b4df9a915717d1142 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 29 Jan 2020 20:27:49 +1100 Subject: [PATCH 14/23] dp8393x: Don't stop reception upon RBE interrupt assertion Section 3.4.7 of the datasheet explains that, The RBE bit in the Interrupt Status register is set when the SONIC finishes using the second to last receive buffer and reads the last RRA descriptor. Actually, the SONIC is not truly out of resources, but gives the system an early warning of an impending out of resources condition. RBE does not mean actual receive buffer exhaustion, and reception should not be stopped. This is important because Linux will not check and clear the RBE interrupt until it receives another packet. But that won't happen if can_receive returns false. This bug causes the SONIC to become deaf (until reset). Fix this with a new flag to indicate actual receive buffer exhaustion. Signed-off-by: Finn Thain Tested-by: Laurent Vivier Signed-off-by: Jason Wang --- hw/net/dp8393x.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index c2ac2a110e..8a3504d962 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -158,6 +158,7 @@ typedef struct dp8393xState { /* Hardware */ uint8_t it_shift; bool big_endian; + bool last_rba_is_full; qemu_irq irq; #ifdef DEBUG_SONIC int irq_level; @@ -347,12 +348,15 @@ static void dp8393x_do_read_rra(dp8393xState *s) s->regs[SONIC_RRP] = s->regs[SONIC_RSA]; } - /* Check resource exhaustion */ + /* Warn the host if CRBA now has the last available resource */ if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP]) { s->regs[SONIC_ISR] |= SONIC_ISR_RBE; dp8393x_update_irq(s); } + + /* Allow packet reception */ + s->last_rba_is_full = false; } static void dp8393x_do_software_reset(dp8393xState *s) @@ -661,9 +665,6 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, dp8393x_do_read_rra(s); } dp8393x_update_irq(s); - if (dp8393x_can_receive(s->nic->ncs)) { - qemu_flush_queued_packets(qemu_get_queue(s->nic)); - } break; /* The guest is required to store aligned pointers here */ case SONIC_RSA: @@ -723,8 +724,6 @@ static int dp8393x_can_receive(NetClientState *nc) if (!(s->regs[SONIC_CR] & SONIC_CR_RXEN)) return 0; - if (s->regs[SONIC_ISR] & SONIC_ISR_RBE) - return 0; return 1; } @@ -775,6 +774,10 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER | SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC); + if (s->last_rba_is_full) { + return pkt_size; + } + rx_len = pkt_size + sizeof(checksum); if (s->regs[SONIC_DCR] & SONIC_DCR_DW) { width = 2; @@ -788,8 +791,8 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, DPRINTF("oversize packet, pkt_size is %d\n", pkt_size); s->regs[SONIC_ISR] |= SONIC_ISR_RBAE; dp8393x_update_irq(s); - dp8393x_do_read_rra(s); - return pkt_size; + s->regs[SONIC_RCR] |= SONIC_RCR_LPKT; + goto done; } packet_type = dp8393x_receive_filter(s, buf, pkt_size); @@ -903,16 +906,22 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX; } + dp8393x_update_irq(s); + s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | ((s->regs[SONIC_RSC] + 1) & 0x00ff); - if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) { - /* Read next RRA */ - dp8393x_do_read_rra(s); - } +done: - /* Done */ - dp8393x_update_irq(s); + if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) { + if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP]) { + /* Stop packet reception */ + s->last_rba_is_full = true; + } else { + /* Read next resource */ + dp8393x_do_read_rra(s); + } + } return pkt_size; } From b559ea95eaee8cbf72d910e8f98d2601c7f74f97 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 27 Jan 2020 18:03:12 +0200 Subject: [PATCH 15/23] e1000e: Avoid hw_error if legacy mode used https://bugzilla.redhat.com/show_bug.cgi?id=1787142 The emulation issues hw_error if PSRCTL register is written, for example, with zero value. Such configuration does not present any problem when DTYP bits of RCTL register define legacy format of transfer descriptors. Current commit discards check for BSIZE0 and BSIZE1 when legacy mode used. Acked-by: Dmitry Fleytman Signed-off-by: Yuri Benditovich Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 9b76f82db5..d110c5a1bb 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -2813,12 +2813,15 @@ e1000e_set_eitr(E1000ECore *core, int index, uint32_t val) static void e1000e_set_psrctl(E1000ECore *core, int index, uint32_t val) { - if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) { - hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero"); - } + if (core->mac[RCTL] & E1000_RCTL_DTYP_MASK) { - if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) { - hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero"); + if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) { + hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero"); + } + + if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) { + hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero"); + } } core->mac[PSRCTL] = val; From 33bbc05eabe5109d270faf10493a3ebebd541ecd Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 27 Jan 2020 13:54:04 +0200 Subject: [PATCH 16/23] NetRxPkt: Introduce support for additional hash types Add support for following hash types: IPV6 TCP with extension headers IPV4 UDP IPV6 UDP IPV6 UDP with extension headers Signed-off-by: Yuri Benditovich Acked-by: Dmitry Fleytman Signed-off-by: Jason Wang --- hw/net/net_rx_pkt.c | 42 ++++++++++++++++++++++++++++++++++++++++++ hw/net/net_rx_pkt.h | 6 +++++- hw/net/trace-events | 4 ++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 98a5030ace..b2a06bd27d 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -307,6 +307,20 @@ _net_rx_rss_prepare_tcp(uint8_t *rss_input, &tcphdr->th_dport, sizeof(uint16_t)); } +static inline void +_net_rx_rss_prepare_udp(uint8_t *rss_input, + struct NetRxPkt *pkt, + size_t *bytes_written) +{ + struct udp_header *udphdr = &pkt->l4hdr_info.hdr.udp; + + _net_rx_rss_add_chunk(rss_input, bytes_written, + &udphdr->uh_sport, sizeof(uint16_t)); + + _net_rx_rss_add_chunk(rss_input, bytes_written, + &udphdr->uh_dport, sizeof(uint16_t)); +} + uint32_t net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt, NetRxPktRssType type, @@ -347,6 +361,34 @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt, trace_net_rx_pkt_rss_ip6_ex(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); break; + case NetPktRssIpV6TcpEx: + assert(pkt->isip6); + assert(pkt->istcp); + trace_net_rx_pkt_rss_ip6_ex_tcp(); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); + _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length); + break; + case NetPktRssIpV4Udp: + assert(pkt->isip4); + assert(pkt->isudp); + trace_net_rx_pkt_rss_ip4_udp(); + _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length); + _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); + break; + case NetPktRssIpV6Udp: + assert(pkt->isip6); + assert(pkt->isudp); + trace_net_rx_pkt_rss_ip6_udp(); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length); + _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); + break; + case NetPktRssIpV6UdpEx: + assert(pkt->isip6); + assert(pkt->isudp); + trace_net_rx_pkt_rss_ip6_ex_udp(); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); + _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); + break; default: assert(false); break; diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h index 7adf0fad51..048e3461f0 100644 --- a/hw/net/net_rx_pkt.h +++ b/hw/net/net_rx_pkt.h @@ -133,7 +133,11 @@ typedef enum { NetPktRssIpV4Tcp, NetPktRssIpV6Tcp, NetPktRssIpV6, - NetPktRssIpV6Ex + NetPktRssIpV6Ex, + NetPktRssIpV6TcpEx, + NetPktRssIpV4Udp, + NetPktRssIpV6Udp, + NetPktRssIpV6UdpEx, } NetRxPktRssType; /** diff --git a/hw/net/trace-events b/hw/net/trace-events index 42066fcf66..a1da98a643 100644 --- a/hw/net/trace-events +++ b/hw/net/trace-events @@ -92,9 +92,13 @@ net_rx_pkt_l3_csum_validate_csum(size_t l3hdr_off, uint32_t csl, uint32_t cntr, net_rx_pkt_rss_ip4(void) "Calculating IPv4 RSS hash" net_rx_pkt_rss_ip4_tcp(void) "Calculating IPv4/TCP RSS hash" +net_rx_pkt_rss_ip4_udp(void) "Calculating IPv4/UDP RSS hash" net_rx_pkt_rss_ip6_tcp(void) "Calculating IPv6/TCP RSS hash" +net_rx_pkt_rss_ip6_udp(void) "Calculating IPv6/UDP RSS hash" net_rx_pkt_rss_ip6(void) "Calculating IPv6 RSS hash" net_rx_pkt_rss_ip6_ex(void) "Calculating IPv6/EX RSS hash" +net_rx_pkt_rss_ip6_ex_tcp(void) "Calculating IPv6/EX/TCP RSS hash" +net_rx_pkt_rss_ip6_ex_udp(void) "Calculating IPv6/EX/UDP RSS hash" net_rx_pkt_rss_hash(size_t rss_length, uint32_t rss_hash) "RSS hash for %zu bytes: 0x%X" net_rx_pkt_rss_add_chunk(void* ptr, size_t size, size_t input_offset) "Add RSS chunk %p, %zu bytes, RSS input offset %zu bytes" From 2683a927ff21b54a7300ac70c3c89652f2194969 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 27 Jan 2020 13:54:05 +0200 Subject: [PATCH 17/23] NetRxPkt: fix hash calculation of IPV6 TCP When requested to calculate the hash for TCPV6 packet, ignore overrides of source and destination addresses in in extension headers. Use these overrides when new hash type NetPktRssIpV6TcpEx requested. Use this type in e1000e hash calculation for IPv6 TCP, which should take in account overrides of the addresses. Signed-off-by: Yuri Benditovich Acked-by: Dmitry Fleytman Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 2 +- hw/net/net_rx_pkt.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index d110c5a1bb..94ea34dca5 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -582,7 +582,7 @@ e1000e_rss_calc_hash(E1000ECore *core, type = NetPktRssIpV4Tcp; break; case E1000_MRQ_RSS_TYPE_IPV6TCP: - type = NetPktRssIpV6Tcp; + type = NetPktRssIpV6TcpEx; break; case E1000_MRQ_RSS_TYPE_IPV6: type = NetPktRssIpV6; diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index b2a06bd27d..1e1c504e42 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -348,7 +348,7 @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt, assert(pkt->isip6); assert(pkt->istcp); trace_net_rx_pkt_rss_ip6_tcp(); - _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length); _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV6: From dda8f1854b8efae1dd30b666e8358f9b2ed8ae48 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Fri, 9 Aug 2019 00:25:44 -0700 Subject: [PATCH 18/23] hw: net: cadence_gem: Fix build errors in DB_PRINT() When CADENCE_GEM_ERR_DEBUG is turned on, there are several compilation errors in DB_PRINT(). Fix them. While we are here, update to use appropriate modifiers in the same DB_PRINT() call. Signed-off-by: Bin Meng Reviewed-by: Alistair Francis Signed-off-by: Jason Wang --- hw/net/cadence_gem.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c index ddabdb3f90..6340c1eaf8 100644 --- a/hw/net/cadence_gem.c +++ b/hw/net/cadence_gem.c @@ -987,8 +987,9 @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) return -1; } - DB_PRINT("copy %d bytes to 0x%x\n", MIN(bytes_to_copy, rxbufsize), - rx_desc_get_buffer(s->rx_desc[q])); + DB_PRINT("copy %u bytes to 0x%" PRIx64 "\n", + MIN(bytes_to_copy, rxbufsize), + rx_desc_get_buffer(s, s->rx_desc[q])); /* Copy packet data to emulated DMA buffer */ address_space_write(&s->dma_as, rx_desc_get_buffer(s, s->rx_desc[q]) + @@ -1159,9 +1160,9 @@ static void gem_transmit(CadenceGEMState *s) if (tx_desc_get_length(desc) > sizeof(tx_packet) - (p - tx_packet)) { - DB_PRINT("TX descriptor @ 0x%x too large: size 0x%x space " \ - "0x%x\n", (unsigned)packet_desc_addr, - (unsigned)tx_desc_get_length(desc), + DB_PRINT("TX descriptor @ 0x%" HWADDR_PRIx \ + " too large: size 0x%x space 0x%zx\n", + packet_desc_addr, tx_desc_get_length(desc), sizeof(tx_packet) - (p - tx_packet)); break; } From 08ddb4eb6d47d241257027751138001484c594b2 Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Thu, 24 Oct 2019 16:25:35 +0200 Subject: [PATCH 19/23] block/replication.c: Ignore requests after failover After failover the Secondary side of replication shouldn't change state, because it now functions as our primary disk. In replication_start, replication_do_checkpoint, replication_stop, ignore the request if current state is BLOCK_REPLICATION_DONE (sucessful failover) or BLOCK_REPLICATION_FAILOVER (failover in progres i.e. currently merging active and hidden images into the base image). Signed-off-by: Lukas Straub Reviewed-by: Zhang Chen Acked-by: Max Reitz Signed-off-by: Jason Wang --- block/replication.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/block/replication.c b/block/replication.c index d6681b6c84..413d95407d 100644 --- a/block/replication.c +++ b/block/replication.c @@ -450,6 +450,17 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary is promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->stage != BLOCK_REPLICATION_NONE) { error_setg(errp, "Block replication is running or done"); aio_context_release(aio_context); @@ -574,6 +585,17 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary was promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->mode == REPLICATION_MODE_SECONDARY) { secondary_do_checkpoint(s, errp); } @@ -590,7 +612,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; - if (s->stage != BLOCK_REPLICATION_RUNNING) { + if (s->stage == BLOCK_REPLICATION_NONE) { error_setg(errp, "Block replication is not running"); aio_context_release(aio_context); return; @@ -632,6 +654,17 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary was promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->stage != BLOCK_REPLICATION_RUNNING) { error_setg(errp, "Block replication is not running"); aio_context_release(aio_context); From 7b9e215ed6381a1447d016a397f8b07d9dc652d3 Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Thu, 24 Oct 2019 16:25:44 +0200 Subject: [PATCH 20/23] tests/test-replication.c: Add test for for secondary node continuing replication This simulates the case that happens when we resume COLO after failover. Signed-off-by: Lukas Straub Signed-off-by: Jason Wang --- tests/test-replication.c | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/test-replication.c b/tests/test-replication.c index 4747d8a6be..cbc37db2df 100644 --- a/tests/test-replication.c +++ b/tests/test-replication.c @@ -490,6 +490,56 @@ static void test_secondary_stop(void) teardown_secondary(); } +static void test_secondary_continuous_replication(void) +{ + BlockBackend *top_blk, *local_blk; + Error *local_err = NULL; + + top_blk = start_secondary(); + replication_start_all(REPLICATION_MODE_SECONDARY, &local_err); + g_assert(!local_err); + + /* write 0x22 to s_local_disk (IMG_SIZE / 2, IMG_SIZE) */ + local_blk = blk_by_name(S_LOCAL_DISK_ID); + test_blk_write(local_blk, 0x22, IMG_SIZE / 2, IMG_SIZE / 2, false); + + /* replication will backup s_local_disk to s_hidden_disk */ + test_blk_read(top_blk, 0x11, IMG_SIZE / 2, + IMG_SIZE / 2, 0, IMG_SIZE, false); + + /* write 0x33 to s_active_disk (0, IMG_SIZE / 2) */ + test_blk_write(top_blk, 0x33, 0, IMG_SIZE / 2, false); + + /* do failover (active commit) */ + replication_stop_all(true, &local_err); + g_assert(!local_err); + + /* it should ignore all requests from now on */ + + /* start after failover */ + replication_start_all(REPLICATION_MODE_PRIMARY, &local_err); + g_assert(!local_err); + + /* checkpoint */ + replication_do_checkpoint_all(&local_err); + g_assert(!local_err); + + /* stop */ + replication_stop_all(true, &local_err); + g_assert(!local_err); + + /* read from s_local_disk (0, IMG_SIZE / 2) */ + test_blk_read(top_blk, 0x33, 0, IMG_SIZE / 2, + 0, IMG_SIZE / 2, false); + + + /* read from s_local_disk (IMG_SIZE / 2, IMG_SIZE) */ + test_blk_read(top_blk, 0x22, IMG_SIZE / 2, + IMG_SIZE / 2, 0, IMG_SIZE, false); + + teardown_secondary(); +} + static void test_secondary_do_checkpoint(void) { BlockBackend *top_blk, *local_blk; @@ -585,6 +635,8 @@ int main(int argc, char **argv) g_test_add_func("/replication/secondary/write", test_secondary_write); g_test_add_func("/replication/secondary/start", test_secondary_start); g_test_add_func("/replication/secondary/stop", test_secondary_stop); + g_test_add_func("/replication/secondary/continuous_replication", + test_secondary_continuous_replication); g_test_add_func("/replication/secondary/do_checkpoint", test_secondary_do_checkpoint); g_test_add_func("/replication/secondary/get_error_all", From 1973136532f75fdcf78251317e3c97b950595155 Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Thu, 24 Oct 2019 16:25:48 +0200 Subject: [PATCH 21/23] net/filter.c: Add Options to insert filters anywhere in the filter list To switch the Secondary to Primary, we need to insert new filters before the filter-rewriter. Add the options insert= and position= to be able to insert filters anywhere in the filter list. position should be "head" or "tail" to insert at the head or tail of the filter list or it should be "id=" to specify the id of another filter. insert should be either "before" or "behind" to specify where to insert the new filter relative to the one specified with position. Signed-off-by: Lukas Straub Reviewed-by: Zhang Chen Signed-off-by: Jason Wang --- include/net/filter.h | 2 + net/filter.c | 92 +++++++++++++++++++++++++++++++++++++++++++- qemu-options.hx | 31 ++++++++++++--- 3 files changed, 119 insertions(+), 6 deletions(-) diff --git a/include/net/filter.h b/include/net/filter.h index e8fb6259db..9393c59192 100644 --- a/include/net/filter.h +++ b/include/net/filter.h @@ -62,6 +62,8 @@ struct NetFilterState { NetClientState *netdev; NetFilterDirection direction; bool on; + char *position; + bool insert_before_flag; QTAILQ_ENTRY(NetFilterState) next; }; diff --git a/net/filter.c b/net/filter.c index 4b932e79f9..8221666263 100644 --- a/net/filter.c +++ b/net/filter.c @@ -171,11 +171,47 @@ static void netfilter_set_status(Object *obj, const char *str, Error **errp) } } +static char *netfilter_get_position(Object *obj, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + return g_strdup(nf->position); +} + +static void netfilter_set_position(Object *obj, const char *str, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + nf->position = g_strdup(str); +} + +static char *netfilter_get_insert(Object *obj, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + return nf->insert_before_flag ? g_strdup("before") : g_strdup("behind"); +} + +static void netfilter_set_insert(Object *obj, const char *str, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + if (strcmp(str, "before") && strcmp(str, "behind")) { + error_setg(errp, "Invalid value for netfilter insert, " + "should be 'before' or 'behind'"); + return; + } + + nf->insert_before_flag = !strcmp(str, "before"); +} + static void netfilter_init(Object *obj) { NetFilterState *nf = NETFILTER(obj); nf->on = true; + nf->insert_before_flag = false; + nf->position = g_strdup("tail"); object_property_add_str(obj, "netdev", netfilter_get_netdev_id, netfilter_set_netdev_id, @@ -187,11 +223,18 @@ static void netfilter_init(Object *obj) object_property_add_str(obj, "status", netfilter_get_status, netfilter_set_status, NULL); + object_property_add_str(obj, "position", + netfilter_get_position, netfilter_set_position, + NULL); + object_property_add_str(obj, "insert", + netfilter_get_insert, netfilter_set_insert, + NULL); } static void netfilter_complete(UserCreatable *uc, Error **errp) { NetFilterState *nf = NETFILTER(uc); + NetFilterState *position = NULL; NetClientState *ncs[MAX_QUEUE_NUM]; NetFilterClass *nfc = NETFILTER_GET_CLASS(uc); int queues; @@ -219,6 +262,41 @@ static void netfilter_complete(UserCreatable *uc, Error **errp) return; } + if (strcmp(nf->position, "head") && strcmp(nf->position, "tail")) { + Object *container; + Object *obj; + char *position_id; + + if (!g_str_has_prefix(nf->position, "id=")) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "position", + "'head', 'tail' or 'id='"); + return; + } + + /* get the id from the string */ + position_id = g_strndup(nf->position + 3, strlen(nf->position) - 3); + + /* Search for the position to insert before/behind */ + container = object_get_objects_root(); + obj = object_resolve_path_component(container, position_id); + if (!obj) { + error_setg(errp, "filter '%s' not found", position_id); + g_free(position_id); + return; + } + + position = NETFILTER(obj); + + if (position->netdev != ncs[0]) { + error_setg(errp, "filter '%s' belongs to a different netdev", + position_id); + g_free(position_id); + return; + } + + g_free(position_id); + } + nf->netdev = ncs[0]; if (nfc->setup) { @@ -228,7 +306,18 @@ static void netfilter_complete(UserCreatable *uc, Error **errp) return; } } - QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next); + + if (position) { + if (nf->insert_before_flag) { + QTAILQ_INSERT_BEFORE(position, nf, next); + } else { + QTAILQ_INSERT_AFTER(&nf->netdev->filters, position, nf, next); + } + } else if (!strcmp(nf->position, "head")) { + QTAILQ_INSERT_HEAD(&nf->netdev->filters, nf, next); + } else if (!strcmp(nf->position, "tail")) { + QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next); + } } static void netfilter_finalize(Object *obj) @@ -245,6 +334,7 @@ static void netfilter_finalize(Object *obj) QTAILQ_REMOVE(&nf->netdev->filters, nf, next); } g_free(nf->netdev_id); + g_free(nf->position); } static void default_handle_event(NetFilterState *nf, int event, Error **errp) diff --git a/qemu-options.hx b/qemu-options.hx index ac315c1ac4..828e71b831 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4546,7 +4546,7 @@ applications, they can do this through this parameter. Its format is a gnutls priority string as described at @url{https://gnutls.org/manual/html_node/Priority-Strings.html}. -@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}] +@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}][,position=@var{head|tail|id=}][,insert=@var{behind|before}] Interval @var{t} can't be 0, this filter batches the packet delivery: all packets arriving in a given interval on netdev @var{netdevid} are delayed @@ -4565,11 +4565,32 @@ queue @var{all|rx|tx} is an option that can be applied to any netfilter. @option{tx}: the filter is attached to the transmit queue of the netdev, where it will receive packets sent by the netdev. -@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support] +position @var{head|tail|id=} is an option to specify where the +filter should be inserted in the filter list. It can be applied to any +netfilter. + +@option{head}: the filter is inserted at the head of the filter + list, before any existing filters. + +@option{tail}: the filter is inserted at the tail of the filter + list, behind any existing filters (default). + +@option{id=}: the filter is inserted before or behind the filter + specified by , see the insert option below. + +insert @var{behind|before} is an option to specify where to insert the +new filter relative to the one specified with position=id=. It can +be applied to any netfilter. + +@option{before}: insert before the specified filter. + +@option{behind}: insert behind the specified filter (default). + +@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=}][,insert=@var{behind|before}] filter-mirror on netdev @var{netdevid},mirror net packet to chardev@var{chardevid}, if it has the vnet_hdr_support flag, filter-mirror will mirror packet with vnet_hdr_len. -@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support] +@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=}][,insert=@var{behind|before}] filter-redirector on netdev @var{netdevid},redirect filter's net packet to chardev @var{chardevid},and redirect indev's packet to filter.if it has the vnet_hdr_support flag, @@ -4578,7 +4599,7 @@ Create a filter-redirector we need to differ outdev id from indev id, id can not be the same. we can just use indev or outdev, but at least one of indev or outdev need to be specified. -@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support] +@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support][,position=@var{head|tail|id=}][,insert=@var{behind|before}] Filter-rewriter is a part of COLO project.It will rewrite tcp packet to secondary from primary to keep secondary tcp connection,and rewrite @@ -4591,7 +4612,7 @@ colo secondary: -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 -object filter-rewriter,id=rew0,netdev=hn0,queue=all -@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}] +@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}][,position=@var{head|tail|id=}][,insert=@var{behind|before}] Dump the network traffic on netdev @var{dev} to the file specified by @var{filename}. At most @var{len} bytes (64k by default) per packet are stored. From 90dfe59b33c9c20b81641d46a4dddbe89c5fba7a Mon Sep 17 00:00:00 2001 From: Lukas Straub Date: Thu, 24 Oct 2019 16:25:57 +0200 Subject: [PATCH 22/23] colo: Update Documentation for continuous replication Document the qemu command-line and qmp commands for continuous replication Signed-off-by: Lukas Straub Signed-off-by: Jason Wang --- docs/COLO-FT.txt | 222 +++++++++++++++++++++++++++---------- docs/block-replication.txt | 28 +++-- 2 files changed, 183 insertions(+), 67 deletions(-) diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt index ad24680d13..c8e1740935 100644 --- a/docs/COLO-FT.txt +++ b/docs/COLO-FT.txt @@ -145,81 +145,189 @@ The diagram just shows the main qmp command, you can get the detail in test procedure. == Test procedure == -1. Startup qemu -Primary: -# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name primary \ - -device piix3-usb-uhci -vnc :7 \ - -device usb-tablet -netdev tap,id=hn0,vhost=off \ - -device virtio-net-pci,id=net-pci0,netdev=hn0 \ - -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\ - children.0.file.filename=1.raw,\ - children.0.driver=raw -S -Secondary: -# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name secondary \ - -device piix3-usb-uhci -vnc :7 \ - -device usb-tablet -netdev tap,id=hn0,vhost=off \ - -device virtio-net-pci,id=net-pci0,netdev=hn0 \ - -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \ - -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\ - file.driver=qcow2,top-id=active-disk0,\ - file.file.filename=/mnt/ramfs/active_disk.img,\ - file.backing.driver=qcow2,\ - file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\ - file.backing.backing=secondary-disk0 \ - -incoming tcp:0:8888 +Note: Here we are running both instances on the same host for testing, +change the IP Addresses if you want to run it on two hosts. Initally +127.0.0.1 is the Primary Host and 127.0.0.2 is the Secondary Host. -2. On Secondary VM's QEMU monitor, issue command +== Startup qemu == +1. Primary: +Note: Initally, $imagefolder/primary.qcow2 needs to be copied to all hosts. +You don't need to change any IP's here, because 0.0.0.0 listens on any +interface. The chardev's with 127.0.0.1 IP's loopback to the local qemu +instance. + +# imagefolder="/mnt/vms/colo-test-primary" + +# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \ + -device piix3-usb-uhci -device usb-tablet -name primary \ + -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \ + -device rtl8139,id=e0,netdev=hn0 \ + -chardev socket,id=mirror0,host=0.0.0.0,port=9003,server,nowait \ + -chardev socket,id=compare1,host=0.0.0.0,port=9004,server,wait \ + -chardev socket,id=compare0,host=127.0.0.1,port=9001,server,nowait \ + -chardev socket,id=compare0-0,host=127.0.0.1,port=9001 \ + -chardev socket,id=compare_out,host=127.0.0.1,port=9005,server,nowait \ + -chardev socket,id=compare_out0,host=127.0.0.1,port=9005 \ + -object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0 \ + -object filter-redirector,netdev=hn0,id=redire0,queue=rx,indev=compare_out \ + -object filter-redirector,netdev=hn0,id=redire1,queue=rx,outdev=compare0 \ + -object iothread,id=iothread1 \ + -object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,\ +outdev=compare_out0,iothread=iothread1 \ + -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\ +children.0.file.filename=$imagefolder/primary.qcow2,children.0.driver=qcow2 -S + +2. Secondary: +Note: Active and hidden images need to be created only once and the +size should be the same as primary.qcow2. Again, you don't need to change +any IP's here, except for the $primary_ip variable. + +# imagefolder="/mnt/vms/colo-test-secondary" +# primary_ip=127.0.0.1 + +# qemu-img create -f qcow2 $imagefolder/secondary-active.qcow2 10G + +# qemu-img create -f qcow2 $imagefolder/secondary-hidden.qcow2 10G + +# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \ + -device piix3-usb-uhci -device usb-tablet -name secondary \ + -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \ + -device rtl8139,id=e0,netdev=hn0 \ + -chardev socket,id=red0,host=$primary_ip,port=9003,reconnect=1 \ + -chardev socket,id=red1,host=$primary_ip,port=9004,reconnect=1 \ + -object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0 \ + -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 \ + -object filter-rewriter,id=rew0,netdev=hn0,queue=all \ + -drive if=none,id=parent0,file.filename=$imagefolder/primary.qcow2,driver=qcow2 \ + -drive if=none,id=childs0,driver=replication,mode=secondary,file.driver=qcow2,\ +top-id=colo-disk0,file.file.filename=$imagefolder/secondary-active.qcow2,\ +file.backing.driver=qcow2,file.backing.file.filename=$imagefolder/secondary-hidden.qcow2,\ +file.backing.backing=parent0 \ + -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\ +children.0=childs0 \ + -incoming tcp:0.0.0.0:9998 + + +3. On Secondary VM's QEMU monitor, issue command {'execute':'qmp_capabilities'} -{ 'execute': 'nbd-server-start', - 'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } } -} -{'execute': 'nbd-server-add', 'arguments': {'device': 'secondary-disk0', 'writable': true } } +{'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': {'host': '0.0.0.0', 'port': '9999'} } } } +{'execute': 'nbd-server-add', 'arguments': {'device': 'parent0', 'writable': true } } Note: a. The qmp command nbd-server-start and nbd-server-add must be run before running the qmp command migrate on primary QEMU b. Active disk, hidden disk and nbd target's length should be the same. - c. It is better to put active disk and hidden disk in ramdisk. + c. It is better to put active disk and hidden disk in ramdisk. They + will be merged into the parent disk on failover. -3. On Primary VM's QEMU monitor, issue command: +4. On Primary VM's QEMU monitor, issue command: {'execute':'qmp_capabilities'} -{ 'execute': 'human-monitor-command', - 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}} -{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } } -{ 'execute': 'migrate-set-capabilities', - 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } -{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } } +{'execute': 'human-monitor-command', 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}} +{'execute': 'x-blockdev-change', 'arguments':{'parent': 'colo-disk0', 'node': 'replication0' } } +{'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } +{'execute': 'migrate', 'arguments': {'uri': 'tcp:127.0.0.2:9998' } } Note: a. There should be only one NBD Client for each primary disk. - b. xx.xx.xx.xx is the secondary physical machine's hostname or IP - c. The qmp command line must be run after running qmp command line in + b. The qmp command line must be run after running qmp command line in secondary qemu. -4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced. +5. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced. You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }' -to change the checkpoint period time +to change the idle checkpoint period time -5. Failover test -You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's -monitor at the same time, then SVM will failover and client will not detect this -change. +6. Failover test +You can kill one of the VMs and Failover on the surviving VM: -Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to -issue block related command to stop block replication. -Primary: - Remove the nbd child from the quorum: - { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}} - { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}} - Note: there is no qmp command to remove the blockdev now +If you killed the Secondary, then follow "Primary Failover". After that, +if you want to resume the replication, follow "Primary resume replication" -Secondary: - The primary host is down, so we should do the following thing: - { 'execute': 'nbd-server-stop' } +If you killed the Primary, then follow "Secondary Failover". After that, +if you want to resume the replication, follow "Secondary resume replication" + +== Primary Failover == +The Secondary died, resume on the Primary + +{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'child': 'children.1'} } +{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_del replication0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'comp0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'iothread1' } } +{'execute': 'object-del', 'arguments':{ 'id': 'm0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'redire0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'redire1' } } +{'execute': 'x-colo-lost-heartbeat' } + +== Secondary Failover == +The Primary died, resume on the Secondary and prepare to become the new Primary + +{'execute': 'nbd-server-stop'} +{'execute': 'x-colo-lost-heartbeat'} + +{'execute': 'object-del', 'arguments':{ 'id': 'f2' } } +{'execute': 'object-del', 'arguments':{ 'id': 'f1' } } +{'execute': 'chardev-remove', 'arguments':{ 'id': 'red1' } } +{'execute': 'chardev-remove', 'arguments':{ 'id': 'red0' } } + +{'execute': 'chardev-add', 'arguments':{ 'id': 'mirror0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9003' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare1', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9004' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0-0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': false } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': false } } } } + +== Primary resume replication == +Resume replication after new Secondary is up. + +Start the new Secondary (Steps 2 and 3 above), then on the Primary: +{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.2:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} } + +Wait until disk is synced, then: +{'execute': 'stop'} +{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync'} } + +{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}} +{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } } + +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } } + +{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } +{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.2:9998' } } + +Note: +If this Primary previously was a Secondary, then we need to insert the +filters before the filter-rewriter by using the +"'insert': 'before', 'position': 'id=rew0'" Options. See below. + +== Secondary resume replication == +Become Primary and resume replication after new Secondary is up. Note +that now 127.0.0.1 is the Secondary and 127.0.0.2 is the Primary. + +Start the new Secondary (Steps 2 and 3 above, but with primary_ip=127.0.0.2), +then on the old Secondary: +{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.1:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} } + +Wait until disk is synced, then: +{'execute': 'stop'} +{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync' } } + +{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.1,file.port=9999,file.export=parent0,node-name=replication0'}} +{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } } + +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } } + +{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } +{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.1:9998' } } == TODO == -1. Support continuous VM replication. -2. Support shared storage. -3. Develop the heartbeat part. -4. Reduce checkpoint VM’s downtime while doing checkpoint. +1. Support shared storage. +2. Develop the heartbeat part. +3. Reduce checkpoint VM’s downtime while doing checkpoint. diff --git a/docs/block-replication.txt b/docs/block-replication.txt index 6bde6737fb..108e9166a8 100644 --- a/docs/block-replication.txt +++ b/docs/block-replication.txt @@ -65,12 +65,12 @@ blocks that are already in QEMU. ^ || .---------- | || | Secondary 1 Quorum || '---------- - / \ || - / \ || - Primary 2 filter - disk ^ virtio-blk - | ^ - 3 NBD -------> 3 NBD | + / \ || virtio-blk + / \ || ^ + Primary 2 filter | + disk ^ 7 Quorum + | / + 3 NBD -------> 3 NBD / client || server 2 filter || ^ ^ --------. || | | @@ -106,6 +106,10 @@ any state that would otherwise be lost by the speculative write-through of the NBD server into the secondary disk. So before block replication, the primary disk and secondary disk should contain the same data. +7) The secondary also has a quorum node, so after secondary failover it +can become the new primary and continue replication. + + == Failure Handling == There are 7 internal errors when block replication is running: 1. I/O error on primary disk @@ -171,16 +175,18 @@ Primary: leading whitespace. 5. The qmp command line must be run after running qmp command line in secondary qemu. - 6. After failover we need remove children.1 (replication driver). + 6. After primary failover we need remove children.1 (replication driver). Secondary: -drive if=none,driver=raw,file.filename=1.raw,id=colo1 \ - -drive if=xxx,id=topxxx,driver=replication,mode=secondary,top-id=topxxx\ + -drive if=none,id=childs1,driver=replication,mode=secondary,top-id=childs1 file.file.filename=active_disk.qcow2,\ file.driver=qcow2,\ file.backing.file.filename=hidden_disk.qcow2,\ file.backing.driver=qcow2,\ file.backing.backing=colo1 + -drive if=xxx,driver=quorum,read-pattern=fifo,id=top-disk1,\ + vote-threshold=1,children.0=childs1 Then run qmp command in secondary qemu: { 'execute': 'nbd-server-start', @@ -234,6 +240,8 @@ Secondary: The primary host is down, so we should do the following thing: { 'execute': 'nbd-server-stop' } +Promote Secondary to Primary: + see COLO-FT.txt + TODO: -1. Continuous block replication -2. Shared disk +1. Shared disk From 21843dc48e38c27cbddb85b4719000c70a70b6bc Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Sat, 29 Feb 2020 11:17:27 +0000 Subject: [PATCH 23/23] l2tpv3: fix RFC number typo in qemu-options.hx The L2TPv3 RFC number is 3931: https://tools.ietf.org/html/rfc3931 Reported-by: Henrik Johansson Reviewed-by: Stefan Weil Signed-off-by: Stefan Hajnoczi Signed-off-by: Jason Wang --- qemu-options.hx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index 828e71b831..084a1c1f8c 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2330,7 +2330,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev, " Linux kernel 3.3+ as well as most routers can talk\n" " L2TPv3. This transport allows connecting a VM to a VM,\n" " VM to a router and even VM to Host. It is a nearly-universal\n" - " standard (RFC3391). Note - this implementation uses static\n" + " standard (RFC3931). Note - this implementation uses static\n" " pre-configured tunnels (same as the Linux kernel).\n" " use 'src=' to specify source address\n" " use 'dst=' to specify destination address\n" @@ -2737,7 +2737,7 @@ Example (send packets from host's 1.2.3.4): @end example @item -netdev l2tpv3,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,srcport=@var{srcport}][,dstport=@var{dstport}],txsession=@var{txsession}[,rxsession=@var{rxsession}][,ipv6][,udp][,cookie64][,counter][,pincounter][,txcookie=@var{txcookie}][,rxcookie=@var{rxcookie}][,offset=@var{offset}] -Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3391) is a +Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3931) is a popular protocol to transport Ethernet (and other Layer 2) data frames between two systems. It is present in routers, firewalls and the Linux kernel (from version 3.3 onwards).