From 9394133f8678ea02bbabab11c8e89e576ec8e9e8 Mon Sep 17 00:00:00 2001 From: Mao Zhongyi Date: Mon, 25 Dec 2017 10:54:11 +0800 Subject: [PATCH 1/6] colo: modified the payload compare function Modified the function colo_packet_compare_common to prepare for the tcp packet comparison in the next patch. Cc: Zhang Chen Cc: Li Zhijian Cc: Jason Wang Signed-off-by: Mao Zhongyi Signed-off-by: Li Zhijian Signed-off-by: Zhang Chen Reviewed-by: Zhang Chen Signed-off-by: Jason Wang --- net/colo-compare.c | 88 +++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/net/colo-compare.c b/net/colo-compare.c index 0ebdec936c..f39ca02fc0 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -190,10 +190,12 @@ static int packet_enqueue(CompareState *s, int mode, Connection **con) * return: 0 means packet same * > 0 || < 0 means packet different */ -static int colo_packet_compare_common(Packet *ppkt, - Packet *spkt, - int poffset, - int soffset) +static int colo_compare_packet_payload(Packet *ppkt, + Packet *spkt, + uint16_t poffset, + uint16_t soffset, + uint16_t len) + { if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20]; @@ -208,17 +210,7 @@ static int colo_packet_compare_common(Packet *ppkt, sec_ip_src, sec_ip_dst); } - poffset = ppkt->vnet_hdr_len + poffset; - soffset = ppkt->vnet_hdr_len + soffset; - - if (ppkt->size - poffset == spkt->size - soffset) { - return memcmp(ppkt->data + poffset, - spkt->data + soffset, - spkt->size - soffset); - } else { - trace_colo_compare_main("Net packet size are not the same"); - return -1; - } + return memcmp(ppkt->data + poffset, spkt->data + soffset, len); } /* @@ -270,24 +262,19 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt) * the secondary guest's timestamp. COLO just focus on payload, * so we just need skip this field. 
*/ - if (ptcp->th_off > 5) { - ptrdiff_t ptcp_offset, stcp_offset; - ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data - + (ptcp->th_off * 4) - ppkt->vnet_hdr_len; - stcp_offset = spkt->transport_header - (uint8_t *)spkt->data - + (stcp->th_off * 4) - spkt->vnet_hdr_len; + ptrdiff_t ptcp_offset, stcp_offset; - /* - * When network is busy, some tcp options(like sack) will unpredictable - * occur in primary side or secondary side. it will make packet size - * not same, but the two packet's payload is identical. colo just - * care about packet payload, so we skip the option field. - */ - res = colo_packet_compare_common(ppkt, spkt, ptcp_offset, stcp_offset); - } else if (ptcp->th_sum == stcp->th_sum) { - res = colo_packet_compare_common(ppkt, spkt, ETH_HLEN, ETH_HLEN); + ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data + + (ptcp->th_off << 2) - ppkt->vnet_hdr_len; + stcp_offset = spkt->transport_header - (uint8_t *)spkt->data + + (stcp->th_off << 2) - spkt->vnet_hdr_len; + if (ppkt->size - ptcp_offset == spkt->size - stcp_offset) { + res = colo_compare_packet_payload(ppkt, spkt, + ptcp_offset, stcp_offset, + ppkt->size - ptcp_offset); } else { + trace_colo_compare_main("TCP: payload size of packets are different"); res = -1; } @@ -331,8 +318,8 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt) */ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt) { - int ret; - int network_header_length = ppkt->ip->ip_hl * 4; + uint16_t network_header_length = ppkt->ip->ip_hl << 2; + uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len; trace_colo_compare_main("compare udp"); @@ -346,11 +333,12 @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt) * other field like TOS,TTL,IP Checksum. we only need to compare * the ip payload here. 
*/ - ret = colo_packet_compare_common(ppkt, spkt, - network_header_length + ETH_HLEN, - network_header_length + ETH_HLEN); - - if (ret) { + if (ppkt->size != spkt->size) { + trace_colo_compare_main("UDP: payload size of packets are different"); + return -1; + } + if (colo_compare_packet_payload(ppkt, spkt, offset, offset, + ppkt->size - offset)) { trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size); trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size); if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { @@ -359,9 +347,10 @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt) qemu_hexdump((char *)spkt->data, stderr, "colo-compare sec pkt", spkt->size); } + return -1; + } else { + return 0; } - - return ret; } /* @@ -370,7 +359,8 @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt) */ static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt) { - int network_header_length = ppkt->ip->ip_hl * 4; + uint16_t network_header_length = ppkt->ip->ip_hl << 2; + uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len; trace_colo_compare_main("compare icmp"); @@ -384,9 +374,12 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt) * other field like TOS,TTL,IP Checksum. we only need to compare * the ip payload here. 
*/ - if (colo_packet_compare_common(ppkt, spkt, - network_header_length + ETH_HLEN, - network_header_length + ETH_HLEN)) { + if (ppkt->size != spkt->size) { + trace_colo_compare_main("ICMP: payload size of packets are different"); + return -1; + } + if (colo_compare_packet_payload(ppkt, spkt, offset, offset, + ppkt->size - offset)) { trace_colo_compare_icmp_miscompare("primary pkt size", ppkt->size); trace_colo_compare_icmp_miscompare("Secondary pkt size", @@ -409,6 +402,8 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt) */ static int colo_packet_compare_other(Packet *spkt, Packet *ppkt) { + uint16_t offset = ppkt->vnet_hdr_len; + trace_colo_compare_main("compare other"); if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20]; @@ -423,7 +418,12 @@ static int colo_packet_compare_other(Packet *spkt, Packet *ppkt) sec_ip_src, sec_ip_dst); } - return colo_packet_compare_common(ppkt, spkt, 0, 0); + if (ppkt->size != spkt->size) { + trace_colo_compare_main("Other: payload size of packets are different"); + return -1; + } + return colo_compare_packet_payload(ppkt, spkt, offset, offset, + ppkt->size - offset); } static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time) From f449c9e549c2f93839a805ea9c7d66c9699794af Mon Sep 17 00:00:00 2001 From: Mao Zhongyi Date: Mon, 25 Dec 2017 10:54:12 +0800 Subject: [PATCH 2/6] colo: compare the packet based on the tcp sequence number Packet size some time different or when network is busy. 
Even when the payload is the same, the TCP protocol cannot guarantee
that both sides send it out split into packets in the same way, like this:

We send this payload:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------

primary:
ppkt1:
----------------
| header |1|2|3|
----------------
ppkt2:
------------------------
| header |4|5|6|7|8|9|0|
------------------------

secondary:
spkt1:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------

In the original method, ppkt1 and ppkt2 each differ in size from spkt1,
so they cannot be compared and a checkpoint is triggered.

I have tested FTP get of 200M and 1G files many times, and found that
the performance was less than 1% of native.

Now I have reconstructed the comparison of TCP packets based on the TCP
sequence number. First of all, ppkt1 and spkt1 have the same starting
sequence number, so they can be compared even though their lengths are
different. The smaller payload length (that of ppkt1) is then used as the
comparison length; if the payload is the same, ppkt1 is sent out and the
offset (the length of ppkt1's payload) is recorded in spkt1. In the next
comparison, ppkt2 and spkt1 can be compared starting from the recorded
position in spkt1, like this:

----------------
| header |1|2|3| ppkt1
---------|-----|
         |     |
---------v-----v--------------
| header |1|2|3|4|5|6|7|8|9|0| spkt1
---------------|\------------|
               | \offset     |
      ---------v-------------v
      | header |4|5|6|7|8|9|0| ppkt2
      ------------------------

In this way, the performance can reach 20% of native in my multiple tests.

Cc: Zhang Chen Cc: Li Zhijian Cc: Jason Wang Signed-off-by: Mao Zhongyi Signed-off-by: Li Zhijian Signed-off-by: Zhang Chen Reviewed-by: Zhang Chen Tested-by: Zhang Chen Signed-off-by: Jason Wang --- net/colo-compare.c | 345 +++++++++++++++++++++++++++++---------------- net/colo.c | 9 ++ net/colo.h | 15 ++ net/trace-events | 2 +- 4 files changed, 251 insertions(+), 120 deletions(-) diff --git a/net/colo-compare.c b/net/colo-compare.c index f39ca02fc0..8622b0b35a 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -37,6 +37,9 @@ #define COMPARE_READ_LEN_MAX NET_BUFSIZE #define MAX_QUEUE_SIZE 1024 +#define COLO_COMPARE_FREE_PRIMARY 0x01 +#define COLO_COMPARE_FREE_SECONDARY 0x02 + /* TODO: Should be configurable */ #define REGULAR_PACKET_CHECK_MS 3000 @@ -111,14 +114,32 @@ static gint seq_sorter(Packet *a, Packet *b, gpointer data) return ntohl(atcp->th_seq) - ntohl(btcp->th_seq); } +static void fill_pkt_tcp_info(void *data, uint32_t *max_ack) +{ + Packet *pkt = data; + struct tcphdr *tcphd; + + tcphd = (struct tcphdr *)pkt->transport_header; + + pkt->tcp_seq = ntohl(tcphd->th_seq); + pkt->tcp_ack = ntohl(tcphd->th_ack); + *max_ack = *max_ack > pkt->tcp_ack ? 
*max_ack : pkt->tcp_ack; + pkt->header_size = pkt->transport_header - (uint8_t *)pkt->data + + (tcphd->th_off << 2) - pkt->vnet_hdr_len; + pkt->payload_size = pkt->size - pkt->header_size; + pkt->seq_end = pkt->tcp_seq + pkt->payload_size; + pkt->flags = tcphd->th_flags; +} + /* * Return 1 on success, if return 0 means the * packet will be dropped */ -static int colo_insert_packet(GQueue *queue, Packet *pkt) +static int colo_insert_packet(GQueue *queue, Packet *pkt, uint32_t *max_ack) { if (g_queue_get_length(queue) <= MAX_QUEUE_SIZE) { if (pkt->ip->ip_p == IPPROTO_TCP) { + fill_pkt_tcp_info(pkt, max_ack); g_queue_insert_sorted(queue, pkt, (GCompareDataFunc)seq_sorter, @@ -168,12 +189,12 @@ static int packet_enqueue(CompareState *s, int mode, Connection **con) } if (mode == PRIMARY_IN) { - if (!colo_insert_packet(&conn->primary_list, pkt)) { + if (!colo_insert_packet(&conn->primary_list, pkt, &conn->pack)) { error_report("colo compare primary queue size too big," "drop packet"); } } else { - if (!colo_insert_packet(&conn->secondary_list, pkt)) { + if (!colo_insert_packet(&conn->secondary_list, pkt, &conn->sack)) { error_report("colo compare secondary queue size too big," "drop packet"); } @@ -183,6 +204,25 @@ static int packet_enqueue(CompareState *s, int mode, Connection **con) return 0; } +static inline bool after(uint32_t seq1, uint32_t seq2) +{ + return (int32_t)(seq1 - seq2) > 0; +} + +static void colo_release_primary_pkt(CompareState *s, Packet *pkt) +{ + int ret; + ret = compare_chr_send(s, + pkt->data, + pkt->size, + pkt->vnet_hdr_len); + if (ret < 0) { + error_report("colo send primary packet failed"); + } + trace_colo_compare_main("packet same and release packet"); + packet_destroy(pkt, NULL); +} + /* * The IP packets sent by primary and secondary * will be compared in here @@ -214,104 +254,175 @@ static int colo_compare_packet_payload(Packet *ppkt, } /* - * Called from the compare thread on the primary - * for compare tcp packet - * compare_tcp copied 
from Dr. David Alan Gilbert's branch - */ -static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt) + * return true means that the payload is consist and + * need to make the next comparison, false means do + * the checkpoint +*/ +static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt, + int8_t *mark, uint32_t max_ack) { - struct tcphdr *ptcp, *stcp; - int res; + *mark = 0; - trace_colo_compare_main("compare tcp"); - - ptcp = (struct tcphdr *)ppkt->transport_header; - stcp = (struct tcphdr *)spkt->transport_header; - - /* - * The 'identification' field in the IP header is *very* random - * it almost never matches. Fudge this by ignoring differences in - * unfragmented packets; they'll normally sort themselves out if different - * anyway, and it should recover at the TCP level. - * An alternative would be to get both the primary and secondary to rewrite - * somehow; but that would need some sync traffic to sync the state - */ - if (ntohs(ppkt->ip->ip_off) & IP_DF) { - spkt->ip->ip_id = ppkt->ip->ip_id; - /* and the sum will be different if the IDs were different */ - spkt->ip->ip_sum = ppkt->ip->ip_sum; + if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) { + if (colo_compare_packet_payload(ppkt, spkt, + ppkt->header_size, spkt->header_size, + ppkt->payload_size)) { + *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY; + return true; + } + } + if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) { + if (colo_compare_packet_payload(ppkt, spkt, + ppkt->header_size, spkt->header_size, + ppkt->payload_size)) { + *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY; + return true; + } } - /* - * Check tcp header length for tcp option field. - * th_off > 5 means this tcp packet have options field. - * The tcp options maybe always different. - * for example: - * From RFC 7323. 
- * TCP Timestamps option (TSopt): - * Kind: 8 - * - * Length: 10 bytes - * - * +-------+-------+---------------------+---------------------+ - * |Kind=8 | 10 | TS Value (TSval) |TS Echo Reply (TSecr)| - * +-------+-------+---------------------+---------------------+ - * 1 1 4 4 - * - * In this case the primary guest's timestamp always different with - * the secondary guest's timestamp. COLO just focus on payload, - * so we just need skip this field. - */ - - ptrdiff_t ptcp_offset, stcp_offset; - - ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data - + (ptcp->th_off << 2) - ppkt->vnet_hdr_len; - stcp_offset = spkt->transport_header - (uint8_t *)spkt->data - + (stcp->th_off << 2) - spkt->vnet_hdr_len; - if (ppkt->size - ptcp_offset == spkt->size - stcp_offset) { - res = colo_compare_packet_payload(ppkt, spkt, - ptcp_offset, stcp_offset, - ppkt->size - ptcp_offset); + /* one part of secondary packet payload still need to be compared */ + if (!after(ppkt->seq_end, spkt->seq_end)) { + if (colo_compare_packet_payload(ppkt, spkt, + ppkt->header_size + ppkt->offset, + spkt->header_size + spkt->offset, + ppkt->payload_size - ppkt->offset)) { + if (!after(ppkt->tcp_ack, max_ack)) { + *mark = COLO_COMPARE_FREE_PRIMARY; + spkt->offset += ppkt->payload_size - ppkt->offset; + return true; + } else { + /* secondary guest hasn't ack the data, don't send + * out this packet + */ + return false; + } + } } else { - trace_colo_compare_main("TCP: payload size of packets are different"); - res = -1; + /* primary packet is longer than secondary packet, compare + * the same part and mark the primary packet offset + */ + if (colo_compare_packet_payload(ppkt, spkt, + ppkt->header_size + ppkt->offset, + spkt->header_size + spkt->offset, + spkt->payload_size - spkt->offset)) { + *mark = COLO_COMPARE_FREE_SECONDARY; + ppkt->offset += spkt->payload_size - spkt->offset; + return true; + } } - if (res != 0 && - trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { - char 
pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20]; + return false; +} - strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src)); - strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst)); - strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src)); - strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst)); +static void colo_compare_tcp(CompareState *s, Connection *conn) +{ + Packet *ppkt = NULL, *spkt = NULL; + int8_t mark; - trace_colo_compare_ip_info(ppkt->size, pri_ip_src, - pri_ip_dst, spkt->size, - sec_ip_src, sec_ip_dst); + /* + * If ppkt and spkt have the same payload, but ppkt's ACK + * is greater than spkt's ACK, in this case we can not + * send the ppkt because it will cause the secondary guest + * to miss sending some data in the next. Therefore, we + * record the maximum ACK in the current queue at both + * primary side and secondary side. Only when the ack is + * less than the smaller of the two maximum ack, then we + * can ensure that the packet's payload is acknowledged by + * primary and secondary. + */ + uint32_t min_ack = conn->pack > conn->sack ? 
conn->sack : conn->pack; - trace_colo_compare_tcp_info("pri tcp packet", - ntohl(ptcp->th_seq), - ntohl(ptcp->th_ack), - res, ptcp->th_flags, - ppkt->size); +pri: + if (g_queue_is_empty(&conn->primary_list)) { + return; + } + ppkt = g_queue_pop_head(&conn->primary_list); +sec: + if (g_queue_is_empty(&conn->secondary_list)) { + g_queue_push_head(&conn->primary_list, ppkt); + return; + } + spkt = g_queue_pop_head(&conn->secondary_list); - trace_colo_compare_tcp_info("sec tcp packet", - ntohl(stcp->th_seq), - ntohl(stcp->th_ack), - res, stcp->th_flags, - spkt->size); + if (ppkt->tcp_seq == ppkt->seq_end) { + colo_release_primary_pkt(s, ppkt); + ppkt = NULL; + } + + if (ppkt && conn->compare_seq && !after(ppkt->seq_end, conn->compare_seq)) { + trace_colo_compare_main("pri: this packet has compared"); + colo_release_primary_pkt(s, ppkt); + ppkt = NULL; + } + + if (spkt->tcp_seq == spkt->seq_end) { + packet_destroy(spkt, NULL); + if (!ppkt) { + goto pri; + } else { + goto sec; + } + } else { + if (conn->compare_seq && !after(spkt->seq_end, conn->compare_seq)) { + trace_colo_compare_main("sec: this packet has compared"); + packet_destroy(spkt, NULL); + if (!ppkt) { + goto pri; + } else { + goto sec; + } + } + if (!ppkt) { + g_queue_push_head(&conn->secondary_list, spkt); + goto pri; + } + } + + if (colo_mark_tcp_pkt(ppkt, spkt, &mark, min_ack)) { + trace_colo_compare_tcp_info("pri", + ppkt->tcp_seq, ppkt->tcp_ack, + ppkt->header_size, ppkt->payload_size, + ppkt->offset, ppkt->flags); + + trace_colo_compare_tcp_info("sec", + spkt->tcp_seq, spkt->tcp_ack, + spkt->header_size, spkt->payload_size, + spkt->offset, spkt->flags); + + if (mark == COLO_COMPARE_FREE_PRIMARY) { + conn->compare_seq = ppkt->seq_end; + colo_release_primary_pkt(s, ppkt); + g_queue_push_head(&conn->secondary_list, spkt); + goto pri; + } + if (mark == COLO_COMPARE_FREE_SECONDARY) { + conn->compare_seq = spkt->seq_end; + packet_destroy(spkt, NULL); + goto sec; + } + if (mark == (COLO_COMPARE_FREE_PRIMARY | 
COLO_COMPARE_FREE_SECONDARY)) { + conn->compare_seq = ppkt->seq_end; + colo_release_primary_pkt(s, ppkt); + packet_destroy(spkt, NULL); + goto pri; + } + } else { + g_queue_push_head(&conn->primary_list, ppkt); + g_queue_push_head(&conn->secondary_list, spkt); qemu_hexdump((char *)ppkt->data, stderr, "colo-compare ppkt", ppkt->size); qemu_hexdump((char *)spkt->data, stderr, "colo-compare spkt", spkt->size); - } - return res; + /* + * colo_compare_inconsistent_notify(); + * TODO: notice to checkpoint(); + */ + } } + /* * Called from the compare thread on the primary * for compare udp packet @@ -477,53 +588,22 @@ static void colo_old_packet_check(void *opaque) (GCompareFunc)colo_old_packet_check_one_conn); } -/* - * Called from the compare thread on the primary - * for compare packet with secondary list of the - * specified connection when a new packet was - * queued to it. - */ -static void colo_compare_connection(void *opaque, void *user_data) +static void colo_compare_packet(CompareState *s, Connection *conn, + int (*HandlePacket)(Packet *spkt, + Packet *ppkt)) { - CompareState *s = user_data; - Connection *conn = opaque; Packet *pkt = NULL; GList *result = NULL; - int ret; while (!g_queue_is_empty(&conn->primary_list) && !g_queue_is_empty(&conn->secondary_list)) { pkt = g_queue_pop_head(&conn->primary_list); - switch (conn->ip_proto) { - case IPPROTO_TCP: - result = g_queue_find_custom(&conn->secondary_list, - pkt, (GCompareFunc)colo_packet_compare_tcp); - break; - case IPPROTO_UDP: - result = g_queue_find_custom(&conn->secondary_list, - pkt, (GCompareFunc)colo_packet_compare_udp); - break; - case IPPROTO_ICMP: - result = g_queue_find_custom(&conn->secondary_list, - pkt, (GCompareFunc)colo_packet_compare_icmp); - break; - default: - result = g_queue_find_custom(&conn->secondary_list, - pkt, (GCompareFunc)colo_packet_compare_other); - break; - } + result = g_queue_find_custom(&conn->secondary_list, + pkt, (GCompareFunc)HandlePacket); if (result) { - ret = 
compare_chr_send(s, - pkt->data, - pkt->size, - pkt->vnet_hdr_len); - if (ret < 0) { - error_report("colo_send_primary_packet failed"); - } - trace_colo_compare_main("packet same and release packet"); + colo_release_primary_pkt(s, pkt); g_queue_remove(&conn->secondary_list, result->data); - packet_destroy(pkt, NULL); } else { /* * If one packet arrive late, the secondary_list or @@ -538,6 +618,33 @@ static void colo_compare_connection(void *opaque, void *user_data) } } +/* + * Called from the compare thread on the primary + * for compare packet with secondary list of the + * specified connection when a new packet was + * queued to it. + */ +static void colo_compare_connection(void *opaque, void *user_data) +{ + CompareState *s = user_data; + Connection *conn = opaque; + + switch (conn->ip_proto) { + case IPPROTO_TCP: + colo_compare_tcp(s, conn); + break; + case IPPROTO_UDP: + colo_compare_packet(s, conn, colo_packet_compare_udp); + break; + case IPPROTO_ICMP: + colo_compare_packet(s, conn, colo_packet_compare_icmp); + break; + default: + colo_compare_packet(s, conn, colo_packet_compare_other); + break; + } +} + static int compare_chr_send(CompareState *s, const uint8_t *buf, uint32_t size, diff --git a/net/colo.c b/net/colo.c index a39d600f34..842626502e 100644 --- a/net/colo.c +++ b/net/colo.c @@ -138,6 +138,8 @@ Connection *connection_new(ConnectionKey *key) conn->processing = false; conn->offset = 0; conn->syn_flag = 0; + conn->pack = 0; + conn->sack = 0; g_queue_init(&conn->primary_list); g_queue_init(&conn->secondary_list); @@ -163,6 +165,13 @@ Packet *packet_new(const void *data, int size, int vnet_hdr_len) pkt->size = size; pkt->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST); pkt->vnet_hdr_len = vnet_hdr_len; + pkt->tcp_seq = 0; + pkt->tcp_ack = 0; + pkt->seq_end = 0; + pkt->header_size = 0; + pkt->payload_size = 0; + pkt->offset = 0; + pkt->flags = 0; return pkt; } diff --git a/net/colo.h b/net/colo.h index 0658e869b4..da6c36dcf7 100644 --- a/net/colo.h 
+++ b/net/colo.h @@ -45,6 +45,15 @@ typedef struct Packet { int64_t creation_ms; /* Get vnet_hdr_len from filter */ uint32_t vnet_hdr_len; + uint32_t tcp_seq; /* sequence number */ + uint32_t tcp_ack; /* acknowledgement number */ + /* the sequence number of the last byte of the packet */ + uint32_t seq_end; + uint8_t header_size; /* the header length */ + uint16_t payload_size; /* the payload length */ + /* record the payload offset(the length that has been compared) */ + uint16_t offset; + uint8_t flags; /* Flags(aka Control bits) */ } Packet; typedef struct ConnectionKey { @@ -64,6 +73,12 @@ typedef struct Connection { /* flag to enqueue unprocessed_connections */ bool processing; uint8_t ip_proto; + /* record the sequence number that has been compared */ + uint32_t compare_seq; + /* the maximum of acknowledgement number in primary_list queue */ + uint32_t pack; + /* the maximum of acknowledgement number in secondary_list queue */ + uint32_t sack; /* offset = secondary_seq - primary_seq */ tcp_seq offset; /* diff --git a/net/trace-events b/net/trace-events index 938263dd7a..7b594cfdd2 100644 --- a/net/trace-events +++ b/net/trace-events @@ -13,7 +13,7 @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d" colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s" colo_old_packet_check_found(int64_t old_time) "%" PRId64 colo_compare_miscompare(void) "" -colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int res, uint32_t flag, int size) "side: %s seq/ack= %u/%u res= %d flags= 0x%x pkt_size: %d\n" +colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, int pdlen, int offset, int flags) "%s: seq/ack= %u/%u hdlen= %d pdlen= %d offset= %d flags=%d\n" # net/filter-rewriter.c colo_filter_rewriter_debug(void) "" From 18d65d225831995f9163e718ad57e839758314e0 Mon Sep 17 00:00:00 2001 
From: Thomas Huth Date: Mon, 15 Jan 2018 20:50:55 +0100 Subject: [PATCH 3/6] net: Allow hubports to connect to other netdevs QEMU can emulate hubs to connect NICs and netdevs. This is currently primarily used for the mis-named 'vlan' feature of the networking subsystem. Now the 'vlan' feature has been marked as deprecated, since its name is rather confusing and the users often rather mis-configure their network when trying to use it. But while the 'vlan' parameter should be removed at one point in time, the basic idea of emulating a hub in QEMU is still good: It's useful for bundling up the output of multiple NICs into one single l2tp netdev for example. Now to be able to use the hubport feature without 'vlan's, there is one missing piece: The possibility to connect a hubport to a netdev, too. This patch adds this possibility by introducing a new "netdev=..." parameter to the hubports. To bundle up the output of multiple NICs into one socket netdev, you can now run QEMU with these parameters for example: qemu-system-ppc64 ... -netdev socket,id=s1,connect=:11122 \ -netdev hubport,hubid=1,id=h1,netdev=s1 \ -netdev hubport,hubid=1,id=h2 -device e1000,netdev=h2 \ -netdev hubport,hubid=1,id=h3 -device virtio-net-pci,netdev=h3 For using the socket netdev, you have got to start another QEMU as the receiving side first, for example with network dumping enabled: qemu-system-x86_64 -M isapc -netdev socket,id=s0,listen=:11122 \ -device ne2k_isa,netdev=s0 \ -object filter-dump,id=f1,netdev=s0,file=/tmp/dump.dat After the ppc64 guest tried to boot from both NICs, you can see in the dump file (using Wireshark, for example), that the output of both NICs (the e1000 and the virtio-net-pci) has been successfully transferred via the socket netdev in this case. 
Suggested-by: Paolo Bonzini Signed-off-by: Thomas Huth Signed-off-by: Jason Wang --- net/hub.c | 27 +++++++++++++++++++++------ net/hub.h | 3 ++- net/net.c | 2 +- qapi/net.json | 4 +++- qemu-options.hx | 8 +++++--- 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/net/hub.c b/net/hub.c index 14b4eec68f..5e84a9ad93 100644 --- a/net/hub.c +++ b/net/hub.c @@ -13,6 +13,7 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "monitor/monitor.h" #include "net/net.h" #include "clients.h" @@ -140,7 +141,8 @@ static NetClientInfo net_hub_port_info = { .cleanup = net_hub_port_cleanup, }; -static NetHubPort *net_hub_port_new(NetHub *hub, const char *name) +static NetHubPort *net_hub_port_new(NetHub *hub, const char *name, + NetClientState *hubpeer) { NetClientState *nc; NetHubPort *port; @@ -153,7 +155,7 @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name) name = default_name; } - nc = qemu_new_net_client(&net_hub_port_info, NULL, "hub", name); + nc = qemu_new_net_client(&net_hub_port_info, hubpeer, "hub", name); port = DO_UPCAST(NetHubPort, nc, nc); port->id = id; port->hub = hub; @@ -165,11 +167,14 @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name) /** * Create a port on a given hub + * @hub_id: Number of the hub * @name: Net client name or NULL for default name. + * @hubpeer: Peer to use (if "netdev=id" has been specified) * * If there is no existing hub with the given id then a new hub is created. 
*/ -NetClientState *net_hub_add_port(int hub_id, const char *name) +NetClientState *net_hub_add_port(int hub_id, const char *name, + NetClientState *hubpeer) { NetHub *hub; NetHubPort *port; @@ -184,7 +189,7 @@ NetClientState *net_hub_add_port(int hub_id, const char *name) hub = net_hub_new(hub_id); } - port = net_hub_port_new(hub, name); + port = net_hub_port_new(hub, name, hubpeer); return &port->nc; } @@ -232,7 +237,7 @@ NetClientState *net_hub_port_find(int hub_id) } } - nc = net_hub_add_port(hub_id, NULL); + nc = net_hub_add_port(hub_id, NULL, NULL); return nc; } @@ -286,12 +291,22 @@ int net_init_hubport(const Netdev *netdev, const char *name, NetClientState *peer, Error **errp) { const NetdevHubPortOptions *hubport; + NetClientState *hubpeer = NULL; assert(netdev->type == NET_CLIENT_DRIVER_HUBPORT); assert(!peer); hubport = &netdev->u.hubport; - net_hub_add_port(hubport->hubid, name); + if (hubport->has_netdev) { + hubpeer = qemu_find_netdev(hubport->netdev); + if (!hubpeer) { + error_setg(errp, "netdev '%s' not found", hubport->netdev); + return -1; + } + } + + net_hub_add_port(hubport->hubid, name, hubpeer); + return 0; } diff --git a/net/hub.h b/net/hub.h index a625effe00..6a16f0487a 100644 --- a/net/hub.h +++ b/net/hub.h @@ -17,7 +17,8 @@ #include "qemu-common.h" -NetClientState *net_hub_add_port(int hub_id, const char *name); +NetClientState *net_hub_add_port(int hub_id, const char *name, + NetClientState *hubpeer); NetClientState *net_hub_find_client_by_name(int hub_id, const char *name); void net_hub_info(Monitor *mon); void net_hub_check_clients(void); diff --git a/net/net.c b/net/net.c index 2b81c93193..e1569e7d89 100644 --- a/net/net.c +++ b/net/net.c @@ -1063,7 +1063,7 @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp) /* Do not add to a vlan if it's a nic with a netdev= parameter. */ if (netdev->type != NET_CLIENT_DRIVER_NIC || !opts->u.nic.has_netdev) { - peer = net_hub_add_port(net->has_vlan ? 
net->vlan : 0, NULL); + peer = net_hub_add_port(net->has_vlan ? net->vlan : 0, NULL, NULL); } if (net->has_vlan && !vlan_warned) { diff --git a/qapi/net.json b/qapi/net.json index 4beff5d582..1238ba5de1 100644 --- a/qapi/net.json +++ b/qapi/net.json @@ -410,12 +410,14 @@ # Connect two or more net clients through a software hub. # # @hubid: hub identifier number +# @netdev: used to connect hub to a netdev instead of a device (since 2.12) # # Since: 1.2 ## { 'struct': 'NetdevHubPortOptions', 'data': { - 'hubid': 'int32' } } + 'hubid': 'int32', + '*netdev': 'str' } } ## # @NetdevNetmapOptions: diff --git a/qemu-options.hx b/qemu-options.hx index 1d73fb151d..56b9a8692e 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2000,7 +2000,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev, #endif "-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n" " configure a vhost-user network, backed by a chardev 'dev'\n" - "-netdev hubport,id=str,hubid=n\n" + "-netdev hubport,id=str,hubid=n[,netdev=nd]\n" " configure a hub port on QEMU VLAN 'n'\n", QEMU_ARCH_ALL) DEF("net", HAS_ARG, QEMU_OPTION_net, "-net nic[,vlan=n][,netdev=nd][,macaddr=mac][,model=type][,name=str][,addr=str][,vectors=v]\n" @@ -2428,13 +2428,15 @@ vde_switch -F -sock /tmp/myswitch qemu-system-i386 linux.img -net nic -net vde,sock=/tmp/myswitch @end example -@item -netdev hubport,id=@var{id},hubid=@var{hubid} +@item -netdev hubport,id=@var{id},hubid=@var{hubid}[,netdev=@var{nd}] Create a hub port on QEMU "vlan" @var{hubid}. The hubport netdev lets you connect a NIC to a QEMU "vlan" instead of a single netdev. @code{-net} and @code{-device} with parameter @option{vlan} create the -required hub automatically. +required hub automatically. Alternatively, you can also connect the hubport +to another netdev with ID @var{nd} by using the @option{netdev=@var{nd}} +option. 
@item -netdev vhost-user,chardev=@var{id}[,vhostforce=on|off][,queues=n] From 93653066445bfab5f034225892f512af8e465dcd Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 11 Jan 2018 21:02:40 +0100 Subject: [PATCH 4/6] net: Allow netdevs to be used with 'hostfwd_add' and 'hostfwd_remove' It does not make much sense to limit these commands to the legacy 'vlan' concept only, they should work with the modern netdevs, too. So now it is possible to use this command with one, two or three parameters. With one parameter, the command installs a hostfwd rule on the default "user" network: hostfwd_add tcp:... With two parameters, the command installs a hostfwd rule on a netdev (that's the new way of using this command): hostfwd_add netdev_id tcp:... With three parameters, the command installs a rule on a 'vlan' (aka hub): hostfwd_add hub_id name tcp:... Same applies to the hostfwd_remove command now. Signed-off-by: Thomas Huth Signed-off-by: Jason Wang --- hmp-commands.hx | 4 ++-- net/slirp.c | 33 +++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index 6d5ebdf6ab..45eebf27e6 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1383,7 +1383,7 @@ ETEXI { .name = "hostfwd_add", .args_type = "arg1:s,arg2:s?,arg3:s?", - .params = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport", + .params = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport", .help = "redirect TCP or UDP connections from host to guest (requires -net user)", .cmd = hmp_hostfwd_add, }, @@ -1398,7 +1398,7 @@ ETEXI { .name = "hostfwd_remove", .args_type = "arg1:s,arg2:s?,arg3:s?", - .params = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport", + .params = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport", .help = "remove host-to-guest TCP or UDP redirection", .cmd = hmp_hostfwd_remove, }, diff --git a/net/slirp.c b/net/slirp.c index cb8ca2312f..7044d292c8 100644 --- a/net/slirp.c 
+++ b/net/slirp.c @@ -405,16 +405,23 @@ error: return -1; } -static SlirpState *slirp_lookup(Monitor *mon, const char *vlan, - const char *stack) +static SlirpState *slirp_lookup(Monitor *mon, const char *hub_id, + const char *name) { - - if (vlan) { + if (name) { NetClientState *nc; - nc = net_hub_find_client_by_name(strtol(vlan, NULL, 0), stack); - if (!nc) { - monitor_printf(mon, "unrecognized (vlan-id, stackname) pair\n"); - return NULL; + if (hub_id) { + nc = net_hub_find_client_by_name(strtol(hub_id, NULL, 0), name); + if (!nc) { + monitor_printf(mon, "unrecognized (vlan-id, stackname) pair\n"); + return NULL; + } + } else { + nc = qemu_find_netdev(name); + if (!nc) { + monitor_printf(mon, "unrecognized netdev id '%s'\n", name); + return NULL; + } } if (strcmp(nc->model, "user")) { monitor_printf(mon, "invalid device specified\n"); @@ -443,9 +450,12 @@ void hmp_hostfwd_remove(Monitor *mon, const QDict *qdict) const char *arg2 = qdict_get_try_str(qdict, "arg2"); const char *arg3 = qdict_get_try_str(qdict, "arg3"); - if (arg2) { + if (arg3) { s = slirp_lookup(mon, arg1, arg2); src_str = arg3; + } else if (arg2) { + s = slirp_lookup(mon, NULL, arg1); + src_str = arg2; } else { s = slirp_lookup(mon, NULL, NULL); src_str = arg1; @@ -570,9 +580,12 @@ void hmp_hostfwd_add(Monitor *mon, const QDict *qdict) const char *arg2 = qdict_get_try_str(qdict, "arg2"); const char *arg3 = qdict_get_try_str(qdict, "arg3"); - if (arg2) { + if (arg3) { s = slirp_lookup(mon, arg1, arg2); redir_str = arg3; + } else if (arg2) { + s = slirp_lookup(mon, NULL, arg1); + redir_str = arg2; } else { s = slirp_lookup(mon, NULL, NULL); redir_str = arg1; From 74f78b993214c8cf02d8c431dd7eabc6c9f43997 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 15 Jan 2018 08:40:17 +0100 Subject: [PATCH 5/6] qemu-doc: Get rid of "vlan=X" example in the documentation The vlan concept is marked as deprecated, so we should not use this for examples in the documentation anymore. 
Signed-off-by: Thomas Huth Signed-off-by: Jason Wang --- qemu-options.hx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index 56b9a8692e..8ce427da78 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2242,8 +2242,8 @@ qemu-system-i386 linux.img -net nic -net tap #launch a QEMU instance with two NICs, each one connected #to a TAP device qemu-system-i386 linux.img \ - -net nic,vlan=0 -net tap,vlan=0,ifname=tap0 \ - -net nic,vlan=1 -net tap,vlan=1,ifname=tap1 + -netdev tap,id=nd0,ifname=tap0 -device e1000,netdev=nd0 \ + -netdev tap,id=nd1,ifname=tap1 -device rtl8139,netdev=nd1 @end example @example From bf4835a4d5338bb7424827715df22570a8adc67c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 8 Jan 2018 15:34:06 -0300 Subject: [PATCH 6/6] MAINTAINERS: update Dmitry Fleytman email MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gently asked by his automatic reply :) Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 753e7996ce..fe39b30450 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1163,7 +1163,7 @@ F: hw/scsi/mfi.h F: tests/megasas-test.c Network packet abstractions -M: Dmitry Fleytman +M: Dmitry Fleytman S: Maintained F: include/net/eth.h F: net/eth.c @@ -1171,7 +1171,7 @@ F: hw/net/net_rx_pkt* F: hw/net/net_tx_pkt* Vmware -M: Dmitry Fleytman +M: Dmitry Fleytman S: Maintained F: hw/net/vmxnet* F: hw/scsi/vmw_pvscsi* @@ -1192,12 +1192,12 @@ F: hw/mem/nvdimm.c F: include/hw/mem/nvdimm.h e1000x -M: Dmitry Fleytman +M: Dmitry Fleytman S: Maintained F: hw/net/e1000x* e1000e -M: Dmitry Fleytman +M: Dmitry Fleytman S: Maintained F: hw/net/e1000e*