From 34506b30e4210b417aa38d1518aa2c53fb7cf39b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 15 Jul 2016 20:58:41 +0300 Subject: [PATCH 01/25] util/qht: Document memory ordering assumptions It is naturally expected that some memory ordering should be provided around qht_insert() and qht_lookup(). Document these assumptions in the header file and put some comments in the source to denote how that memory ordering requirements are fulfilled. Signed-off-by: Paolo Bonzini [Sergey Fedorov: commit title and message provided; comment on qht_remove() elided] Signed-off-by: Sergey Fedorov Message-Id: <20160715175852.30749-2-sergey.fedorov@linaro.org> Signed-off-by: Paolo Bonzini --- include/qemu/qht.h | 5 +++++ util/qht.c | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/qemu/qht.h b/include/qemu/qht.h index 70bfc68b8d..311139b85a 100644 --- a/include/qemu/qht.h +++ b/include/qemu/qht.h @@ -69,6 +69,9 @@ void qht_destroy(struct qht *ht); * Attempting to insert a NULL @p is a bug. * Inserting the same pointer @p with different @hash values is a bug. * + * In case of successful operation, smp_wmb() is implied before the pointer is + * inserted into the hash table. + * * Returns true on sucess. * Returns false if the @p-@hash pair already exists in the hash table. */ @@ -83,6 +86,8 @@ bool qht_insert(struct qht *ht, void *p, uint32_t hash); * * Needs to be called under an RCU read-critical section. * + * smp_read_barrier_depends() is implied before the call to @func. + * * The user-provided @func compares pointers in QHT against @userp. * If the function returns true, a match has been found. * diff --git a/util/qht.c b/util/qht.c index 40d6e218f7..28ce289245 100644 --- a/util/qht.c +++ b/util/qht.c @@ -445,7 +445,11 @@ void *qht_do_lookup(struct qht_bucket *head, qht_lookup_func_t func, do { for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { if (b->hashes[i] == hash) { - void *p = atomic_read(&b->pointers[i]); + /* The pointer is dereferenced before seqlock_read_retry, + * so (unlike qht_insert__locked) we need to use + * atomic_rcu_read here. + */ + void *p = atomic_rcu_read(&b->pointers[i]); if (likely(p) && likely(func(p, userp))) { return p; @@ -535,6 +539,7 @@ static bool qht_insert__locked(struct qht *ht, struct qht_map *map, atomic_rcu_set(&prev->next, b); } b->hashes[i] = hash; + /* smp_wmb() implicit in seqlock_write_begin. */ atomic_set(&b->pointers[i], p); seqlock_write_end(&head->sequence); return true; From 0b21757124d200ff86100a3bc7bb5f81521b42c4 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Tue, 19 Jul 2016 10:28:35 +0200 Subject: [PATCH 02/25] numa: set the memory backend "is_mapped" field Commit 2aece63 "hostmem: detect host backend memory is being used properly" added a way to know if a memory backend is busy or available for use. It caused a slight regression if we pass the same backend to a NUMA node and to a pc-dimm device: -m 1G,slots=2,maxmem=2G \ -object memory-backend-ram,size=1G,id=mem-mem1 \ -device pc-dimm,id=dimm-mem1,memdev=mem-mem1 \ -numa node,nodeid=0,memdev=mem-mem1 Before commit 2aece63, this would cause QEMU to print an error message and to exit gracefully: qemu-system-ppc64: -device pc-dimm,id=dimm-mem1,memdev=mem-mem1: can't use already busy memdev: mem-mem1 Since commit 2aece63, QEMU hits an assertion in the memory code: qemu-system-ppc64: memory.c:1934: memory_region_add_subregion_common: Assertion `!subregion->container' failed. Aborted This happens because pc-dimm devices don't use memory_region_is_mapped() anymore and cannot guess the backend is already used by a NUMA node. Let's revert to the previous behavior by turning the NUMA code to also call host_memory_backend_set_mapped() when it uses a backend. Fixes: 2aece63c8a9d2c3a8ff41d2febc4cdeff2633331 Signed-off-by: Greg Kurz Message-Id: <146891691503.15642.9817215371777203794.stgit@bahia.lan> Signed-off-by: Paolo Bonzini --- numa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/numa.c b/numa.c index cbae430e36..72861713e5 100644 --- a/numa.c +++ b/numa.c @@ -463,6 +463,7 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, exit(1); } + host_memory_backend_set_mapped(backend, true); memory_region_add_subregion(mr, addr, seg); vmstate_register_ram_global(seg); addr += size; From 056b68af773b31fa98fe4538f6424c0079b61415 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 20 Jul 2016 11:54:03 +0200 Subject: [PATCH 03/25] fix qemu exit on memory hotplug when allocation fails at prealloc time When adding hostmem backend at runtime, QEMU might exit with error: "os_mem_prealloc: Insufficient free host memory pages available to allocate guest RAM" It happens due to os_mem_prealloc() not handling errors gracefully. Fix it by passing errp argument so that os_mem_prealloc() could report error to callers and undo performed allocation when os_mem_prealloc() fails. Signed-off-by: Igor Mammedov Message-Id: <1469008443-72059-1-git-send-email-imammedo@redhat.com> Reviewed-by: Markus Armbruster Signed-off-by: Paolo Bonzini --- backends/hostmem.c | 18 ++++++++++++++---- exec.c | 10 ++++++++-- include/qemu/osdep.h | 2 +- util/oslib-posix.c | 28 ++++++++++++++-------------- util/oslib-win32.c | 2 +- 5 files changed, 38 insertions(+), 22 deletions(-) diff --git a/backends/hostmem.c b/backends/hostmem.c index ac802570a8..b7a208d0da 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -203,6 +203,7 @@ static bool host_memory_backend_get_prealloc(Object *obj, Error **errp) static void host_memory_backend_set_prealloc(Object *obj, bool value, Error **errp) { + Error *local_err = NULL; HostMemoryBackend *backend = MEMORY_BACKEND(obj); if (backend->force_prealloc) { @@ -223,7 +224,11 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value, void *ptr = memory_region_get_ram_ptr(&backend->mr); uint64_t sz = memory_region_size(&backend->mr); - os_mem_prealloc(fd, ptr, sz); + os_mem_prealloc(fd, ptr, sz, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } backend->prealloc = true; } } @@ -286,8 +291,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) if (bc->alloc) { bc->alloc(backend, &local_err); if (local_err) { - error_propagate(errp, local_err); - return; + goto out; } ptr = memory_region_get_ram_ptr(&backend->mr); @@ -343,9 +347,15 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) * specified NUMA policy in place. */ if (backend->prealloc) { - os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz); + os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz, + &local_err); + if (local_err) { + goto out; + } } } +out: + error_propagate(errp, local_err); } static bool diff --git a/exec.c b/exec.c index 50e3ee237c..8ffde75983 100644 --- a/exec.c +++ b/exec.c @@ -1226,7 +1226,7 @@ static void *file_ram_alloc(RAMBlock *block, char *filename; char *sanitized_name; char *c; - void *area; + void *area = MAP_FAILED; int fd = -1; int64_t page_size; @@ -1314,13 +1314,19 @@ static void *file_ram_alloc(RAMBlock *block, } if (mem_prealloc) { - os_mem_prealloc(fd, area, memory); + os_mem_prealloc(fd, area, memory, errp); + if (errp && *errp) { + goto error; + } } block->fd = fd; return area; error: + if (area != MAP_FAILED) { + qemu_ram_munmap(area, memory); + } if (unlink_on_error) { unlink(path); } diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index fbb875959f..d7c111d169 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -379,7 +379,7 @@ unsigned long qemu_getauxval(unsigned long type); void qemu_set_tty_echo(int fd, bool echo); -void os_mem_prealloc(int fd, char *area, size_t sz); +void os_mem_prealloc(int fd, char *area, size_t sz, Error **errp); int qemu_read_password(char *buf, int buf_size); diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 6d70d9a706..f2d4e9e592 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -318,7 +318,7 @@ static void sigbus_handler(int signal) siglongjmp(sigjump, 1); } -void os_mem_prealloc(int fd, char *area, size_t memory) +void os_mem_prealloc(int fd, char *area, size_t memory, Error **errp) { int ret; struct sigaction act, oldact; @@ -330,8 +330,9 @@ void os_mem_prealloc(int fd, char *area, size_t memory) ret = sigaction(SIGBUS, &act, &oldact); if (ret) { - perror("os_mem_prealloc: failed to install signal handler"); - exit(1); + error_setg_errno(errp, errno, + "os_mem_prealloc: failed to install signal handler"); + return; } /* unblock SIGBUS */ @@ -340,9 +341,8 @@ void os_mem_prealloc(int fd, char *area, size_t memory) pthread_sigmask(SIG_UNBLOCK, &set, &oldset); if (sigsetjmp(sigjump, 1)) { - fprintf(stderr, "os_mem_prealloc: Insufficient free host memory " - "pages available to allocate guest RAM\n"); - exit(1); + error_setg(errp, "os_mem_prealloc: Insufficient free host memory " + "pages available to allocate guest RAM\n"); } else { int i; size_t hpagesize = qemu_fd_getpagesize(fd); @@ -352,15 +352,15 @@ void os_mem_prealloc(int fd, char *area, size_t memory) for (i = 0; i < numpages; i++) { memset(area + (hpagesize * i), 0, 1); } - - ret = sigaction(SIGBUS, &oldact, NULL); - if (ret) { - perror("os_mem_prealloc: failed to reinstall signal handler"); - exit(1); - } - - pthread_sigmask(SIG_SETMASK, &oldset, NULL); } + + ret = sigaction(SIGBUS, &oldact, NULL); + if (ret) { + /* Terminate QEMU since it can't recover from error */ + perror("os_mem_prealloc: failed to reinstall signal handler"); + exit(1); + } + pthread_sigmask(SIG_SETMASK, &oldset, NULL); } diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 6debc2b8a6..4c1dcf1e66 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -539,7 +539,7 @@ int getpagesize(void) return system_info.dwPageSize; } -void os_mem_prealloc(int fd, char *area, size_t memory) +void os_mem_prealloc(int fd, char *area, size_t memory, Error **errp) { int i; size_t pagesize = getpagesize(); From 3f822cff4473fd6313f6c06623edf148b3ad2cc6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Jul 2016 11:03:11 +0200 Subject: [PATCH 04/25] checkpatch: add check for bzero Tested-By: Peter Xu Signed-off-by: Paolo Bonzini --- scripts/checkpatch.pl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index afa7f79f83..b7cb4ab478 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2544,7 +2544,7 @@ sub process { } } -# check for non-portable ffs() calls that have portable alternatives in QEMU +# check for non-portable libc calls that have portable alternatives in QEMU if ($line =~ /\bffs\(/) { ERROR("use ctz32() instead of ffs()\n" . $herecurr); } @@ -2554,6 +2554,9 @@ sub process { if ($line =~ /\bffsll\(/) { ERROR("use ctz64() instead of ffsll()\n" . $herecurr); } + if ($line =~ /\bbzero\(/) { + ERROR("use memset() instead of bzero()\n" . $herecurr); + } } # If we have no input at all, then there is nothing to report on From 00432b69538559285ca55677c1002cad556aea50 Mon Sep 17 00:00:00 2001 From: Cao jin Date: Thu, 21 Jul 2016 18:33:32 +0800 Subject: [PATCH 05/25] util: drop inet_nonblocking_connect() It is never used; all nonblocking connect now goes through socket_connect(), which calls inet_connect_addr(). Cc: Daniel P. Berrange Cc: Gerd Hoffmann Cc: Paolo Bonzini Signed-off-by: Cao jin Message-Id: <1469097213-26441-2-git-send-email-caoj.fnst@cn.fujitsu.com> Signed-off-by: Paolo Bonzini --- include/qemu/sockets.h | 3 --- util/qemu-sockets.c | 30 ------------------------------ 2 files changed, 33 deletions(-) diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h index 5fe01fbc6c..2cbe6432cd 100644 --- a/include/qemu/sockets.h +++ b/include/qemu/sockets.h @@ -36,9 +36,6 @@ InetSocketAddress *inet_parse(const char *str, Error **errp); int inet_listen(const char *str, char *ostr, int olen, int socktype, int port_offset, Error **errp); int inet_connect(const char *str, Error **errp); -int inet_nonblocking_connect(const char *str, - NonBlockingConnectHandler *callback, - void *opaque, Error **errp); NetworkAddressFamily inet_netfamily(int family); diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index 777af49d53..2e0570b2b7 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -674,36 +674,6 @@ int inet_connect(const char *str, Error **errp) return sock; } -/** - * Create a non-blocking socket and connect it to an address. - * Calls the callback function with fd in case of success or -1 in case of - * error. - * - * @str: address string - * @callback: callback function that is called when connect completes, - * cannot be NULL. - * @opaque: opaque for callback function - * @errp: set in case of an error - * - * Returns: -1 on immediate error, file descriptor on success. - **/ -int inet_nonblocking_connect(const char *str, - NonBlockingConnectHandler *callback, - void *opaque, Error **errp) -{ - int sock = -1; - InetSocketAddress *addr; - - g_assert(callback != NULL); - - addr = inet_parse(str, errp); - if (addr != NULL) { - sock = inet_connect_saddr(addr, errp, callback, opaque); - qapi_free_InetSocketAddress(addr); - } - return sock; -} - #ifndef _WIN32 static int unix_listen_saddr(UnixSocketAddress *saddr, From f8ea7a8656a807a8a3e4e2389059679716a75baa Mon Sep 17 00:00:00 2001 From: Cao jin Date: Thu, 21 Jul 2016 18:33:33 +0800 Subject: [PATCH 06/25] util: drop unix_nonblocking_connect() It is never used; all nonblocking connect now goes through socket_connect(), which calls unix_connect_addr(). Cc: Daniel P. Berrange Cc: Gerd Hoffmann Cc: Paolo Bonzini Signed-off-by: Cao jin Message-Id: <1469097213-26441-3-git-send-email-caoj.fnst@cn.fujitsu.com> Signed-off-by: Paolo Bonzini --- include/qemu/sockets.h | 3 --- util/qemu-sockets.c | 16 ---------------- 2 files changed, 19 deletions(-) diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h index 2cbe6432cd..28a28c0717 100644 --- a/include/qemu/sockets.h +++ b/include/qemu/sockets.h @@ -41,9 +41,6 @@ NetworkAddressFamily inet_netfamily(int family); int unix_listen(const char *path, char *ostr, int olen, Error **errp); int unix_connect(const char *path, Error **errp); -int unix_nonblocking_connect(const char *str, - NonBlockingConnectHandler *callback, - void *opaque, Error **errp); SocketAddress *socket_parse(const char *str, Error **errp); int socket_connect(SocketAddress *addr, Error **errp, diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index 2e0570b2b7..58f9a2cce0 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -863,22 +863,6 @@ int unix_connect(const char *path, Error **errp) } -int unix_nonblocking_connect(const char *path, - NonBlockingConnectHandler *callback, - void *opaque, Error **errp) -{ - UnixSocketAddress *saddr; - int sock = -1; - - g_assert(callback != NULL); - - saddr = g_new0(UnixSocketAddress, 1); - saddr->path = g_strdup(path); - sock = unix_connect_saddr(saddr, errp, callback, opaque); - qapi_free_UnixSocketAddress(saddr); - return sock; -} - SocketAddress *socket_parse(const char *str, Error **errp) { SocketAddress *addr; From 767db021bc45affa7a7c20ea21730d2867a0c37f Mon Sep 17 00:00:00 2001 From: Cao jin Date: Mon, 25 Jul 2016 21:02:51 +0800 Subject: [PATCH 07/25] util: Drop inet_listen() Since commit e65c67e4, inet_listen() is not used anymore, and all inet listen operation goes through QIOChannel. Cc: Daniel P. Berrange Cc: Gerd Hoffmann Cc: Paolo Bonzini Cc: Eric Blake Signed-off-by: Cao jin Message-Id: <1469451771-1173-3-git-send-email-caoj.fnst@cn.fujitsu.com> Signed-off-by: Paolo Bonzini --- include/qemu/sockets.h | 2 -- util/qemu-sockets.c | 28 ---------------------------- 2 files changed, 30 deletions(-) diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h index 28a28c0717..9eb24707df 100644 --- a/include/qemu/sockets.h +++ b/include/qemu/sockets.h @@ -33,8 +33,6 @@ int socket_set_fast_reuse(int fd); typedef void NonBlockingConnectHandler(int fd, Error *err, void *opaque); InetSocketAddress *inet_parse(const char *str, Error **errp); -int inet_listen(const char *str, char *ostr, int olen, - int socktype, int port_offset, Error **errp); int inet_connect(const char *str, Error **errp); NetworkAddressFamily inet_netfamily(int family); diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index 58f9a2cce0..2aed799e97 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -624,34 +624,6 @@ fail: return NULL; } -int inet_listen(const char *str, char *ostr, int olen, - int socktype, int port_offset, Error **errp) -{ - char *optstr; - int sock = -1; - InetSocketAddress *addr; - - addr = inet_parse(str, errp); - if (addr != NULL) { - sock = inet_listen_saddr(addr, port_offset, true, errp); - if (sock != -1 && ostr) { - optstr = strchr(str, ','); - if (addr->ipv6) { - snprintf(ostr, olen, "[%s]:%s%s", - addr->host, - addr->port, - optstr ? optstr : ""); - } else { - snprintf(ostr, olen, "%s:%s%s", - addr->host, - addr->port, - optstr ? optstr : ""); - } - } - qapi_free_InetSocketAddress(addr); - } - return sock; -} /** * Create a blocking socket and connect it to an address. From 7266ae91a111001abda65c79299c9b7e365456b6 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Fri, 22 Jul 2016 12:36:30 -0400 Subject: [PATCH 08/25] qht: do not segfault when gathering stats from an uninitialized qht So far, QHT functions assume that the passed qht has previously been initialized--otherwise they segfault. This patch makes an exception for qht_statistics_init, with the goal of simplifying calling code. For instance, qht_statistics_init is called from the 'info jit' dump, and given that under KVM the TB qht is never initialized, we get a segfault. Thus, instead of complicating the 'info jit' code with additional checks, let's allow passing an uninitialized qht to qht_statistics_init. While at it, add a test for this to test-qht. Before the patch (for $ qemu -enable-kvm [...]): (qemu) info jit [...] direct jump count 0 (0%) (2 jumps=0 0%) Program received signal SIGSEGV, Segmentation fault. After the patch the "TB hash buckets", "TB hash occupancy" and "TB hash avg chain" lines are omitted. (qemu) info jit [...] direct jump count 0 (0%) (2 jumps=0 0%) TB hash buckets 0/0 (-nan% head buckets used) TB hash occupancy nan% avg chain occ. Histogram: (null) TB hash avg chain nan buckets. Histogram: (null) [...] Reported by: Changlong Xie Signed-off-by: Emilio G. Cota Message-Id: <1469205390-14369-1-git-send-email-cota@braap.org> [Extract printing statistics to an entirely separate function. - Paolo] Signed-off-by: Paolo Bonzini --- tests/test-qht.c | 4 +++ translate-all.c | 70 +++++++++++++++++++++++++++--------------------- util/qht.c | 7 ++++- 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/tests/test-qht.c b/tests/test-qht.c index f1d628371d..46a64b6731 100644 --- a/tests/test-qht.c +++ b/tests/test-qht.c @@ -95,8 +95,12 @@ static void iter_check(unsigned int count) static void qht_do_test(unsigned int mode, size_t init_entries) { + /* under KVM we might fetch stats from an uninitialized qht */ + check_n(0); + qht_init(&ht, 0, mode); + check_n(0); insert(0, N); check(0, N, true); check_n(N); diff --git a/translate-all.c b/translate-all.c index 0d47c1c0cf..efeba298b9 100644 --- a/translate-all.c +++ b/translate-all.c @@ -1663,15 +1663,50 @@ void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr) TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *)); } +static void print_qht_statistics(FILE *f, fprintf_function cpu_fprintf, + struct qht_stats hst) +{ + uint32_t hgram_opts; + size_t hgram_bins; + char *hgram; + + if (!hst.head_buckets) { + return; + } + cpu_fprintf(f, "TB hash buckets %zu/%zu (%0.2f%% head buckets used)\n", + hst.used_head_buckets, hst.head_buckets, + (double)hst.used_head_buckets / hst.head_buckets * 100); + + hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS; + hgram_opts |= QDIST_PR_100X | QDIST_PR_PERCENT; + if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) { + hgram_opts |= QDIST_PR_NODECIMAL; + } + hgram = qdist_pr(&hst.occupancy, 10, hgram_opts); + cpu_fprintf(f, "TB hash occupancy %0.2f%% avg chain occ. Histogram: %s\n", + qdist_avg(&hst.occupancy) * 100, hgram); + g_free(hgram); + + hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS; + hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain); + if (hgram_bins > 10) { + hgram_bins = 10; + } else { + hgram_bins = 0; + hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE; + } + hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts); + cpu_fprintf(f, "TB hash avg chain %0.3f buckets. Histogram: %s\n", + qdist_avg(&hst.chain), hgram); + g_free(hgram); +} + void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) { int i, target_code_size, max_target_code_size; int direct_jmp_count, direct_jmp2_count, cross_page; TranslationBlock *tb; struct qht_stats hst; - uint32_t hgram_opts; - size_t hgram_bins; - char *hgram; target_code_size = 0; max_target_code_size = 0; @@ -1724,34 +1759,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) tcg_ctx.tb_ctx.nb_tbs : 0); qht_statistics_init(&tcg_ctx.tb_ctx.htable, &hst); - - cpu_fprintf(f, "TB hash buckets %zu/%zu (%0.2f%% head buckets used)\n", - hst.used_head_buckets, hst.head_buckets, - (double)hst.used_head_buckets / hst.head_buckets * 100); - - hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS; - hgram_opts |= QDIST_PR_100X | QDIST_PR_PERCENT; - if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) { - hgram_opts |= QDIST_PR_NODECIMAL; - } - hgram = qdist_pr(&hst.occupancy, 10, hgram_opts); - cpu_fprintf(f, "TB hash occupancy %0.2f%% avg chain occ. Histogram: %s\n", - qdist_avg(&hst.occupancy) * 100, hgram); - g_free(hgram); - - hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS; - hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain); - if (hgram_bins > 10) { - hgram_bins = 10; - } else { - hgram_bins = 0; - hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE; - } - hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts); - cpu_fprintf(f, "TB hash avg chain %0.3f buckets. Histogram: %s\n", - qdist_avg(&hst.chain), hgram); - g_free(hgram); - + print_qht_statistics(f, cpu_fprintf, hst); qht_statistics_destroy(&hst); cpu_fprintf(f, "\nStatistics:\n"); diff --git a/util/qht.c b/util/qht.c index 28ce289245..16a8d7950e 100644 --- a/util/qht.c +++ b/util/qht.c @@ -789,11 +789,16 @@ void qht_statistics_init(struct qht *ht, struct qht_stats *stats) map = atomic_rcu_read(&ht->map); - stats->head_buckets = map->n_buckets; stats->used_head_buckets = 0; stats->entries = 0; qdist_init(&stats->chain); qdist_init(&stats->occupancy); + /* bail out if the qht has not yet been initialized */ + if (unlikely(map == NULL)) { + stats->head_buckets = 0; + return; + } + stats->head_buckets = map->n_buckets; for (i = 0; i < map->n_buckets; i++) { struct qht_bucket *head = &map->buckets[i]; From ba03584f4f88082368b2562e515c3d60421b68ce Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 23 Jul 2016 09:50:25 +0200 Subject: [PATCH 09/25] target-i386: fix typo in xsetbv implementation QEMU 2.6 added support for the XSAVE family of instructions, which includes the XSETBV instruction which allows setting the XCR0 register. But, when booting Linux kernels with XSAVE support enabled, I was getting very early crashes where the instruction pointer was set to 0x3. I tracked it down to a jump instruction generated by this: gen_jmp_im(s->pc - pc_start); where s->pc is pointing to the instruction after XSETBV and pc_start is pointing _at_ XSETBV. Subtract the two and you get 0x3. Whoops. The fix is to replace this typo with the pattern found everywhere else in the file when folks want to end the translation buffer. Richard Henderson confirmed that this is a bug and that this is the correct fix. Signed-off-by: Dave Hansen Cc: qemu-stable@nongnu.org Cc: Eduardo Habkost Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target-i386/translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target-i386/translate.c b/target-i386/translate.c index e81fce7bc2..fa2ac48173 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -7176,7 +7176,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_regs[R_ECX]); gen_helper_xsetbv(cpu_env, cpu_tmp2_i32, cpu_tmp1_i64); /* End TB because translation flags may change. */ - gen_jmp_im(s->pc - pc_start); + gen_jmp_im(s->pc - s->cs_base); gen_eob(s); break; From f9dbc19e8bf58d0cbc830083352475bb16f315c4 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 25 Jul 2016 11:03:43 -0400 Subject: [PATCH 10/25] qdist: fix memory leak during binning In qdist_bin__internal(), to->entries is initialized to a 1-element array, which we then leak when n == from->n. Fix it. Signed-off-by: Emilio G. Cota Message-Id: <1469459025-23606-2-git-send-email-cota@braap.org> Signed-off-by: Paolo Bonzini --- util/qdist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/qdist.c b/util/qdist.c index 56f573837d..eb2236cdc8 100644 --- a/util/qdist.c +++ b/util/qdist.c @@ -188,7 +188,7 @@ void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n) } } /* they're equally spaced, so copy the dist and bail out */ - to->entries = g_new(struct qdist_entry, from->n); + to->entries = g_realloc_n(to->entries, n, sizeof(*to->entries)); to->n = from->n; memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); return; From 071d4054770205ddb8a58a9e2735069d8fe52af1 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 25 Jul 2016 11:03:44 -0400 Subject: [PATCH 11/25] qdist: use g_renew and g_new instead of g_realloc and g_malloc. This is safer against overflow. g_renew is available in all version of glib, while g_realloc_n is only available in 2.24. Signed-off-by: Emilio G. Cota Message-Id: <1469459025-23606-3-git-send-email-cota@braap.org> [Rewritten to use g_new/g_renew. - Paolo] Signed-off-by: Paolo Bonzini --- util/qdist.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/util/qdist.c b/util/qdist.c index eb2236cdc8..e95722b8a1 100644 --- a/util/qdist.c +++ b/util/qdist.c @@ -16,7 +16,7 @@ void qdist_init(struct qdist *dist) { - dist->entries = g_malloc(sizeof(*dist->entries)); + dist->entries = g_new(struct qdist_entry, 1); dist->size = 1; dist->n = 0; } @@ -62,8 +62,7 @@ void qdist_add(struct qdist *dist, double x, long count) if (unlikely(dist->n == dist->size)) { dist->size *= 2; - dist->entries = g_realloc(dist->entries, - sizeof(*dist->entries) * (dist->size)); + dist->entries = g_renew(struct qdist_entry, dist->entries, dist->size); } dist->n++; entry = &dist->entries[dist->n - 1]; @@ -188,7 +187,7 @@ void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n) } } /* they're equally spaced, so copy the dist and bail out */ - to->entries = g_realloc_n(to->entries, n, sizeof(*to->entries)); + to->entries = g_renew(struct qdist_entry, to->entries, n); to->n = from->n; memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); return; From 11b7b07f8a15879134a54e73fade98d5e11e04f8 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Mon, 25 Jul 2016 11:03:45 -0400 Subject: [PATCH 12/25] qdist: return "(empty)" instead of NULL when printing an empty dist Printf'ing a NULL string is undefined behaviour. Avoid it. Reported-by: Peter Maydell Signed-off-by: Emilio G. Cota Message-Id: <1469459025-23606-4-git-send-email-cota@braap.org> Signed-off-by: Paolo Bonzini --- tests/test-qdist.c | 10 ++++++++-- util/qdist.c | 6 ++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test-qdist.c b/tests/test-qdist.c index 0298986ac9..9541ce31eb 100644 --- a/tests/test-qdist.c +++ b/tests/test-qdist.c @@ -360,10 +360,16 @@ static void test_none(void) g_assert(isnan(qdist_xmax(&dist))); pr = qdist_pr_plain(&dist, 0); - g_assert(pr == NULL); + g_assert_cmpstr(pr, ==, "(empty)"); + g_free(pr); pr = qdist_pr_plain(&dist, 2); - g_assert(pr == NULL); + g_assert_cmpstr(pr, ==, "(empty)"); + g_free(pr); + + pr = qdist_pr(&dist, 0, QDIST_PR_BORDER); + g_assert_cmpstr(pr, ==, "(empty)"); + g_free(pr); qdist_destroy(&dist); } diff --git a/util/qdist.c b/util/qdist.c index e95722b8a1..5f75e24c29 100644 --- a/util/qdist.c +++ b/util/qdist.c @@ -14,6 +14,8 @@ #define NAN (0.0 / 0.0) #endif +#define QDIST_EMPTY_STR "(empty)" + void qdist_init(struct qdist *dist) { dist->entries = g_new(struct qdist_entry, 1); @@ -233,7 +235,7 @@ char *qdist_pr_plain(const struct qdist *dist, size_t n) char *ret; if (dist->n == 0) { - return NULL; + return g_strdup(QDIST_EMPTY_STR); } qdist_bin__internal(&binned, dist, n); ret = qdist_pr_internal(&binned); @@ -308,7 +310,7 @@ char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt) GString *s; if (dist->n == 0) { - return NULL; + return g_strdup(QDIST_EMPTY_STR); } s = g_string_new(""); From 0b646f44d9976c33aee78b0427d1b357dde5dd50 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 1 Aug 2016 10:25:05 +0200 Subject: [PATCH 13/25] mptsas: really fix migration compatibility Commit 2e2aa316 removed internal flag msi_in_use, but it existed in vmstate. Restore it for migration to older QEMU versions. Reported-by: Amit Shah Suggested-by: Amit Shah Cc: Markus Armbruster Cc: Marcel Apfelbaum Cc: Paolo Bonzini Cc: Michael S. Tsirkin Cc: Amit Shah Cc: Cao jin Signed-off-by: Paolo Bonzini --- hw/scsi/mptsas.c | 4 +++- hw/scsi/mptsas.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/hw/scsi/mptsas.c b/hw/scsi/mptsas.c index bebe5130fe..0e0a22f696 100644 --- a/hw/scsi/mptsas.c +++ b/hw/scsi/mptsas.c @@ -1295,6 +1295,8 @@ static void mptsas_scsi_init(PCIDevice *dev, Error **errp) /* With msi=auto, we fall back to MSI off silently */ error_free(err); + /* Only used for migration. */ + s->msi_in_use = (ret == 0); } memory_region_init_io(&s->mmio_io, OBJECT(s), &mptsas_mmio_ops, s, @@ -1370,7 +1372,7 @@ static const VMStateDescription vmstate_mptsas = { .post_load = mptsas_post_load, .fields = (VMStateField[]) { VMSTATE_PCI_DEVICE(dev, MPTSASState), - VMSTATE_UNUSED(sizeof(bool)), /* Was msi_in_use */ + VMSTATE_BOOL(msi_in_use, MPTSASState), VMSTATE_UINT32(state, MPTSASState), VMSTATE_UINT8(who_init, MPTSASState), VMSTATE_UINT8(doorbell_state, MPTSASState), diff --git a/hw/scsi/mptsas.h b/hw/scsi/mptsas.h index da014a397e..0436a33911 100644 --- a/hw/scsi/mptsas.h +++ b/hw/scsi/mptsas.h @@ -31,6 +31,8 @@ struct MPTSASState { OnOffAuto msi; uint64_t sas_addr; + bool msi_in_use; + /* Doorbell register */ uint32_t state; uint8_t who_init; From 71ae65e552fc6e03572e430009b98b80b40f1c4d Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 27 Jul 2016 14:39:58 +0200 Subject: [PATCH 14/25] i2c: fix migration regression introduced by broadcast support QEMU fails migration with following error: qemu-system-x86_64: Missing section footer for i2c_bus qemu-system-x86_64: load of migration failed: Invalid argument when migrating from: qemu-system-x86_64-v2.6.0 -m 256M rhel72.img -M pc-i440fx-2.6 to qemu-system-x86_64-v2.7.0-rc0 -m 256M rhel72.img -M pc-i440fx-2.6 Regression is added by commit 2293c27f (i2c: implement broadcast write) Fix it by dropping 'broadcast' VMState introduced by 2293c27f and reuse broadcast 0x00 address as broadcast flag in bus->saved_address. Then if there were ongoing broadcast at migration time, set bus->saved_address to it and at i2c_slave_post_load() time check for it instead of transfering and using 'broadcast' VMState. As result of reusing existing saved_address VMState, no compat glue will be needed to keep forward/backward compatiblity. which makes fix much less intrusive. Signed-off-by: Igor Mammedov Message-Id: <1469623198-177227-1-git-send-email-imammedo@redhat.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Paolo Bonzini --- hw/i2c/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hw/i2c/core.c b/hw/i2c/core.c index abb3efb1db..4afbe0bde5 100644 --- a/hw/i2c/core.c +++ b/hw/i2c/core.c @@ -17,6 +17,8 @@ struct I2CNode { QLIST_ENTRY(I2CNode) next; }; +#define I2C_BROADCAST 0x00 + struct I2CBus { BusState qbus; @@ -47,6 +49,8 @@ static void i2c_bus_pre_save(void *opaque) if (!QLIST_EMPTY(&bus->current_devs)) { if (!bus->broadcast) { bus->saved_address = QLIST_FIRST(&bus->current_devs)->elt->address; + } else { + bus->saved_address = I2C_BROADCAST; } } } @@ -58,7 +62,6 @@ static const VMStateDescription vmstate_i2c_bus = { .pre_save = i2c_bus_pre_save, .fields = (VMStateField[]) { VMSTATE_UINT8(saved_address, I2CBus), - VMSTATE_BOOL(broadcast, I2CBus), VMSTATE_END_OF_LIST() } }; @@ -93,7 +96,7 @@ int i2c_start_transfer(I2CBus *bus, uint8_t address, int recv) I2CSlaveClass *sc; I2CNode *node; - if (address == 0x00) { + if (address == I2C_BROADCAST) { /* * This is a broadcast, the current_devs will be all the devices of the * bus. @@ -221,7 +224,8 @@ static int i2c_slave_post_load(void *opaque, int version_id) I2CNode *node; bus = I2C_BUS(qdev_get_parent_bus(DEVICE(dev))); - if ((bus->saved_address == dev->address) || (bus->broadcast)) { + if ((bus->saved_address == dev->address) || + (bus->saved_address == I2C_BROADCAST)) { node = g_malloc(sizeof(struct I2CNode)); node->elt = dev; QLIST_INSERT_HEAD(&bus->current_devs, node, next); From 5bee0f4717c4c67394aaade0c5a9cee3d42cc614 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 21 Jul 2016 13:34:45 -0600 Subject: [PATCH 15/25] nbd: Fix bad flag detection on server Commit ab7c548e added a check for invalid flags, but used an early return on error instead of properly going through the cleanup label. Signed-off-by: Eric Blake Message-Id: <1469129688-22848-2-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/server.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nbd/server.c b/nbd/server.c index 29e2099b5e..3c1e2b336b 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1057,7 +1057,8 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, if (request->type & ~NBD_CMD_MASK_COMMAND & ~NBD_CMD_FLAG_FUA) { LOG("unsupported flags (got 0x%x)", request->type & ~NBD_CMD_MASK_COMMAND); - return -EINVAL; + rc = -EINVAL; + goto out; } rc = 0; From 7423f417827146f956df820f172d0bf80a489495 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 21 Jul 2016 13:34:46 -0600 Subject: [PATCH 16/25] nbd: Limit nbdflags to 16 bits Rather than asserting that nbdflags is within range, just give it the correct type to begin with :) nbdflags corresponds to the per-export portion of NBD Protocol "transmission flags", which is 16 bits in response to NBD_OPT_EXPORT_NAME and NBD_OPT_GO. Furthermore, upstream NBD has never passed the global flags to the kernel via ioctl(NBD_SET_FLAGS) (the ioctl was first introduced in NBD 2.9.22; then a latent bug in NBD 3.1 actually tried to OR the global flags with the transmission flags, with the disaster that the addition of NBD_FLAG_NO_ZEROES in 3.9 caused all earlier NBD 3.x clients to treat every export as read-only; NBD 3.10 and later intentionally clip things to 16 bits to pass only transmission flags). Qemu should follow suit, since the current two global flags (NBD_FLAG_FIXED_NEWSTYLE and NBD_FLAG_NO_ZEROES) have no impact on the kernel's behavior during transmission. CC: qemu-stable@nongnu.org Signed-off-by: Eric Blake Message-Id: <1469129688-22848-3-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.h | 2 +- include/block/nbd.h | 6 +++--- nbd/client.c | 28 +++++++++++++++------------- nbd/server.c | 10 ++++------ qemu-nbd.c | 4 ++-- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/block/nbd-client.h b/block/nbd-client.h index fa9817b7d7..044aca4530 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -20,7 +20,7 @@ typedef struct NbdClientSession { QIOChannelSocket *sioc; /* The master data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ - uint32_t nbdflags; + uint16_t nbdflags; off_t size; CoMutex send_mutex; diff --git a/include/block/nbd.h b/include/block/nbd.h index cb91820f38..1897557a9b 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -90,11 +90,11 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, size_t niov, size_t length, bool do_read); -int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags, +int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, off_t *size, Error **errp); -int nbd_init(int fd, QIOChannelSocket *sioc, uint32_t flags, off_t size); +int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size); ssize_t nbd_send_request(QIOChannel *ioc, struct nbd_request *request); ssize_t nbd_receive_reply(QIOChannel *ioc, struct nbd_reply *reply); int nbd_client(int fd); @@ -104,7 +104,7 @@ typedef struct NBDExport NBDExport; typedef struct NBDClient NBDClient; NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size, - uint32_t nbdflags, void (*close)(NBDExport *), + uint16_t nbdflags, void (*close)(NBDExport *), Error **errp); void nbd_export_close(NBDExport *exp); void nbd_export_get(NBDExport *exp); diff --git a/nbd/client.c b/nbd/client.c index 78a7195c45..a92f1e2275 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -408,7 +408,7 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, } -int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags, +int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, off_t *size, Error **errp) @@ -468,7 +468,6 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags, uint32_t opt; uint32_t namesize; uint16_t globalflags; - uint16_t exportflags; bool fixedNewStyle = false; if (read_sync(ioc, &globalflags, sizeof(globalflags)) != @@ -477,7 +476,6 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags, goto fail; } globalflags = be16_to_cpu(globalflags); - *flags = globalflags << 16; TRACE("Global flags are %" PRIx32, globalflags); if (globalflags & NBD_FLAG_FIXED_NEWSTYLE) { fixedNewStyle = true; @@ -545,17 +543,15 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags, goto fail; } *size = be64_to_cpu(s); - TRACE("Size is %" PRIu64, *size); - if (read_sync(ioc, &exportflags, sizeof(exportflags)) != - sizeof(exportflags)) { + if (read_sync(ioc, flags, sizeof(*flags)) != sizeof(*flags)) { error_setg(errp, "Failed to read export flags"); goto fail; } - exportflags = be16_to_cpu(exportflags); - *flags |= exportflags; - TRACE("Export flags are %" PRIx16, exportflags); + be16_to_cpus(flags); } else if (magic == NBD_CLIENT_MAGIC) { + uint32_t oldflags; + if (name) { error_setg(errp, "Server does not support export names"); goto fail; @@ -572,16 +568,22 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags, *size = be64_to_cpu(s); TRACE("Size is %" PRIu64, *size); - if (read_sync(ioc, flags, sizeof(*flags)) != sizeof(*flags)) { + if (read_sync(ioc, &oldflags, sizeof(oldflags)) != sizeof(oldflags)) { error_setg(errp, "Failed to read export flags"); goto fail; } - *flags = be32_to_cpu(*flags); + be32_to_cpus(&oldflags); + if (oldflags & ~0xffff) { + error_setg(errp, "Unexpected export flags %0x" PRIx32, oldflags); + goto fail; + } + *flags = oldflags; } else { error_setg(errp, "Bad magic received"); goto fail; } + TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); if (read_sync(ioc, &buf, 124) != 124) { error_setg(errp, "Failed to read reserved block"); goto fail; @@ -593,7 +595,7 @@ fail: } #ifdef __linux__ -int nbd_init(int fd, QIOChannelSocket *sioc, uint32_t flags, off_t size) +int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) { unsigned long sectors = size / BDRV_SECTOR_SIZE; if (size / BDRV_SECTOR_SIZE != sectors) { @@ -689,7 +691,7 @@ int nbd_disconnect(int fd) } #else -int nbd_init(int fd, QIOChannelSocket *ioc, uint32_t flags, off_t size) +int nbd_init(int fd, QIOChannelSocket *ioc, uint16_t flags, off_t size) { return -ENOTSUP; } diff --git a/nbd/server.c b/nbd/server.c index 3c1e2b336b..80fbb4da1d 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -63,7 +63,7 @@ struct NBDExport { char *name; off_t dev_offset; off_t size; - uint32_t nbdflags; + uint16_t nbdflags; QTAILQ_HEAD(, NBDClient) clients; QTAILQ_ENTRY(NBDExport) next; @@ -544,8 +544,8 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) NBDClient *client = data->client; char buf[8 + 8 + 8 + 128]; int rc; - const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM | - NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA); + const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM | + NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA); bool oldStyle; /* Old style negotiation header without options @@ -575,7 +575,6 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) oldStyle = client->exp != NULL && !client->tlscreds; if (oldStyle) { - assert ((client->exp->nbdflags & ~65535) == 0); TRACE("advertising size %" PRIu64 " and flags %x", client->exp->size, client->exp->nbdflags | myflags); stq_be_p(buf + 8, NBD_CLIENT_MAGIC); @@ -606,7 +605,6 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) goto fail; } - assert ((client->exp->nbdflags & ~65535) == 0); TRACE("advertising size %" PRIu64 " and flags %x", client->exp->size, client->exp->nbdflags | myflags); stq_be_p(buf + 18, client->exp->size); @@ -810,7 +808,7 @@ static void nbd_eject_notifier(Notifier *n, void *data) } NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size, - uint32_t nbdflags, void (*close)(NBDExport *), + uint16_t nbdflags, void (*close)(NBDExport *), Error **errp) { NBDExport *exp = g_malloc0(sizeof(NBDExport)); diff --git a/qemu-nbd.c b/qemu-nbd.c index 321f02bd15..e3571c2025 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -251,7 +251,7 @@ static void *nbd_client_thread(void *arg) { char *device = arg; off_t size; - uint32_t nbdflags; + uint16_t nbdflags; QIOChannelSocket *sioc; int fd; int ret; @@ -465,7 +465,7 @@ int main(int argc, char **argv) BlockBackend *blk; BlockDriverState *bs; off_t dev_offset = 0; - uint32_t nbdflags = 0; + uint16_t nbdflags = 0; bool disconnect = false; const char *bindto = "0.0.0.0"; const char *port = NULL; From e9fd416e66539ad43bbab018f346cb164136c099 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 21 Jul 2016 13:34:47 -0600 Subject: [PATCH 17/25] osdep: Document differences in rounding macros Make it obvious which macros are safe in which situations. Useful since QEMU_ALIGN_UP and ROUND_UP both purport to do the same thing, but differ on whether the alignment must be a power of 2. Signed-off-by: Eric Blake Message-Id: <1469129688-22848-4-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- include/qemu/osdep.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index d7c111d169..9e9fa61546 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -158,7 +158,8 @@ extern int daemon(int, int); /* Round number down to multiple */ #define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m)) -/* Round number up to multiple */ +/* Round number up to multiple. Safe when m is not a power of 2 (see + * ROUND_UP for a faster version when a power of 2 is guaranteed) */ #define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m)) /* Check if n is a multiple of m */ @@ -175,6 +176,9 @@ extern int daemon(int, int); /* Check if pointer p is n-bytes aligned */ #define QEMU_PTR_IS_ALIGNED(p, n) QEMU_IS_ALIGNED((uintptr_t)(p), (n)) +/* Round number up to multiple. Requires that d be a power of 2 (see + * QEMU_ALIGN_UP for a safer but slower version on arbitrary + * numbers) */ #ifndef ROUND_UP #define ROUND_UP(n,d) (((n) + (d) - 1) & -(d)) #endif From b8d0a9804d0bd43c9b662a6917ae9cd514a54dff Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 21 Jul 2016 13:34:48 -0600 Subject: [PATCH 18/25] block: Cater to iscsi with non-power-of-2 discard Dell Equallogic iSCSI SANs have a very unusual advertised geometry: $ iscsi-inq -e 1 -c $((0xb0)) iscsi://XXX/0 wsnz:0 maximum compare and write length:1 optimal transfer length granularity:0 maximum transfer length:0 optimal transfer length:0 maximum prefetch xdread xdwrite transfer length:0 maximum unmap lba count:30720 maximum unmap block descriptor count:2 optimal unmap granularity:30720 ugavalid:1 unmap granularity alignment:0 maximum write same length:30720 which says that both the maximum and the optimal discard size is 15M. It is not immediately apparent if the device allows discard requests not aligned to the optimal size, nor if it allows discards at a finer granularity than the optimal size. I tried to find details in the SCSI Commands Reference Manual Rev. A on what valid values of maximum and optimal sizes are permitted, but while that document mentions a "Block Limits VPD Page", I couldn't actually find documentation of that page or what values it would have, or if a SCSI device has an advertisement of its minimal unmap granularity. So it is not obvious to me whether the Dell Equallogic device is compliance with the SCSI specification. Fortunately, it is easy enough to support non-power-of-2 sizing, even if it means we are less efficient than truly possible when targetting that device (for example, it means that we refuse to unmap anything that is not a multiple of 15M and aligned to a 15M boundary, even if the device truly does support a smaller granularity where unmapping actually works). Reported-by: Peter Lieven Signed-off-by: Eric Blake Message-Id: <1469129688-22848-5-git-send-email-eblake@redhat.com> Acked-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini --- block/io.c | 15 +++++++++------ include/block/block_int.h | 37 ++++++++++++++++++++----------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/block/io.c b/block/io.c index 7323f0fb7b..d5493ba349 100644 --- a/block/io.c +++ b/block/io.c @@ -1180,10 +1180,11 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int alignment = MAX(bs->bl.pwrite_zeroes_alignment, bs->bl.request_alignment); - assert(is_power_of_2(alignment)); - head = offset & (alignment - 1); - tail = (offset + count) & (alignment - 1); - max_write_zeroes &= ~(alignment - 1); + assert(alignment % bs->bl.request_alignment == 0); + head = offset % alignment; + tail = (offset + count) % alignment; + max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); + assert(max_write_zeroes >= bs->bl.request_alignment); while (count > 0 && !ret) { int num = count; @@ -2429,9 +2430,10 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, /* Discard is advisory, so ignore any unaligned head or tail */ align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); - assert(is_power_of_2(align)); - head = MIN(count, -offset & (align - 1)); + assert(align % bs->bl.request_alignment == 0); + head = offset % align; if (head) { + head = MIN(count, align - head); count -= head; offset += head; } @@ -2449,6 +2451,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), align); + assert(max_pdiscard); while (count > 0) { int ret; diff --git a/include/block/block_int.h b/include/block/block_int.h index 1fe0fd9ff6..47665be81e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -330,36 +330,39 @@ typedef struct BlockLimits { * otherwise. */ uint32_t request_alignment; - /* maximum number of bytes that can be discarded at once (since it - * is signed, it must be < 2G, if set), should be multiple of + /* Maximum number of bytes that can be discarded at once (since it + * is signed, it must be < 2G, if set). Must be multiple of * pdiscard_alignment, but need not be power of 2. May be 0 if no * inherent 32-bit limit */ int32_t max_pdiscard; - /* optimal alignment for discard requests in bytes, must be power - * of 2, less than max_pdiscard if that is set, and multiple of - * bl.request_alignment. May be 0 if bl.request_alignment is good - * enough */ + /* Optimal alignment for discard requests in bytes. A power of 2 + * is best but not mandatory. Must be a multiple of + * bl.request_alignment, and must be less than max_pdiscard if + * that is set. May be 0 if bl.request_alignment is good enough */ uint32_t pdiscard_alignment; - /* maximum number of bytes that can zeroized at once (since it is - * signed, it must be < 2G, if set), should be multiple of + /* Maximum number of bytes that can zeroized at once (since it is + * signed, it must be < 2G, if set). Must be multiple of * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */ int32_t max_pwrite_zeroes; - /* optimal alignment for write zeroes requests in bytes, must be - * power of 2, less than max_pwrite_zeroes if that is set, and - * multiple of bl.request_alignment. May be 0 if - * bl.request_alignment is good enough */ + /* Optimal alignment for write zeroes requests in bytes. A power + * of 2 is best but not mandatory. Must be a multiple of + * bl.request_alignment, and must be less than max_pwrite_zeroes + * if that is set. May be 0 if bl.request_alignment is good + * enough */ uint32_t pwrite_zeroes_alignment; - /* optimal transfer length in bytes (must be power of 2, and - * multiple of bl.request_alignment), or 0 if no preferred size */ + /* Optimal transfer length in bytes. A power of 2 is best but not + * mandatory. Must be a multiple of bl.request_alignment, or 0 if + * no preferred size */ uint32_t opt_transfer; - /* maximal transfer length in bytes (need not be power of 2, but - * should be multiple of opt_transfer), or 0 for no 32-bit limit. - * For now, anything larger than INT_MAX is clamped down. */ + /* Maximal transfer length in bytes. Need not be power of 2, but + * must be multiple of opt_transfer and bl.request_alignment, or 0 + * for no 32-bit limit. For now, anything larger than INT_MAX is + * clamped down. */ uint32_t max_transfer; /* memory alignment, in bytes so that no bounce buffer is needed */ From e061fa3ca9cf769aebcc2ef5db7fc385a810abb1 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Fri, 29 Jul 2016 09:29:13 +0200 Subject: [PATCH 19/25] fw_cfg: Make base type "fw_cfg" abstract Missed when commit 5712db6 split off "fw_cfg_io" and "fw_cfg_mem". Signed-off-by: Markus Armbruster Message-Id: <1469777353-9383-1-git-send-email-armbru@redhat.com> Reviewed-by: Laszlo Ersek Signed-off-by: Paolo Bonzini --- hw/nvram/fw_cfg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 2873030ade..f10d5ec9c8 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -990,6 +990,7 @@ static void fw_cfg_class_init(ObjectClass *klass, void *data) static const TypeInfo fw_cfg_info = { .name = TYPE_FW_CFG, .parent = TYPE_SYS_BUS_DEVICE, + .abstract = true, .instance_size = sizeof(FWCfgState), .class_init = fw_cfg_class_init, }; From 7298d4fd515c190b1b6c1266735f6212300313ae Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 29 Jul 2016 15:55:42 +0200 Subject: [PATCH 20/25] apic: fix broken migration for kvm-apic commit f6e98444 (apic: Use apic_id as apic's migration instance_id) breaks migration when in kernel irqchip is used for 2.6 and older machine types. It applies compat property only for userspace 'apic' type instead of applying it to all apic types inherited from 'apic-common' type as it was supposed to do. Fix it by setting compat property 'legacy-instance-id' for 'apic-common' type which affects inherited types (i.e. not only 'apic' but also 'kvm-apic' types) Signed-off-by: Igor Mammedov Message-Id: <1469800542-11402-1-git-send-email-imammedo@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Paolo Bonzini --- include/hw/i386/pc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index c87c5c1eec..74c175c1e5 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -388,7 +388,7 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); .value = "off",\ },\ {\ - .driver = "apic",\ + .driver = "apic-common",\ .property = "legacy-instance-id",\ .value = "on",\ }, From f99b86b94987561580a94838766458e1c7b8685d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 31 Jul 2016 22:18:05 +0800 Subject: [PATCH 21/25] x86: ioapic: ignore level irq during processing For level triggered interrupts, we will get Remote IRR bit cleared after guest kernel finished processing specific request. Before that, we should ignore the same interrupt from triggering again. Signed-off-by: Peter Xu Message-Id: <1469974685-4144-1-git-send-email-peterx@redhat.com> [Push new "if" up so that it covers KVM split irqchip as well. - Paolo] Signed-off-by: Paolo Bonzini --- hw/intc/ioapic.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 2d3282a864..a00d88210a 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -117,21 +117,25 @@ static void ioapic_service(IOAPICCommonState *s) s->ioredtbl[i] |= IOAPIC_LVT_REMOTE_IRR; } + if (coalesce) { + /* We are level triggered interrupts, and the + * guest should be still working on previous one, + * so skip it. */ + continue; + } + #ifdef CONFIG_KVM if (kvm_irqchip_is_split()) { if (info.trig_mode == IOAPIC_TRIGGER_EDGE) { kvm_set_irq(kvm_state, i, 1); kvm_set_irq(kvm_state, i, 0); } else { - if (!coalesce) { - kvm_set_irq(kvm_state, i, 1); - } + kvm_set_irq(kvm_state, i, 1); } continue; } -#else - (void)coalesce; #endif + /* No matter whether IR is enabled, we translate * the IOAPIC message into a MSI one, and its * address space will decide whether we need a From 20fd4b7b6d9282fe0cb83601f1821f31bd257458 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 1 Aug 2016 21:59:19 +0800 Subject: [PATCH 22/25] x86: ioapic: add support for explicit EOI Some old Linux kernels (upstream before v4.0), or any released RHEL kernels has problem in sending APIC EOI when IR is enabled. Meanwhile, many of them only support explicit EOI for IOAPIC, which is only introduced in IOAPIC version 0x20. This patch provide a way to boost QEMU IOAPIC to version 0x20, in order for QEMU to correctly receive EOI messages. Without boosting IOAPIC version to 0x20, kernels before commit d32932d ("x86/irq: Convert IOAPIC to use hierarchical irqdomain interfaces") will have trouble enabling both IR and level-triggered interrupt devices (like e1000). To upgrade IOAPIC to version 0x20, we need to specify: -global ioapic.version=0x20 To be compatible with old systems, 0x11 will still be the default IOAPIC version. Here 0x11 and 0x20 are the only versions to be supported. One thing to mention: this patch only applies to emulated IOAPIC. It does not affect kernel IOAPIC behavior. Signed-off-by: Peter Xu Message-Id: <1470059959-372-1-git-send-email-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- hw/intc/ioapic.c | 22 +++++++++++++++++++++- include/hw/i386/ioapic_internal.h | 4 ++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index a00d88210a..31791b0986 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -21,6 +21,7 @@ */ #include "qemu/osdep.h" +#include "qemu/error-report.h" #include "monitor/monitor.h" #include "hw/hw.h" #include "hw/i386/pc.h" @@ -269,7 +270,7 @@ ioapic_mem_read(void *opaque, hwaddr addr, unsigned int size) val = s->id << IOAPIC_ID_SHIFT; break; case IOAPIC_REG_VER: - val = IOAPIC_VERSION | + val = s->version | ((IOAPIC_NUM_PINS - 1) << IOAPIC_VER_ENTRIES_SHIFT); break; default: @@ -358,6 +359,13 @@ ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val, } } break; + case IOAPIC_EOI: + /* Explicit EOI is only supported for IOAPIC version 0x20 */ + if (size != 4 || s->version != 0x20) { + break; + } + ioapic_eoi_broadcast(val); + break; } ioapic_update_kvm_routes(s); @@ -391,6 +399,12 @@ static void ioapic_realize(DeviceState *dev, Error **errp) { IOAPICCommonState *s = IOAPIC_COMMON(dev); + if (s->version != 0x11 && s->version != 0x20) { + error_report("IOAPIC only supports version 0x11 or 0x20 " + "(default: 0x11)."); + exit(1); + } + memory_region_init_io(&s->io_memory, OBJECT(s), &ioapic_io_ops, s, "ioapic", 0x1000); @@ -401,6 +415,11 @@ static void ioapic_realize(DeviceState *dev, Error **errp) qemu_add_machine_init_done_notifier(&s->machine_done); } +static Property ioapic_properties[] = { + DEFINE_PROP_UINT8("version", IOAPICCommonState, version, 0x11), + DEFINE_PROP_END_OF_LIST(), +}; + static void ioapic_class_init(ObjectClass *klass, void *data) { IOAPICCommonClass *k = IOAPIC_COMMON_CLASS(klass); @@ -408,6 +427,7 @@ static void ioapic_class_init(ObjectClass *klass, void *data) k->realize = ioapic_realize; dc->reset = ioapic_reset_common; + dc->props = ioapic_properties; } static const TypeInfo ioapic_info = { diff --git a/include/hw/i386/ioapic_internal.h b/include/hw/i386/ioapic_internal.h index d89ea1b63b..a11d86de46 100644 --- a/include/hw/i386/ioapic_internal.h +++ b/include/hw/i386/ioapic_internal.h @@ -29,8 +29,6 @@ #define MAX_IOAPICS 1 -#define IOAPIC_VERSION 0x11 - #define IOAPIC_LVT_DEST_SHIFT 56 #define IOAPIC_LVT_DEST_IDX_SHIFT 48 #define IOAPIC_LVT_MASKED_SHIFT 16 @@ -71,6 +69,7 @@ #define IOAPIC_IOREGSEL 0x00 #define IOAPIC_IOWIN 0x10 +#define IOAPIC_EOI 0x40 #define IOAPIC_REG_ID 0x00 #define IOAPIC_REG_VER 0x01 @@ -109,6 +108,7 @@ struct IOAPICCommonState { uint32_t irr; uint64_t ioredtbl[IOAPIC_NUM_PINS]; Notifier machine_done; + uint8_t version; }; void ioapic_reset_common(DeviceState *dev); From f04ec5afbb7d60a56863add800fd90ceee66f362 Mon Sep 17 00:00:00 2001 From: Robert Ho Date: Tue, 26 Jul 2016 18:17:11 +0800 Subject: [PATCH 23/25] Reorganize help output of '-display' option The '-display' help information is not very correct. This patch sort it a little. Also, in its help information, reveals what implicit display option will be chosen if no definition. Signed-off-by: Robert Ho Message-Id: <1469528231-26206-1-git-send-email-robert.hu@intel.com> Reviewed-by: Markus Armbruster Signed-off-by: Paolo Bonzini --- qemu-options.hx | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index 8e0d9a51bf..a71aaf8ea8 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -927,10 +927,25 @@ ETEXI DEF("display", HAS_ARG, QEMU_OPTION_display, "-display sdl[,frame=on|off][,alt_grab=on|off][,ctrl_grab=on|off]\n" - " [,window_close=on|off]|curses|none|\n" - " gtk[,grab_on_hover=on|off]|\n" - " vnc=[,]\n" - " select display type\n", QEMU_ARCH_ALL) + " [,window_close=on|off][,gl=on|off]|curses|none|\n" + "-display gtk[,grab_on_hover=on|off][,gl=on|off]|\n" + "-display vnc=[,]\n" + "-display curses\n" + "-display none" + " select display type\n" + "The default display is equivalent to\n" +#if defined(CONFIG_GTK) + "\t\"-display gtk\"\n" +#elif defined(CONFIG_SDL) + "\t\"-display sdl\"\n" +#elif defined(CONFIG_COCOA) + "\t\"-display cocoa\"\n" +#elif defined(CONFIG_VNC) + "\t\"-vnc localhost:0,to=99,id=default\"\n" +#else + "\t\"-display none\"\n" +#endif + , QEMU_ARCH_ALL) STEXI @item -display @var{type} @findex -display @@ -977,7 +992,7 @@ the console and monitor. ETEXI DEF("curses", 0, QEMU_OPTION_curses, - "-curses use a curses/ncurses interface instead of SDL\n", + "-curses shorthand for -display curses\n", QEMU_ARCH_ALL) STEXI @item -curses @@ -1027,7 +1042,7 @@ Disable SDL window close capability. ETEXI DEF("sdl", 0, QEMU_OPTION_sdl, - "-sdl enable SDL\n", QEMU_ARCH_ALL) + "-sdl shorthand for -display sdl\n", QEMU_ARCH_ALL) STEXI @item -sdl @findex -sdl @@ -1224,7 +1239,7 @@ Set the initial graphical resolution and depth (PPC, SPARC only). ETEXI DEF("vnc", HAS_ARG, QEMU_OPTION_vnc , - "-vnc display start a VNC server on display\n", QEMU_ARCH_ALL) + "-vnc shorthand for -display vnc=\n", QEMU_ARCH_ALL) STEXI @item -vnc @var{display}[,@var{option}[,@var{option}[,...]]] @findex -vnc From 0d4104e5760221547fad158bbbb655a4e4c22b50 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 2 Aug 2016 11:41:41 +0800 Subject: [PATCH 24/25] qdev: Fix use after free in qdev_init_nofail error path Since 69382d8b (qdev: Fix object reference leak in case device.realize() fails), object_property_set_bool could release the object. The error path wants the type name, so hold an reference before realizing it. Cc: Igor Mammedov Signed-off-by: Fam Zheng Message-Id: <1470109301-12966-1-git-send-email-famz@redhat.com> Reviewed-by: John Snow Signed-off-by: Paolo Bonzini --- hw/core/qdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index ee4a083e64..57834423b9 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -354,12 +354,14 @@ void qdev_init_nofail(DeviceState *dev) assert(!dev->realized); + object_ref(OBJECT(dev)); object_property_set_bool(OBJECT(dev), true, "realized", &err); if (err) { error_reportf_err(err, "Initialization of device %s failed: ", object_get_typename(OBJECT(dev))); exit(1); } + object_unref(OBJECT(dev)); } void qdev_machine_creation_done(void) From e911765cbb9e9ddf5d952c88bb52180a62c6cea0 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Tue, 2 Aug 2016 12:41:20 +0300 Subject: [PATCH 25/25] util: Fix assertion in iov_copy() upon zero 'bytes' and non-zero 'offset' In cases where iov_copy() is passed with zero 'bytes' argument and a non-zero 'offset' argument, nothing gets copied - as expected. However no copy iterations are performed, so 'offset' is left unaltered, leading to the final assert(offset == 0) to fail. Instead, change the loop condition to continue as long as 'offset || bytes', similar to other iov_* functions. This ensures 'offset' gets zeroed (even if no actual copy is made), unless it is beyond end of source iov - which is asserted. Signed-off-by: Shmulik Ladkani Message-Id: <1470130880-1050-1-git-send-email-shmulik.ladkani@oracle.com> Signed-off-by: Paolo Bonzini --- util/iov.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/iov.c b/util/iov.c index 003fcce66f..74e6ca8ed7 100644 --- a/util/iov.c +++ b/util/iov.c @@ -247,7 +247,8 @@ unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, { size_t len; unsigned int i, j; - for (i = 0, j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) { + for (i = 0, j = 0; + i < iov_cnt && j < dst_iov_cnt && (offset || bytes); i++) { if (offset >= iov[i].iov_len) { offset -= iov[i].iov_len; continue;