From 3d4f2534834cd9f9bbb3dd145fa61fd2ac0dd535 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 18 Jul 2016 15:19:04 +0200 Subject: [PATCH 1/5] ppc: Huge page detection mechanism fixes - Episode III After already fixing two issues with the huge page detection mechanism (see commit 159d2e39a860 and 86b50f2e1bef), Greg Kurz noticed another case that caused the guest to crash where QEMU announces huge pages though they should not be available for the guest: qemu-system-ppc64 -enable-kvm ... -mem-path /dev/hugepages \ -m 1G,slots=4,maxmem=32G -object memory-backend-ram,policy=default,size=1G,id=mem-mem1 \ -device pc-dimm,id=dimm-mem1,memdev=mem-mem1 -smp 2 \ -numa node,nodeid=0 -numa node,nodeid=1 That means if there is a global mem-path option, we still have to look at the memory-backend objects that have been specified additionally and return their minimum page size if that value is smaller than the page size of the main memory. Reported-by: Greg Kurz Signed-off-by: Thomas Huth Reviewed-by: Greg Kurz Tested-by: Greg Kurz Signed-off-by: David Gibson --- target-ppc/kvm.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 91e6daf4fd..84764edeae 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -366,10 +366,13 @@ static int find_max_supported_pagesize(Object *obj, void *opaque) static long getrampagesize(void) { long hpsize = LONG_MAX; + long mainrampagesize; Object *memdev_root; if (mem_path) { - return gethugepagesize(mem_path); + mainrampagesize = gethugepagesize(mem_path); + } else { + mainrampagesize = getpagesize(); } /* it's possible we have memory-backend objects with @@ -383,28 +386,26 @@ static long getrampagesize(void) * backend isn't backed by hugepages. */ memdev_root = object_resolve_path("/objects", NULL); - if (!memdev_root) { - return getpagesize(); + if (memdev_root) { + object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize); } - - object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize); - - if (hpsize == LONG_MAX || hpsize == getpagesize()) { - return getpagesize(); + if (hpsize == LONG_MAX) { + /* No additional memory regions found ==> Report main RAM page size */ + return mainrampagesize; } /* If NUMA is disabled or the NUMA nodes are not backed with a - * memory-backend, then there is at least one node using "normal" - * RAM. And since normal RAM has not been configured with "-mem-path" - * (what we've checked earlier here already), we can not use huge pages! + * memory-backend, then there is at least one node using "normal" RAM, + * so if its page size is smaller we have got to report that size instead. */ - if (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL) { + if (hpsize > mainrampagesize && + (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) { static bool warned; if (!warned) { error_report("Huge page support disabled (n/a for main memory)."); warned = true; } - return getpagesize(); + return mainrampagesize; } return hpsize; From c573fc03da65e35bf44bce0448ea12801e3631ac Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 21 Jul 2016 11:21:34 +0200 Subject: [PATCH 2/5] hw/ppc/spapr: Make sure to close the htab_fd when migration is canceled When canceling a migration process, we currently do not close the HTAB migration file descriptor since htab_save_complete() is never called in that case. So we leave the migration process with a dangling htab_fd value around, and this causes any further migration attempts to fail. To fix this issue, simply make sure that the htab_fd is closed during the migration cleanup stage. And since the cleanup() function is also called when migration succeeds, we can also remove the call to close_htab_fd() from the htab_save_complete() function. Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1354341 Signed-off-by: Thomas Huth Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 7f33a1b2b5..9193ac2c12 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1512,7 +1512,6 @@ static int htab_save_complete(QEMUFile *f, void *opaque) if (rc < 0) { return rc; } - close_htab_fd(spapr); } else { if (spapr->htab_first_pass) { htab_save_first_pass(f, spapr, -1); @@ -1614,10 +1613,18 @@ static int htab_load(QEMUFile *f, void *opaque, int version_id) return 0; } +static void htab_cleanup(void *opaque) +{ + sPAPRMachineState *spapr = opaque; + + close_htab_fd(spapr); +} + static SaveVMHandlers savevm_htab_handlers = { .save_live_setup = htab_save_setup, .save_live_iterate = htab_save_iterate, .save_live_complete_precopy = htab_save_complete, + .cleanup = htab_cleanup, .load_state = htab_load, }; From cf472f48d57a533cc46ee651159abcbe0b362df0 Mon Sep 17 00:00:00 2001 From: "lvivier@redhat.com" Date: Thu, 21 Jul 2016 14:05:46 +0200 Subject: [PATCH 3/5] spapr: fix spapr-nvram migration When spapr-nvram is backed by a file using pflash interface, migration fails on the destination guest with assert: bdrv_co_pwritev: Assertion `!(bs->open_flags & 0x0800)' failed. This avoids the problem by delaying the pflash update until after the device loads complete. This fix is similar to the one for the pflash_cfi01 migration: 90c647d Fix pflash migration Signed-off-by: Laurent Vivier Reviewed-by: Paolo Bonzini Signed-off-by: David Gibson --- hw/nvram/spapr_nvram.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c index 019f25dc58..4de5f705d8 100644 --- a/hw/nvram/spapr_nvram.c +++ b/hw/nvram/spapr_nvram.c @@ -39,6 +39,7 @@ typedef struct sPAPRNVRAM { uint32_t size; uint8_t *buf; BlockBackend *blk; + VMChangeStateEntry *vmstate; } sPAPRNVRAM; #define TYPE_VIO_SPAPR_NVRAM "spapr-nvram" @@ -185,19 +186,25 @@ static int spapr_nvram_pre_load(void *opaque) return 0; } +static void postload_update_cb(void *opaque, int running, RunState state) +{ + sPAPRNVRAM *nvram = opaque; + + /* This is called after bdrv_invalidate_cache_all. */ + + qemu_del_vm_change_state_handler(nvram->vmstate); + nvram->vmstate = NULL; + + blk_pwrite(nvram->blk, 0, nvram->buf, nvram->size, 0); +} + static int spapr_nvram_post_load(void *opaque, int version_id) { sPAPRNVRAM *nvram = VIO_SPAPR_NVRAM(opaque); if (nvram->blk) { - int alen = blk_pwrite(nvram->blk, 0, nvram->buf, nvram->size, 0); - - if (alen < 0) { - return alen; - } - if (alen != nvram->size) { - return -1; - } + nvram->vmstate = qemu_add_vm_change_state_handler(postload_update_cb, + nvram); } return 0; From 2fff4bad40abe5781d21c5e134161a1bd13c43cd Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 22 Jul 2016 18:53:51 +0200 Subject: [PATCH 4/5] target-ppc: add PPC_MFTB flag to e500mc and e5500 According to the e500mc and e5500 core reference manual they have support for the mftb instruction. Signed-off-by: Michael Walle Signed-off-by: David Gibson --- target-ppc/translate_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index 5ecafc7b8b..5f28a36998 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -5133,7 +5133,7 @@ POWERPC_FAMILY(e500mc)(ObjectClass *oc, void *data) dc->desc = "e500mc core"; pcc->init_proc = init_proc_e500mc; pcc->check_pow = check_pow_none; - pcc->insns_flags = PPC_INSNS_BASE | PPC_ISEL | + pcc->insns_flags = PPC_INSNS_BASE | PPC_ISEL | PPC_MFTB | PPC_WRTEE | PPC_RFDI | PPC_RFMCI | PPC_CACHE | PPC_CACHE_LOCK | PPC_CACHE_ICBI | PPC_CACHE_DCBZ | PPC_CACHE_DCBA | @@ -5179,7 +5179,7 @@ POWERPC_FAMILY(e5500)(ObjectClass *oc, void *data) dc->desc = "e5500 core"; pcc->init_proc = init_proc_e5500; pcc->check_pow = check_pow_none; - pcc->insns_flags = PPC_INSNS_BASE | PPC_ISEL | + pcc->insns_flags = PPC_INSNS_BASE | PPC_ISEL | PPC_MFTB | PPC_WRTEE | PPC_RFDI | PPC_RFMCI | PPC_CACHE | PPC_CACHE_LOCK | PPC_CACHE_ICBI | PPC_CACHE_DCBZ | PPC_CACHE_DCBA | From 12bf2d33fe520f9cfd09f7bf9d46ae3202c3cb49 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 22 Jul 2016 13:10:36 +0200 Subject: [PATCH 5/5] spapr: disintricate core-id from DT semantics The goal of this patch is to have a stable core-id which does not depend on any DT related semantics, which involve non-obvious computations on modern PowerPC server cpus. With this patch, the DT core id is computed on-demand as: (core-id / smp_threads) * smt where smt is the number of threads per core in the host. This formula should be consolidated in a helper since it is needed in several places. Other uses for core-id includes: compute a stable cpu_index (which allows random order hotplug/unplug without breaking migration) and NUMA. Signed-off-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr.c | 10 +++++----- hw/ppc/spapr_cpu_core.c | 24 +++++++++++------------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 9193ac2c12..fbbd0518ed 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1815,10 +1815,11 @@ static void ppc_spapr_init(MachineState *machine) spapr->cores = g_new0(Object *, spapr_max_cores); for (i = 0; i < spapr_max_cores; i++) { - int core_dt_id = i * smt; + int core_id = i * smp_threads; sPAPRDRConnector *drc = spapr_dr_connector_new(OBJECT(spapr), - SPAPR_DR_CONNECTOR_TYPE_CPU, core_dt_id); + SPAPR_DR_CONNECTOR_TYPE_CPU, + (core_id / smp_threads) * smt); qemu_register_reset(spapr_drc_reset, drc); @@ -1834,7 +1835,7 @@ static void ppc_spapr_init(MachineState *machine) core = object_new(type); object_property_set_int(core, smp_threads, "nr-threads", &error_fatal); - object_property_set_int(core, core_dt_id, CPU_CORE_PROP_CORE_ID, + object_property_set_int(core, core_id, CPU_CORE_PROP_CORE_ID, &error_fatal); object_property_set_bool(core, true, "realized", &error_fatal); } @@ -2376,7 +2377,6 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine) HotpluggableCPUList *head = NULL; sPAPRMachineState *spapr = SPAPR_MACHINE(machine); int spapr_max_cores = max_cpus / smp_threads; - int smt = kvmppc_smt_threads(); for (i = 0; i < spapr_max_cores; i++) { HotpluggableCPUList *list_item = g_new0(typeof(*list_item), 1); @@ -2386,7 +2386,7 @@ static HotpluggableCPUList *spapr_query_hotpluggable_cpus(MachineState *machine) cpu_item->type = spapr_get_cpu_core_type(machine->cpu_model); cpu_item->vcpus_count = smp_threads; cpu_props->has_core_id = true; - cpu_props->core_id = i * smt; + cpu_props->core_id = i * smp_threads; /* TODO: add 'has_node/node' here to describe to which node core belongs */ diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 4bfc96bd5a..c04aaa47d7 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -103,7 +103,6 @@ static void spapr_core_release(DeviceState *dev, void *opaque) size_t size = object_type_get_instance_size(typename); sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); CPUCore *cc = CPU_CORE(dev); - int smt = kvmppc_smt_threads(); int i; for (i = 0; i < cc->nr_threads; i++) { @@ -117,7 +116,7 @@ static void spapr_core_release(DeviceState *dev, void *opaque) object_unparent(obj); } - spapr->cores[cc->core_id / smt] = NULL; + spapr->cores[cc->core_id / smp_threads] = NULL; g_free(sc->threads); object_unparent(OBJECT(dev)); @@ -128,18 +127,19 @@ void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, { sPAPRMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); CPUCore *cc = CPU_CORE(dev); + int smt = kvmppc_smt_threads(); + int index = cc->core_id / smp_threads; sPAPRDRConnector *drc = - spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, cc->core_id); + spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, index * smt); sPAPRDRConnectorClass *drck; Error *local_err = NULL; - int smt = kvmppc_smt_threads(); - int index = cc->core_id / smt; int spapr_max_cores = max_cpus / smp_threads; int i; for (i = spapr_max_cores - 1; i > index; i--) { if (spapr->cores[i]) { - error_setg(errp, "core-id %d should be removed first", i * smt); + error_setg(errp, "core-id %d should be removed first", + i * smp_threads); return; } } @@ -168,11 +168,10 @@ void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error *local_err = NULL; void *fdt = NULL; int fdt_offset = 0; - int index; + int index = cc->core_id / smp_threads; int smt = kvmppc_smt_threads(); - drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, cc->core_id); - index = cc->core_id / smt; + drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, index * smt); spapr->cores[index] = OBJECT(dev); if (!smc->dr_cpu_enabled) { @@ -226,7 +225,6 @@ void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, sPAPRMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); int spapr_max_cores = max_cpus / smp_threads; int index, i; - int smt = kvmppc_smt_threads(); Error *local_err = NULL; CPUCore *cc = CPU_CORE(dev); char *base_core_type = spapr_get_cpu_core_type(machine->cpu_model); @@ -247,12 +245,12 @@ void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, goto out; } - if (cc->core_id % smt) { + if (cc->core_id % smp_threads) { error_setg(&local_err, "invalid core id %d\n", cc->core_id); goto out; } - index = cc->core_id / smt; + index = cc->core_id / smp_threads; if (index < 0 || index >= spapr_max_cores) { error_setg(&local_err, "core id %d out of range", cc->core_id); goto out; @@ -266,7 +264,7 @@ void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, for (i = 0; i < index; i++) { if (!spapr->cores[i]) { error_setg(&local_err, "core-id %d should be added first", - i * smt); + i * smp_threads); goto out; } }