From 7c4228b4771acddcb8815079bc116007cec8a1ff Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 15 Jan 2014 10:07:26 -0700 Subject: [PATCH 1/8] vfio: Destroy memory regions Somehow this has been lurking for a while; we remove our subregions from the base BAR and VGA region mappings, but we don't destroy them, creating a leak and more serious problems when we try to migrate after removing these devices. Add the trivial bit of final cleanup to remove these entirely. Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 9aecaa82bc..ec9f41b98d 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -1968,6 +1968,7 @@ static void vfio_vga_quirk_teardown(VFIODevice *vdev) while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) { VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks); memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem); + memory_region_destroy(&quirk->mem); QLIST_REMOVE(quirk, next); g_free(quirk); } @@ -1990,6 +1991,7 @@ static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr) while (!QLIST_EMPTY(&bar->quirks)) { VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); memory_region_del_subregion(&bar->mem, &quirk->mem); + memory_region_destroy(&quirk->mem); QLIST_REMOVE(quirk, next); g_free(quirk); } @@ -2412,10 +2414,12 @@ static void vfio_unmap_bar(VFIODevice *vdev, int nr) memory_region_del_subregion(&bar->mem, &bar->mmap_mem); munmap(bar->mmap, memory_region_size(&bar->mmap_mem)); + memory_region_destroy(&bar->mmap_mem); if (vdev->msix && vdev->msix->table_bar == nr) { memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem); munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem)); + memory_region_destroy(&vdev->msix->mmap_mem); } memory_region_destroy(&bar->mem); From d20b43dfea1587b561aae17e4fa0f7407779d253 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 15 Jan 2014 10:11:06 -0700 Subject: [PATCH 2/8] vfio: warn if host device rom can't be read If the device rom can't be read, report an error to the user. This alerts the user that the device has a bad state that is causing rom read failure or option rom loading has been disabled from the device boot menu (among other reasons). Signed-off-by: Bandan Das Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index ec9f41b98d..ef615fc28f 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -1125,6 +1125,13 @@ static void vfio_pci_load_rom(VFIODevice *vdev) vdev->rom_offset = reg_info.offset; if (!vdev->rom_size) { + error_report("vfio-pci: Cannot read device rom at " + "%04x:%02x:%02x.%x\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + error_printf("Device option ROM contents are probably invalid " + "(check dmesg).\nSkip option ROM probe with rombar=0, " + "or load from file with romfile=\n"); return; } From e638073c569e801ce9def2016a84f955cbbca779 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 15 Jan 2014 10:11:52 -0700 Subject: [PATCH 3/8] vfio: Do not reattempt a failed rom read During lazy rom loading, if rom read fails, and the guest attempts a read again, vfio will again attempt it. Add a boolean to prevent this. There could be a case where a failed rom read might succeed the next time because of a device reset or such, but it's best to exclude unpredictable behavior Signed-off-by: Bandan Das Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index ef615fc28f..30b1a78f9c 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -191,6 +191,7 @@ typedef struct VFIODevice { bool has_flr; bool has_pm_reset; bool needs_reset; + bool rom_read_failed; } VFIODevice; typedef struct VFIOGroup { @@ -1125,6 +1126,7 @@ static void vfio_pci_load_rom(VFIODevice *vdev) vdev->rom_offset = reg_info.offset; if (!vdev->rom_size) { + vdev->rom_read_failed = true; error_report("vfio-pci: Cannot read device rom at " "%04x:%02x:%02x.%x\n", vdev->host.domain, vdev->host.bus, vdev->host.slot, @@ -1163,6 +1165,9 @@ static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) /* Load the ROM lazily when the guest tries to read it */ if (unlikely(!vdev->rom)) { vfio_pci_load_rom(vdev); + if (unlikely(!vdev->rom && !vdev->rom_read_failed)) { + vfio_pci_load_rom(vdev); + } } memcpy(&val, vdev->rom + addr, @@ -1230,6 +1235,7 @@ static void vfio_pci_size_rom(VFIODevice *vdev) PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom); vdev->pdev.has_rom = true; + vdev->rom_read_failed = false; } static void vfio_vga_write(void *opaque, hwaddr addr, From d3a2fd9b29e43e202315d5e99399b99622469c4a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 16 Jan 2014 09:22:07 -0700 Subject: [PATCH 4/8] vfio: Filter out bogus mappings Since 57271d63 we now see spurious mappings with the upper bits set if 64bit PCI BARs are sized while enabled. The guest writes a mask of 0xffffffff to the lower BAR to size it, then restores it, then writes the same mask to the upper BAR resulting in a spurious BAR mapping into the last 4G of the 64bit address space. Most architectures do not support or make use of the full 64bits address space for PCI BARs, so we filter out mappings with the high bit set. Long term, we probably need to think about vfio telling us the address width limitations of the IOMMU. Signed-off-by: Alex Williamson Reviewed-by: Michael S. Tsirkin --- hw/misc/vfio.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 30b1a78f9c..d304213bf1 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -2156,7 +2156,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, static bool vfio_listener_skipped_section(MemoryRegionSection *section) { - return !memory_region_is_ram(section->mr); + return !memory_region_is_ram(section->mr) || + /* + * Sizing an enabled 64-bit BAR can cause spurious mappings to + * addresses in the upper part of the 64-bit address space. These + * are never accessed by the CPU and beyond the address width of + * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width. + */ + section->offset_within_address_space & (1ULL << 63); } static void vfio_listener_region_add(MemoryListener *listener, From 87ca1f77b1c406137fe36ab73b2dc91fb75f8d0a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 16 Jan 2014 09:22:07 -0700 Subject: [PATCH 5/8] vfio-pci: Fail initfn on DMA mapping errors The vfio-pci initfn will currently succeed even if DMA mappings fail. A typical reason for failure is if the user does not have sufficient privilege to lock all the memory for the guest. In this case, the device gets attached, but can only access a portion of guest memory and is extremely unlikely to work. DMA mappings are done via a MemoryListener, which provides no direct error return path. We therefore stuff the errno into our container structure and check for error after registration completes. We can also test for mapping errors during runtime, but our only option for resolution at that point is to kill the guest with a hw_error. Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index d304213bf1..432547ce16 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -135,12 +135,18 @@ enum { struct VFIOGroup; +typedef struct VFIOType1 { + MemoryListener listener; + int error; + bool initialized; +} VFIOType1; + typedef struct VFIOContainer { int fd; /* /dev/vfio/vfio, empowered by the attached groups */ struct { /* enable abstraction to support various iommu backends */ union { - MemoryListener listener; /* Used by type1 iommu */ + VFIOType1 type1; }; void (*release)(struct VFIOContainer *); } iommu_data; @@ -2170,7 +2176,7 @@ static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, - iommu_data.listener); + iommu_data.type1.listener); hwaddr iova, end; void *vaddr; int ret; @@ -2212,6 +2218,19 @@ static void vfio_listener_region_add(MemoryListener *listener, error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%m)", container, iova, end - iova, vaddr, ret); + + /* + * On the initfn path, store the first error in the container so we + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. + */ + if (!container->iommu_data.type1.initialized) { + if (!container->iommu_data.type1.error) { + container->iommu_data.type1.error = ret; + } + } else { + hw_error("vfio: DMA mapping failed, unable to continue\n"); + } } } @@ -2219,7 +2238,7 @@ static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, - iommu_data.listener); + iommu_data.type1.listener); hwaddr iova, end; int ret; @@ -2264,7 +2283,7 @@ static MemoryListener vfio_memory_listener = { static void vfio_listener_release(VFIOContainer *container) { - memory_listener_unregister(&container->iommu_data.listener); + memory_listener_unregister(&container->iommu_data.type1.listener); } /* @@ -3236,10 +3255,23 @@ static int vfio_connect_container(VFIOGroup *group) return -errno; } - container->iommu_data.listener = vfio_memory_listener; + container->iommu_data.type1.listener = vfio_memory_listener; container->iommu_data.release = vfio_listener_release; - memory_listener_register(&container->iommu_data.listener, &address_space_memory); + memory_listener_register(&container->iommu_data.type1.listener, + &address_space_memory); + + if (container->iommu_data.type1.error) { + ret = container->iommu_data.type1.error; + vfio_listener_release(container); + g_free(container); + close(fd); + error_report("vfio: memory listener initialization failed for container\n"); + return ret; + } + + container->iommu_data.type1.initialized = true; + } else { error_report("vfio: No available IOMMU models"); g_free(container); From 47c16ed56aa6bc4037bdb7b61f049097993cd244 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 17 Jan 2014 11:12:07 -0700 Subject: [PATCH 6/8] kvm: initialize qemu_host_page_size There is a HOST_PAGE_ALIGN macro which makes sense for KVM accelerator but it uses qemu_host_page_size/qemu_host_page_mask which initialized for TCG only. This moves qemu_host_page_size/qemu_host_page_mask initialization from TCG's page_init() and adds a call for it from kvm_init(). Signed-off-by: Alexey Kardashevskiy Acked-by: Paolo Bonzini Signed-off-by: Alex Williamson --- include/exec/exec-all.h | 1 + kvm-all.c | 1 + translate-all.c | 14 ++++++++------ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index ea90b649d4..3b03cbfcf8 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -81,6 +81,7 @@ void cpu_gen_init(void); int cpu_gen_code(CPUArchState *env, struct TranslationBlock *tb, int *gen_code_size_ptr); bool cpu_restore_state(CPUArchState *env, uintptr_t searched_pc); +void page_size_init(void); void QEMU_NORETURN cpu_resume_from_signal(CPUArchState *env1, void *puc); void QEMU_NORETURN cpu_io_recompile(CPUArchState *env, uintptr_t retaddr); diff --git a/kvm-all.c b/kvm-all.c index 0bfb060fa7..edf2365b7a 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1360,6 +1360,7 @@ int kvm_init(void) * page size for the system though. */ assert(TARGET_PAGE_SIZE <= getpagesize()); + page_size_init(); #ifdef KVM_CAP_SET_GUEST_DEBUG QTAILQ_INIT(&s->kvm_sw_breakpoints); diff --git a/translate-all.c b/translate-all.c index 105c25aff3..543e1ffe77 100644 --- a/translate-all.c +++ b/translate-all.c @@ -289,17 +289,15 @@ static inline void map_exec(void *addr, long size) } #endif -static void page_init(void) +void page_size_init(void) { /* NOTE: we can always suppose that qemu_host_page_size >= TARGET_PAGE_SIZE */ #ifdef _WIN32 - { - SYSTEM_INFO system_info; + SYSTEM_INFO system_info; - GetSystemInfo(&system_info); - qemu_real_host_page_size = system_info.dwPageSize; - } + GetSystemInfo(&system_info); + qemu_real_host_page_size = system_info.dwPageSize; #else qemu_real_host_page_size = getpagesize(); #endif @@ -310,7 +308,11 @@ static void page_init(void) qemu_host_page_size = TARGET_PAGE_SIZE; } qemu_host_page_mask = ~(qemu_host_page_size - 1); +} +static void page_init(void) +{ + page_size_init(); #if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY) { #ifdef HAVE_KINFO_GETVMMAP From 8d7b5a1da0e06aa7addd7f084d9ec9d433c4bafb Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 17 Jan 2014 11:12:56 -0700 Subject: [PATCH 7/8] vfio: fix mapping of MSIX bar VFIO virtualizes MSIX table for the guest but not mapping the part of a BAR which contains an MSIX table. Since vfio_mmap_bar() mmaps chunks before and after the MSIX table, they have to be aligned to the host page size which may be TARGET_PAGE_MASK (4K) or 64K in case of PPC64. This fixes boundaries calculations to use the real host page size. Without the patch, the chunk before MSIX table may overlap with the MSIX table and mmap will fail in the host kernel. The result will be serious slowdown as the whole BAR will be emulated by QEMU. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 432547ce16..8a1f1a124d 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -2544,7 +2544,7 @@ static void vfio_map_bar(VFIODevice *vdev, int nr) * potentially insert a direct-mapped subregion before and after it. */ if (vdev->msix && vdev->msix->table_bar == nr) { - size = vdev->msix->table_offset & TARGET_PAGE_MASK; + size = vdev->msix->table_offset & qemu_host_page_mask; } strncat(name, " mmap", sizeof(name) - strlen(name) - 1); @@ -2556,8 +2556,8 @@ static void vfio_map_bar(VFIODevice *vdev, int nr) if (vdev->msix && vdev->msix->table_bar == nr) { unsigned start; - start = TARGET_PAGE_ALIGN(vdev->msix->table_offset + - (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); + start = HOST_PAGE_ALIGN(vdev->msix->table_offset + + (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); size = start < bar->size ? bar->size - start : 0; strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); From 8b6d14087d487203f4d1a67aeaddc3be6c73f49f Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 28 Jan 2014 08:23:19 -0700 Subject: [PATCH 8/8] vfio: correct debug macro typo Change to DEBUG_VFIO in vfio_msi_interrupt() for debug messages to get printed Signed-off-by: Bandan Das Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 8a1f1a124d..8db182fa3d 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -599,7 +599,7 @@ static void vfio_msi_interrupt(void *opaque) return; } -#ifdef VFIO_DEBUG +#ifdef DEBUG_VFIO MSIMessage msg; if (vdev->interrupt == VFIO_INT_MSIX) {