From 62fe83318d2fc5b31f473d66326910d94c1c4907 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 10 Aug 2012 15:11:45 +0200 Subject: [PATCH 1/6] qemu: Use valgrind annotations to mark kvm guest memory as defined valgrind with kvm produces a big amount of false positives regarding "Conditional jump or move depends on uninitialised value(s)". This happens because the guest memory is allocated with qemu_vmalloc which boils down posix_memalign etc. This function is (correctly) considered by valgrind as returning undefined memory. Since valgrind is based on jitting code, it will not be able to see changes made by the guest to guest memory if this is done by KVM_RUN, thus keeping most of the guest memory undefined. Now lots of places in qemu will then use guest memory to change behaviour. To avoid the flood of these messages, lets declare the whole guest memory as defined. This will reduce the noise and allows us to see real problems. In the future we might want to make this conditional, since there is actually something that we can use those false positives for: These messages will point to code that depends on guest memory, so we can use these backtraces to actually make an audit that is focussed only at those code places. For normal development we dont want to see those messages, though. Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- configure | 3 ++- kvm-all.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/configure b/configure index d97fd81709..056b3561b9 100755 --- a/configure +++ b/configure @@ -2963,11 +2963,12 @@ if compile_prog "-Werror" "" ; then fi ######################################## -# check if we have valgrind/valgrind.h +# check if we have valgrind/valgrind.h and valgrind/memcheck.h valgrind_h=no cat > $TMPC << EOF #include +#include int main(void) { return 0; } diff --git a/kvm-all.c b/kvm-all.c index 34b02c1fba..e0244b6d1e 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -39,6 +39,10 @@ #include #endif +#ifdef CONFIG_VALGRIND_H +#include +#endif + /* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */ #define PAGE_SIZE TARGET_PAGE_SIZE @@ -1719,6 +1723,9 @@ void *kvm_vmalloc(ram_addr_t size) void kvm_setup_guest_memory(void *start, size_t size) { +#ifdef CONFIG_VALGRIND_H + VALGRIND_MAKE_MEM_DEFINED(start, size); +#endif if (!kvm_has_sync_mmu()) { int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK); From b131c74a0e485b084ddaffc8214c8a19af492be7 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 20 Aug 2012 10:55:56 +0200 Subject: [PATCH 2/6] kvm: Clean up irqfd API No need to expose the fd-based interface, everyone will already be fine with the more handy EventNotifier variant. Rename the latter to clarify that we are still talking about irqfds here. Signed-off-by: Jan Kiszka Acked-by: Alex Williamson Signed-off-by: Avi Kivity --- hw/virtio-pci.c | 4 ++-- kvm-all.c | 18 ++++-------------- kvm-stub.c | 14 ++------------ kvm.h | 6 ++---- 4 files changed, 10 insertions(+), 32 deletions(-) diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c index b3f0710f39..400f3c26b6 100644 --- a/hw/virtio-pci.c +++ b/hw/virtio-pci.c @@ -517,7 +517,7 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, } irqfd->users++; - ret = kvm_irqchip_add_irq_notifier(kvm_state, n, irqfd->virq); + ret = kvm_irqchip_add_irqfd_notifier(kvm_state, n, irqfd->virq); if (ret < 0) { if (--irqfd->users == 0) { kvm_irqchip_release_virq(kvm_state, irqfd->virq); @@ -538,7 +538,7 @@ static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy, VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; int ret; - ret = kvm_irqchip_remove_irq_notifier(kvm_state, n, irqfd->virq); + ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, n, irqfd->virq); assert(ret == 0); if (--irqfd->users == 0) { diff --git a/kvm-all.c b/kvm-all.c index e0244b6d1e..f96be21034 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1166,24 +1166,14 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) } #endif /* !KVM_CAP_IRQ_ROUTING */ -int kvm_irqchip_add_irqfd(KVMState *s, int fd, int virq) +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) { - return kvm_irqchip_assign_irqfd(s, fd, virq, true); + return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, true); } -int kvm_irqchip_add_irq_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) { - return kvm_irqchip_add_irqfd(s, event_notifier_get_fd(n), virq); -} - -int kvm_irqchip_remove_irqfd(KVMState *s, int fd, int virq) -{ - return kvm_irqchip_assign_irqfd(s, fd, virq, false); -} - -int kvm_irqchip_remove_irq_notifier(KVMState *s, EventNotifier *n, int virq) -{ - return kvm_irqchip_remove_irqfd(s, event_notifier_get_fd(n), virq); + return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, false); } static int kvm_irqchip_create(KVMState *s) diff --git a/kvm-stub.c b/kvm-stub.c index 94c9ea15b0..3c52eb5bc6 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -141,22 +141,12 @@ void kvm_irqchip_release_virq(KVMState *s, int virq) { } -int kvm_irqchip_add_irqfd(KVMState *s, int fd, int virq) +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) { return -ENOSYS; } -int kvm_irqchip_add_irq_notifier(KVMState *s, EventNotifier *n, int virq) -{ - return -ENOSYS; -} - -int kvm_irqchip_remove_irqfd(KVMState *s, int fd, int virq) -{ - return -ENOSYS; -} - -int kvm_irqchip_remove_irq_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) { return -ENOSYS; } diff --git a/kvm.h b/kvm.h index 5b8f588813..37d1f8166a 100644 --- a/kvm.h +++ b/kvm.h @@ -272,8 +272,6 @@ int kvm_set_ioeventfd_pio_word(int fd, uint16_t adr, uint16_t val, bool assign); int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg); void kvm_irqchip_release_virq(KVMState *s, int virq); -int kvm_irqchip_add_irqfd(KVMState *s, int fd, int virq); -int kvm_irqchip_remove_irqfd(KVMState *s, int fd, int virq); -int kvm_irqchip_add_irq_notifier(KVMState *s, EventNotifier *n, int virq); -int kvm_irqchip_remove_irq_notifier(KVMState *s, EventNotifier *n, int virq); +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq); +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq); #endif From cc57407e966e8467f9742e7a4ad47567b5951122 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Aug 2012 08:28:38 +0200 Subject: [PATCH 3/6] kvm: Introduce kvm_irqchip_update_msi_route This service allows to update an MSI route without releasing/reacquiring the associated VIRQ. Will be used by PCI device assignment, later on likely also by virtio/vhost and VFIO. Signed-off-by: Jan Kiszka Acked-by: Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- kvm-all.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kvm.h | 1 + 2 files changed, 43 insertions(+) diff --git a/kvm-all.c b/kvm-all.c index f96be21034..e2abfb98d6 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -963,6 +963,30 @@ static void kvm_add_routing_entry(KVMState *s, kvm_irqchip_commit_routes(s); } +static int kvm_update_routing_entry(KVMState *s, + struct kvm_irq_routing_entry *new_entry) +{ + struct kvm_irq_routing_entry *entry; + int n; + + for (n = 0; n < s->irq_routes->nr; n++) { + entry = &s->irq_routes->entries[n]; + if (entry->gsi != new_entry->gsi) { + continue; + } + + entry->type = new_entry->type; + entry->flags = new_entry->flags; + entry->u = new_entry->u; + + kvm_irqchip_commit_routes(s); + + return 0; + } + + return -ESRCH; +} + void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) { struct kvm_irq_routing_entry e; @@ -1125,6 +1149,24 @@ int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg) return virq; } +int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) +{ + struct kvm_irq_routing_entry kroute; + + if (!kvm_irqchip_in_kernel()) { + return -ENOSYS; + } + + kroute.gsi = virq; + kroute.type = KVM_IRQ_ROUTING_MSI; + kroute.flags = 0; + kroute.u.msi.address_lo = (uint32_t)msg.address; + kroute.u.msi.address_hi = msg.address >> 32; + kroute.u.msi.data = msg.data; + + return kvm_update_routing_entry(s, &kroute); +} + static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) { struct kvm_irqfd irqfd = { diff --git a/kvm.h b/kvm.h index 37d1f8166a..5cefe3a1a3 100644 --- a/kvm.h +++ b/kvm.h @@ -270,6 +270,7 @@ int kvm_set_ioeventfd_mmio(int fd, uint32_t adr, uint32_t val, bool assign, int kvm_set_ioeventfd_pio_word(int fd, uint16_t adr, uint16_t val, bool assign); int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg); +int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg); void kvm_irqchip_release_virq(KVMState *s, int virq); int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq); From 3ab73842446a9f2d9d78b23daa43ff382679eece Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Aug 2012 08:28:39 +0200 Subject: [PATCH 4/6] kvm: Introduce kvm_has_intx_set_mask Will be used by PCI device assignment code. Signed-off-by: Jan Kiszka Acked-by: Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- kvm-all.c | 8 ++++++++ kvm.h | 1 + 2 files changed, 9 insertions(+) diff --git a/kvm-all.c b/kvm-all.c index e2abfb98d6..39cff55f5b 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -88,6 +88,7 @@ struct KVMState int pit_state2; int xsave, xcrs; int many_ioeventfds; + int intx_set_mask; /* The man page (and posix) say ioctl numbers are signed int, but * they're not. Linux, glibc and *BSD all treat ioctl numbers as * unsigned, and treating them as signed here can break things */ @@ -1386,6 +1387,8 @@ int kvm_init(void) s->direct_msi = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); #endif + s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); + ret = kvm_arch_init(s); if (ret < 0) { goto err; @@ -1740,6 +1743,11 @@ int kvm_has_gsi_routing(void) #endif } +int kvm_has_intx_set_mask(void) +{ + return kvm_state->intx_set_mask; +} + void *kvm_vmalloc(ram_addr_t size) { #ifdef TARGET_S390X diff --git a/kvm.h b/kvm.h index 5cefe3a1a3..dea2998fd4 100644 --- a/kvm.h +++ b/kvm.h @@ -117,6 +117,7 @@ int kvm_has_xcrs(void); int kvm_has_pit_state2(void); int kvm_has_many_ioeventfds(void); int kvm_has_gsi_routing(void); +int kvm_has_intx_set_mask(void); #ifdef NEED_CPU_H int kvm_init_vcpu(CPUArchState *env); From b139bd300f4f579c526d153efa4960c380e2b6e3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Aug 2012 08:28:40 +0200 Subject: [PATCH 5/6] kvm: i386: Add services required for PCI device assignment These helpers abstract the interaction of upcoming pci-assign with the KVM kernel services. Put them under i386 only as other archs will implement device pass-through via VFIO and not this classic interface. Signed-off-by: Jan Kiszka Acked-by: Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- target-i386/kvm.c | 141 +++++++++++++++++++++++++++++++++++++++++ target-i386/kvm_i386.h | 22 +++++++ 2 files changed, 163 insertions(+) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index ffc294ec39..6790180b0a 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -31,6 +31,7 @@ #include "hw/apic.h" #include "ioport.h" #include "hyperv.h" +#include "hw/pci.h" //#define DEBUG_KVM @@ -2068,3 +2069,143 @@ void kvm_arch_init_irq_routing(KVMState *s) kvm_msi_via_irqfd_allowed = true; kvm_gsi_routing_allowed = true; } + +/* Classic KVM device assignment interface. Will remain x86 only. */ +int kvm_device_pci_assign(KVMState *s, PCIHostDeviceAddress *dev_addr, + uint32_t flags, uint32_t *dev_id) +{ + struct kvm_assigned_pci_dev dev_data = { + .segnr = dev_addr->domain, + .busnr = dev_addr->bus, + .devfn = PCI_DEVFN(dev_addr->slot, dev_addr->function), + .flags = flags, + }; + int ret; + + dev_data.assigned_dev_id = + (dev_addr->domain << 16) | (dev_addr->bus << 8) | dev_data.devfn; + + ret = kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data); + if (ret < 0) { + return ret; + } + + *dev_id = dev_data.assigned_dev_id; + + return 0; +} + +int kvm_device_pci_deassign(KVMState *s, uint32_t dev_id) +{ + struct kvm_assigned_pci_dev dev_data = { + .assigned_dev_id = dev_id, + }; + + return kvm_vm_ioctl(s, KVM_DEASSIGN_PCI_DEVICE, &dev_data); +} + +static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id, + uint32_t irq_type, uint32_t guest_irq) +{ + struct kvm_assigned_irq assigned_irq = { + .assigned_dev_id = dev_id, + .guest_irq = guest_irq, + .flags = irq_type, + }; + + if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) { + return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq); + } else { + return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq); + } +} + +int kvm_device_intx_assign(KVMState *s, uint32_t dev_id, bool use_host_msi, + uint32_t guest_irq) +{ + uint32_t irq_type = KVM_DEV_IRQ_GUEST_INTX | + (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX); + + return kvm_assign_irq_internal(s, dev_id, irq_type, guest_irq); +} + +int kvm_device_intx_set_mask(KVMState *s, uint32_t dev_id, bool masked) +{ + struct kvm_assigned_pci_dev dev_data = { + .assigned_dev_id = dev_id, + .flags = masked ? KVM_DEV_ASSIGN_MASK_INTX : 0, + }; + + return kvm_vm_ioctl(s, KVM_ASSIGN_SET_INTX_MASK, &dev_data); +} + +static int kvm_deassign_irq_internal(KVMState *s, uint32_t dev_id, + uint32_t type) +{ + struct kvm_assigned_irq assigned_irq = { + .assigned_dev_id = dev_id, + .flags = type, + }; + + return kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ, &assigned_irq); +} + +int kvm_device_intx_deassign(KVMState *s, uint32_t dev_id, bool use_host_msi) +{ + return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_INTX | + (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX)); +} + +int kvm_device_msi_assign(KVMState *s, uint32_t dev_id, int virq) +{ + return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSI | + KVM_DEV_IRQ_GUEST_MSI, virq); +} + +int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id) +{ + return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI | + KVM_DEV_IRQ_HOST_MSI); +} + +bool kvm_device_msix_supported(KVMState *s) +{ + /* The kernel lacks a corresponding KVM_CAP, so we probe by calling + * KVM_ASSIGN_SET_MSIX_NR with an invalid parameter. */ + return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, NULL) == -EFAULT; +} + +int kvm_device_msix_init_vectors(KVMState *s, uint32_t dev_id, + uint32_t nr_vectors) +{ + struct kvm_assigned_msix_nr msix_nr = { + .assigned_dev_id = dev_id, + .entry_nr = nr_vectors, + }; + + return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, &msix_nr); +} + +int kvm_device_msix_set_vector(KVMState *s, uint32_t dev_id, uint32_t vector, + int virq) +{ + struct kvm_assigned_msix_entry msix_entry = { + .assigned_dev_id = dev_id, + .gsi = virq, + .entry = vector, + }; + + return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_ENTRY, &msix_entry); +} + +int kvm_device_msix_assign(KVMState *s, uint32_t dev_id) +{ + return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSIX | + KVM_DEV_IRQ_GUEST_MSIX, 0); +} + +int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id) +{ + return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSIX | + KVM_DEV_IRQ_HOST_MSIX); +} diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h index b82bbf401e..f6ab82f93c 100644 --- a/target-i386/kvm_i386.h +++ b/target-i386/kvm_i386.h @@ -11,6 +11,28 @@ #ifndef QEMU_KVM_I386_H #define QEMU_KVM_I386_H +#include "kvm.h" + bool kvm_allows_irq0_override(void); +int kvm_device_pci_assign(KVMState *s, PCIHostDeviceAddress *dev_addr, + uint32_t flags, uint32_t *dev_id); +int kvm_device_pci_deassign(KVMState *s, uint32_t dev_id); + +int kvm_device_intx_assign(KVMState *s, uint32_t dev_id, + bool use_host_msi, uint32_t guest_irq); +int kvm_device_intx_set_mask(KVMState *s, uint32_t dev_id, bool masked); +int kvm_device_intx_deassign(KVMState *s, uint32_t dev_id, bool use_host_msi); + +int kvm_device_msi_assign(KVMState *s, uint32_t dev_id, int virq); +int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id); + +bool kvm_device_msix_supported(KVMState *s); +int kvm_device_msix_init_vectors(KVMState *s, uint32_t dev_id, + uint32_t nr_vectors); +int kvm_device_msix_set_vector(KVMState *s, uint32_t dev_id, uint32_t vector, + int virq); +int kvm_device_msix_assign(KVMState *s, uint32_t dev_id); +int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id); + #endif From c3ebd3ba786e1be5449527e47001eded6b59322b Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 30 Aug 2012 20:30:00 +0200 Subject: [PATCH 6/6] kvm: i386: Add classic PCI device assignment This adds PCI device assignment for i386 targets using the classic KVM interfaces. This version is 100% identical to what is being maintained in qemu-kvm for several years and is supported by libvirt as well. It is expected to remain relevant for another couple of years until kernels without full-features and performance-wise equivalent VFIO support are obsolete. A refactoring to-do that should be done in-tree is to model MSI and MSI-X support via the generic PCI layer, similar to what VFIO is already doing for MSI-X. This should improve the correctness and clean up the code from duplicate logic. Signed-off-by: Jan Kiszka Acked-by: Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- hw/kvm/Makefile.objs | 2 +- hw/kvm/pci-assign.c | 1915 ++++++++++++++++++++++++++++++++++++++++++ hw/qdev-monitor.c | 1 + 3 files changed, 1917 insertions(+), 1 deletion(-) create mode 100644 hw/kvm/pci-assign.c diff --git a/hw/kvm/Makefile.objs b/hw/kvm/Makefile.objs index 226497a58f..f620d7ff85 100644 --- a/hw/kvm/Makefile.objs +++ b/hw/kvm/Makefile.objs @@ -1 +1 @@ -obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o +obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o pci-assign.o diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c new file mode 100644 index 0000000000..05b93d9a51 --- /dev/null +++ b/hw/kvm/pci-assign.c @@ -0,0 +1,1915 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * + * Assign a PCI device from the host to a guest VM. + * + * This implementation uses the classic device assignment interface of KVM + * and is only available on x86 hosts. It is expected to be obsoleted by VFIO + * based device assignment. + * + * Adapted for KVM (qemu-kvm) by Qumranet. QEMU version was based on qemu-kvm + * revision 4144fe9d48. See its repository for the history. + * + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ +#include +#include +#include +#include +#include +#include +#include "hw/hw.h" +#include "hw/pc.h" +#include "qemu-error.h" +#include "console.h" +#include "hw/loader.h" +#include "monitor.h" +#include "range.h" +#include "sysemu.h" +#include "hw/pci.h" +#include "hw/msi.h" +#include "kvm_i386.h" + +#define MSIX_PAGE_SIZE 0x1000 + +/* From linux/ioport.h */ +#define IORESOURCE_IO 0x00000100 /* Resource type */ +#define IORESOURCE_MEM 0x00000200 +#define IORESOURCE_IRQ 0x00000400 +#define IORESOURCE_DMA 0x00000800 +#define IORESOURCE_PREFETCH 0x00002000 /* No side effects */ + +//#define DEVICE_ASSIGNMENT_DEBUG + +#ifdef DEVICE_ASSIGNMENT_DEBUG +#define DEBUG(fmt, ...) \ + do { \ + fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \ + } while (0) +#else +#define DEBUG(fmt, ...) +#endif + +typedef struct PCIRegion { + int type; /* Memory or port I/O */ + int valid; + uint64_t base_addr; + uint64_t size; /* size of the region */ + int resource_fd; +} PCIRegion; + +typedef struct PCIDevRegions { + uint8_t bus, dev, func; /* Bus inside domain, device and function */ + int irq; /* IRQ number */ + uint16_t region_number; /* number of active regions */ + + /* Port I/O or MMIO Regions */ + PCIRegion regions[PCI_NUM_REGIONS - 1]; + int config_fd; +} PCIDevRegions; + +typedef struct AssignedDevRegion { + MemoryRegion container; + MemoryRegion real_iomem; + union { + uint8_t *r_virtbase; /* mmapped access address for memory regions */ + uint32_t r_baseport; /* the base guest port for I/O regions */ + } u; + pcibus_t e_size; /* emulated size of region in bytes */ + pcibus_t r_size; /* real size of region in bytes */ + PCIRegion *region; +} AssignedDevRegion; + +#define ASSIGNED_DEVICE_PREFER_MSI_BIT 0 +#define ASSIGNED_DEVICE_SHARE_INTX_BIT 1 + +#define ASSIGNED_DEVICE_PREFER_MSI_MASK (1 << ASSIGNED_DEVICE_PREFER_MSI_BIT) +#define ASSIGNED_DEVICE_SHARE_INTX_MASK (1 << ASSIGNED_DEVICE_SHARE_INTX_BIT) + +typedef struct MSIXTableEntry { + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data; + uint32_t ctrl; +} MSIXTableEntry; + +typedef enum AssignedIRQType { + ASSIGNED_IRQ_NONE = 0, + ASSIGNED_IRQ_INTX_HOST_INTX, + ASSIGNED_IRQ_INTX_HOST_MSI, + ASSIGNED_IRQ_MSI, + ASSIGNED_IRQ_MSIX +} AssignedIRQType; + +typedef struct AssignedDevice { + PCIDevice dev; + PCIHostDeviceAddress host; + uint32_t dev_id; + uint32_t features; + int intpin; + AssignedDevRegion v_addrs[PCI_NUM_REGIONS - 1]; + PCIDevRegions real_device; + PCIINTxRoute intx_route; + AssignedIRQType assigned_irq_type; + struct { +#define ASSIGNED_DEVICE_CAP_MSI (1 << 0) +#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1) + uint32_t available; +#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0) +#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1) +#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2) + uint32_t state; + } cap; + uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE]; + uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE]; + int msi_virq_nr; + int *msi_virq; + MSIXTableEntry *msix_table; + target_phys_addr_t msix_table_addr; + uint16_t msix_max; + MemoryRegion mmio; + char *configfd_name; + int32_t bootindex; +} AssignedDevice; + +static void assigned_dev_update_irq_routing(PCIDevice *dev); + +static void assigned_dev_load_option_rom(AssignedDevice *dev); + +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev); + +static uint64_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region, + target_phys_addr_t addr, int size, + uint64_t *data) +{ + uint64_t val = 0; + int fd = dev_region->region->resource_fd; + + if (fd >= 0) { + if (data) { + DEBUG("pwrite data=%" PRIx64 ", size=%d, e_phys=" TARGET_FMT_plx + ", addr="TARGET_FMT_plx"\n", *data, size, addr, addr); + if (pwrite(fd, data, size, addr) != size) { + error_report("%s - pwrite failed %s", + __func__, strerror(errno)); + } + } else { + if (pread(fd, &val, size, addr) != size) { + error_report("%s - pread failed %s", + __func__, strerror(errno)); + val = (1UL << (size * 8)) - 1; + } + DEBUG("pread val=%" PRIx64 ", size=%d, e_phys=" TARGET_FMT_plx + ", addr=" TARGET_FMT_plx "\n", val, size, addr, addr); + } + } else { + uint32_t port = addr + dev_region->u.r_baseport; + + if (data) { + DEBUG("out data=%" PRIx64 ", size=%d, e_phys=" TARGET_FMT_plx + ", host=%x\n", *data, size, addr, port); + switch (size) { + case 1: + outb(*data, port); + break; + case 2: + outw(*data, port); + break; + case 4: + outl(*data, port); + break; + } + } else { + switch (size) { + case 1: + val = inb(port); + break; + case 2: + val = inw(port); + break; + case 4: + val = inl(port); + break; + } + DEBUG("in data=%" PRIx64 ", size=%d, e_phys=" TARGET_FMT_plx + ", host=%x\n", val, size, addr, port); + } + } + return val; +} + +static void assigned_dev_ioport_write(void *opaque, target_phys_addr_t addr, + uint64_t data, unsigned size) +{ + assigned_dev_ioport_rw(opaque, addr, size, &data); +} + +static uint64_t assigned_dev_ioport_read(void *opaque, + target_phys_addr_t addr, unsigned size) +{ + return assigned_dev_ioport_rw(opaque, addr, size, NULL); +} + +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr) +{ + AssignedDevRegion *d = opaque; + uint8_t *in = d->u.r_virtbase + addr; + uint32_t r; + + r = *in; + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); + + return r; +} + +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr) +{ + AssignedDevRegion *d = opaque; + uint16_t *in = (uint16_t *)(d->u.r_virtbase + addr); + uint32_t r; + + r = *in; + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); + + return r; +} + +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr) +{ + AssignedDevRegion *d = opaque; + uint32_t *in = (uint32_t *)(d->u.r_virtbase + addr); + uint32_t r; + + r = *in; + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r); + + return r; +} + +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val) +{ + AssignedDevRegion *d = opaque; + uint8_t *out = d->u.r_virtbase + addr; + + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val); + *out = val; +} + +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val) +{ + AssignedDevRegion *d = opaque; + uint16_t *out = (uint16_t *)(d->u.r_virtbase + addr); + + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val); + *out = val; +} + +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val) +{ + AssignedDevRegion *d = opaque; + uint32_t *out = (uint32_t *)(d->u.r_virtbase + addr); + + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val); + *out = val; +} + +static const MemoryRegionOps slow_bar_ops = { + .old_mmio = { + .read = { slow_bar_readb, slow_bar_readw, slow_bar_readl, }, + .write = { slow_bar_writeb, slow_bar_writew, slow_bar_writel, }, + }, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num, + pcibus_t e_size) +{ + AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; + PCIRegion *real_region = &r_dev->real_device.regions[region_num]; + + if (e_size > 0) { + memory_region_init(®ion->container, "assigned-dev-container", + e_size); + memory_region_add_subregion(®ion->container, 0, ®ion->real_iomem); + + /* deal with MSI-X MMIO page */ + if (real_region->base_addr <= r_dev->msix_table_addr && + real_region->base_addr + real_region->size > + r_dev->msix_table_addr) { + uint64_t offset = r_dev->msix_table_addr - real_region->base_addr; + + memory_region_add_subregion_overlap(®ion->container, + offset, + &r_dev->mmio, + 1); + } + } +} + +static const MemoryRegionOps assigned_dev_ioport_ops = { + .read = assigned_dev_ioport_read, + .write = assigned_dev_ioport_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void assigned_dev_ioport_setup(PCIDevice *pci_dev, int region_num, + pcibus_t size) +{ + AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; + + region->e_size = size; + memory_region_init(®ion->container, "assigned-dev-container", size); + memory_region_init_io(®ion->real_iomem, &assigned_dev_ioport_ops, + r_dev->v_addrs + region_num, + "assigned-dev-iomem", size); + memory_region_add_subregion(®ion->container, 0, ®ion->real_iomem); +} + +static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len) +{ + AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d); + uint32_t val; + ssize_t ret; + int fd = pci_dev->real_device.config_fd; + +again: + ret = pread(fd, &val, len, pos); + if (ret != len) { + if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) { + goto again; + } + + hw_error("pci read failed, ret = %zd errno = %d\n", ret, errno); + } + + return val; +} + +static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos) +{ + return (uint8_t)assigned_dev_pci_read(d, pos, 1); +} + +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len) +{ + AssignedDevice *pci_dev = DO_UPCAST(AssignedDevice, dev, d); + ssize_t ret; + int fd = pci_dev->real_device.config_fd; + +again: + ret = pwrite(fd, &val, len, pos); + if (ret != len) { + if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) { + goto again; + } + + hw_error("pci write failed, ret = %zd errno = %d\n", ret, errno); + } +} + +static void assigned_dev_emulate_config_read(AssignedDevice *dev, + uint32_t offset, uint32_t len) +{ + memset(dev->emulate_config_read + offset, 0xff, len); +} + +static void assigned_dev_direct_config_read(AssignedDevice *dev, + uint32_t offset, uint32_t len) +{ + memset(dev->emulate_config_read + offset, 0, len); +} + +static void assigned_dev_direct_config_write(AssignedDevice *dev, + uint32_t offset, uint32_t len) +{ + memset(dev->emulate_config_write + offset, 0, len); +} + +static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start) +{ + int id; + int max_cap = 48; + int pos = start ? start : PCI_CAPABILITY_LIST; + int status; + + status = assigned_dev_pci_read_byte(d, PCI_STATUS); + if ((status & PCI_STATUS_CAP_LIST) == 0) { + return 0; + } + + while (max_cap--) { + pos = assigned_dev_pci_read_byte(d, pos); + if (pos < 0x40) { + break; + } + + pos &= ~3; + id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID); + + if (id == 0xff) { + break; + } + if (id == cap) { + return pos; + } + + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} + +static int assigned_dev_register_regions(PCIRegion *io_regions, + unsigned long regions_num, + AssignedDevice *pci_dev) +{ + uint32_t i; + PCIRegion *cur_region = io_regions; + + for (i = 0; i < regions_num; i++, cur_region++) { + if (!cur_region->valid) { + continue; + } + + /* handle memory io regions */ + if (cur_region->type & IORESOURCE_MEM) { + int t = cur_region->type & IORESOURCE_PREFETCH + ? PCI_BASE_ADDRESS_MEM_PREFETCH + : PCI_BASE_ADDRESS_SPACE_MEMORY; + + /* map physical memory */ + pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size, + PROT_WRITE | PROT_READ, + MAP_SHARED, + cur_region->resource_fd, + (off_t)0); + + if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) { + pci_dev->v_addrs[i].u.r_virtbase = NULL; + error_report("%s: Error: Couldn't mmap 0x%" PRIx64 "!", + __func__, cur_region->base_addr); + return -1; + } + + pci_dev->v_addrs[i].r_size = cur_region->size; + pci_dev->v_addrs[i].e_size = 0; + + /* add offset */ + pci_dev->v_addrs[i].u.r_virtbase += + (cur_region->base_addr & 0xFFF); + + if (cur_region->size & 0xFFF) { + error_report("PCI region %d at address 0x%" PRIx64 " has " + "size 0x%" PRIx64 ", which is not a multiple of " + "4K. You might experience some performance hit " + "due to that.", + i, cur_region->base_addr, cur_region->size); + memory_region_init_io(&pci_dev->v_addrs[i].real_iomem, + &slow_bar_ops, &pci_dev->v_addrs[i], + "assigned-dev-slow-bar", + cur_region->size); + } else { + void *virtbase = pci_dev->v_addrs[i].u.r_virtbase; + char name[32]; + snprintf(name, sizeof(name), "%s.bar%d", + object_get_typename(OBJECT(pci_dev)), i); + memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem, + name, cur_region->size, + virtbase); + vmstate_register_ram(&pci_dev->v_addrs[i].real_iomem, + &pci_dev->dev.qdev); + } + + assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size); + pci_register_bar((PCIDevice *) pci_dev, i, t, + &pci_dev->v_addrs[i].container); + continue; + } else { + /* handle port io regions */ + uint32_t val; + int ret; + + /* Test kernel support for ioport resource read/write. Old + * kernels return EIO. New kernels only allow 1/2/4 byte reads + * so should return EINVAL for a 3 byte read */ + ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0); + if (ret >= 0) { + error_report("Unexpected return from I/O port read: %d", ret); + abort(); + } else if (errno != EINVAL) { + error_report("Kernel doesn't support ioport resource " + "access, hiding this region."); + close(pci_dev->v_addrs[i].region->resource_fd); + cur_region->valid = 0; + continue; + } + + pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr; + pci_dev->v_addrs[i].r_size = cur_region->size; + pci_dev->v_addrs[i].e_size = 0; + + assigned_dev_ioport_setup(&pci_dev->dev, i, cur_region->size); + pci_register_bar((PCIDevice *) pci_dev, i, + PCI_BASE_ADDRESS_SPACE_IO, + &pci_dev->v_addrs[i].container); + } + } + + /* success */ + return 0; +} + +static int get_real_id(const char *devpath, const char *idname, uint16_t *val) +{ + FILE *f; + char name[128]; + long id; + + snprintf(name, sizeof(name), "%s%s", devpath, idname); + f = fopen(name, "r"); + if (f == NULL) { + error_report("%s: %s: %m", __func__, name); + return -1; + } + if (fscanf(f, "%li\n", &id) == 1) { + *val = id; + } else { + return -1; + } + fclose(f); + + return 0; +} + +static int get_real_vendor_id(const char *devpath, uint16_t *val) +{ + return get_real_id(devpath, "vendor", val); +} + +static int get_real_device_id(const char *devpath, uint16_t *val) +{ + return get_real_id(devpath, "device", val); +} + +static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg, + uint8_t r_bus, uint8_t r_dev, uint8_t r_func) +{ + char dir[128], name[128]; + int fd, r = 0, v; + FILE *f; + uint64_t start, end, size, flags; + uint16_t id; + PCIRegion *rp; + PCIDevRegions *dev = &pci_dev->real_device; + + dev->region_number = 0; + + snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/", + r_seg, r_bus, r_dev, r_func); + + snprintf(name, sizeof(name), "%sconfig", dir); + + if (pci_dev->configfd_name && *pci_dev->configfd_name) { + if (qemu_isdigit(pci_dev->configfd_name[0])) { + dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0); + } else { + dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name); + if (dev->config_fd < 0) { + error_report("%s: (%s) unkown", __func__, + pci_dev->configfd_name); + return 1; + } + } + } else { + dev->config_fd = open(name, O_RDWR); + + if (dev->config_fd == -1) { + error_report("%s: %s: %m", __func__, name); + return 1; + } + } +again: + r = read(dev->config_fd, pci_dev->dev.config, + pci_config_size(&pci_dev->dev)); + if (r < 0) { + if (errno == EINTR || errno == EAGAIN) { + goto again; + } + error_report("%s: read failed, errno = %d", __func__, errno); + } + + /* Restore or clear multifunction, this is always controlled by qemu */ + if (pci_dev->dev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + pci_dev->dev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; + } else { + pci_dev->dev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION; + } + + /* Clear host resource mapping info. If we choose not to register a + * BAR, such as might be the case with the option ROM, we can get + * confusing, unwritable, residual addresses from the host here. */ + memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24); + memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4); + + snprintf(name, sizeof(name), "%sresource", dir); + + f = fopen(name, "r"); + if (f == NULL) { + error_report("%s: %s: %m", __func__, name); + return 1; + } + + for (r = 0; r < PCI_ROM_SLOT; r++) { + if (fscanf(f, "%" SCNi64 " %" SCNi64 " %" SCNi64 "\n", + &start, &end, &flags) != 3) { + break; + } + + rp = dev->regions + r; + rp->valid = 0; + rp->resource_fd = -1; + size = end - start + 1; + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) { + continue; + } + if (flags & IORESOURCE_MEM) { + flags &= ~IORESOURCE_IO; + } else { + flags &= ~IORESOURCE_PREFETCH; + } + snprintf(name, sizeof(name), "%sresource%d", dir, r); + fd = open(name, O_RDWR); + if (fd == -1) { + continue; + } + rp->resource_fd = fd; + + rp->type = flags; + rp->valid = 1; + rp->base_addr = start; + rp->size = size; + pci_dev->v_addrs[r].region = rp; + DEBUG("region %d size %" PRIu64 " start 0x%" PRIx64 + " type %d resource_fd %d\n", + r, rp->size, start, rp->type, rp->resource_fd); + } + + fclose(f); + + /* read and fill vendor ID */ + v = get_real_vendor_id(dir, &id); + if (v) { + return 1; + } + pci_dev->dev.config[0] = id & 0xff; + pci_dev->dev.config[1] = (id & 0xff00) >> 8; + + /* read and fill device ID */ + v = get_real_device_id(dir, &id); + if (v) { + return 1; + } + pci_dev->dev.config[2] = id & 0xff; + pci_dev->dev.config[3] = (id & 0xff00) >> 8; + + pci_word_test_and_clear_mask(pci_dev->emulate_config_write + PCI_COMMAND, + PCI_COMMAND_MASTER | PCI_COMMAND_INTX_DISABLE); + + dev->region_number = r; + return 0; +} + +static void free_msi_virqs(AssignedDevice *dev) +{ + int i; + + for (i = 0; i < dev->msi_virq_nr; i++) { + if (dev->msi_virq[i] >= 0) { + kvm_irqchip_release_virq(kvm_state, dev->msi_virq[i]); + dev->msi_virq[i] = -1; + } + } + g_free(dev->msi_virq); + dev->msi_virq = NULL; + dev->msi_virq_nr = 0; +} + +static void free_assigned_device(AssignedDevice *dev) +{ + int i; + + if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { + assigned_dev_unregister_msix_mmio(dev); + } + for (i = 0; i < dev->real_device.region_number; i++) { + PCIRegion *pci_region = &dev->real_device.regions[i]; + AssignedDevRegion *region = &dev->v_addrs[i]; + + if (!pci_region->valid) { + continue; + } + if (pci_region->type & IORESOURCE_IO) { + if (region->u.r_baseport) { + memory_region_del_subregion(®ion->container, + ®ion->real_iomem); + memory_region_destroy(®ion->real_iomem); + memory_region_destroy(®ion->container); + } + } else if (pci_region->type & IORESOURCE_MEM) { + if (region->u.r_virtbase) { + memory_region_del_subregion(®ion->container, + ®ion->real_iomem); + + /* Remove MSI-X table subregion */ + if (pci_region->base_addr <= dev->msix_table_addr && + pci_region->base_addr + pci_region->size > + dev->msix_table_addr) { + memory_region_del_subregion(®ion->container, + &dev->mmio); + } + + memory_region_destroy(®ion->real_iomem); + memory_region_destroy(®ion->container); + if (munmap(region->u.r_virtbase, + (pci_region->size + 0xFFF) & 0xFFFFF000)) { + error_report("Failed to unmap assigned device region: %s", + strerror(errno)); + } + } + } + if (pci_region->resource_fd >= 0) { + close(pci_region->resource_fd); + } + } + + if (dev->real_device.config_fd >= 0) { + close(dev->real_device.config_fd); + } + + free_msi_virqs(dev); +} + +static void assign_failed_examine(AssignedDevice *dev) +{ + char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns; + uint16_t vendor_id, device_id; + int r; + + snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", + dev->host.domain, dev->host.bus, dev->host.slot, + dev->host.function); + + snprintf(name, sizeof(name), "%sdriver", dir); + + r = readlink(name, driver, sizeof(driver)); + if ((r <= 0) || r >= sizeof(driver)) { + goto fail; + } + + ns = strrchr(driver, '/'); + if (!ns) { + goto fail; + } + + ns++; + + if (get_real_vendor_id(dir, &vendor_id) || + get_real_device_id(dir, &device_id)) { + goto fail; + } + + error_report("*** The driver '%s' is occupying your device " + "%04x:%02x:%02x.%x.", + ns, dev->host.domain, dev->host.bus, dev->host.slot, + dev->host.function); + error_report("***"); + error_report("*** You can try the following commands to free it:"); + error_report("***"); + error_report("*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/" + "new_id", vendor_id, device_id); + error_report("*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/" + "%s/unbind", + dev->host.domain, dev->host.bus, dev->host.slot, + dev->host.function, ns); + error_report("*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/" + "pci-stub/bind", + dev->host.domain, dev->host.bus, dev->host.slot, + dev->host.function); + error_report("*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub" + "/remove_id", vendor_id, device_id); + error_report("***"); + + return; + +fail: + error_report("Couldn't find out why."); +} + +static int assign_device(AssignedDevice *dev) +{ + uint32_t flags = KVM_DEV_ASSIGN_ENABLE_IOMMU; + int r; + + /* Only pass non-zero PCI segment to capable module */ + if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) && + dev->host.domain) { + error_report("Can't assign device inside non-zero PCI segment " + "as this KVM module doesn't support it."); + return -ENODEV; + } + + if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) { + error_report("No IOMMU found. Unable to assign device \"%s\"", + dev->dev.qdev.id); + return -ENODEV; + } + + if (dev->features & ASSIGNED_DEVICE_SHARE_INTX_MASK && + kvm_has_intx_set_mask()) { + flags |= KVM_DEV_ASSIGN_PCI_2_3; + } + + r = kvm_device_pci_assign(kvm_state, &dev->host, flags, &dev->dev_id); + if (r < 0) { + error_report("Failed to assign device \"%s\" : %s", + dev->dev.qdev.id, strerror(-r)); + + switch (r) { + case -EBUSY: + assign_failed_examine(dev); + break; + default: + break; + } + } + return r; +} + +static bool check_irqchip_in_kernel(void) +{ + if (kvm_irqchip_in_kernel()) { + return true; + } + error_report("pci-assign: error: requires KVM with in-kernel irqchip " + "enabled"); + return false; +} + +static int assign_intx(AssignedDevice *dev) +{ + AssignedIRQType new_type; + PCIINTxRoute intx_route; + bool intx_host_msi; + int r; + + /* Interrupt PIN 0 means don't use INTx */ + if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0) { + pci_device_set_intx_routing_notifier(&dev->dev, NULL); + return 0; + } + + if (!check_irqchip_in_kernel()) { + return -ENOTSUP; + } + + pci_device_set_intx_routing_notifier(&dev->dev, + assigned_dev_update_irq_routing); + + intx_route = pci_device_route_intx_to_irq(&dev->dev, dev->intpin); + assert(intx_route.mode != PCI_INTX_INVERTED); + + if (dev->intx_route.mode == intx_route.mode && + dev->intx_route.irq == intx_route.irq) { + return 0; + } + + switch (dev->assigned_irq_type) { + case ASSIGNED_IRQ_INTX_HOST_INTX: + case ASSIGNED_IRQ_INTX_HOST_MSI: + intx_host_msi = dev->assigned_irq_type == ASSIGNED_IRQ_INTX_HOST_MSI; + r = kvm_device_intx_deassign(kvm_state, dev->dev_id, intx_host_msi); + break; + case ASSIGNED_IRQ_MSI: + r = kvm_device_msi_deassign(kvm_state, dev->dev_id); + break; + case ASSIGNED_IRQ_MSIX: + r = kvm_device_msix_deassign(kvm_state, dev->dev_id); + break; + default: + r = 0; + break; + } + if (r) { + perror("assign_intx: deassignment of previous interrupt failed"); + } + dev->assigned_irq_type = ASSIGNED_IRQ_NONE; + + if (intx_route.mode == PCI_INTX_DISABLED) { + dev->intx_route = intx_route; + return 0; + } + +retry: + if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK && + dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { + intx_host_msi = true; + new_type = ASSIGNED_IRQ_INTX_HOST_MSI; + } else { + intx_host_msi = false; + new_type = ASSIGNED_IRQ_INTX_HOST_INTX; + } + + r = kvm_device_intx_assign(kvm_state, dev->dev_id, intx_host_msi, + intx_route.irq); + if (r < 0) { + if (r == -EIO && !(dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK) && + dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { + /* Retry with host-side MSI. There might be an IRQ conflict and + * either the kernel or the device doesn't support sharing. */ + error_report("Host-side INTx sharing not supported, " + "using MSI instead.\n" + "Some devices do not to work properly in this mode."); + dev->features |= ASSIGNED_DEVICE_PREFER_MSI_MASK; + goto retry; + } + error_report("Failed to assign irq for \"%s\": %s", + dev->dev.qdev.id, strerror(-r)); + error_report("Perhaps you are assigning a device " + "that shares an IRQ with another device?"); + return r; + } + + dev->intx_route = intx_route; + dev->assigned_irq_type = new_type; + return r; +} + +static void deassign_device(AssignedDevice *dev) +{ + int r; + + r = kvm_device_pci_deassign(kvm_state, dev->dev_id); + assert(r == 0); +} + +/* The pci config space got updated. Check if irq numbers have changed + * for our devices + */ +static void assigned_dev_update_irq_routing(PCIDevice *dev) +{ + AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, dev); + Error *err = NULL; + int r; + + r = assign_intx(assigned_dev); + if (r < 0) { + qdev_unplug(&dev->qdev, &err); + assert(!err); + } +} + +static void assigned_dev_update_msi(PCIDevice *pci_dev) +{ + AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap + + PCI_MSI_FLAGS); + int r; + + /* Some guests gratuitously disable MSI even if they're not using it, + * try to catch this by only deassigning irqs if the guest is using + * MSI or intends to start. */ + if (assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSI || + (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) { + r = kvm_device_msi_deassign(kvm_state, assigned_dev->dev_id); + /* -ENXIO means no assigned irq */ + if (r && r != -ENXIO) { + perror("assigned_dev_update_msi: deassign irq"); + } + + free_msi_virqs(assigned_dev); + + assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE; + pci_device_set_intx_routing_notifier(pci_dev, NULL); + } + + if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) { + uint8_t *pos = pci_dev->config + pci_dev->msi_cap; + MSIMessage msg; + int virq; + + msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO); + msg.data = pci_get_word(pos + PCI_MSI_DATA_32); + virq = kvm_irqchip_add_msi_route(kvm_state, msg); + if (virq < 0) { + perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route"); + return; + } + + assigned_dev->msi_virq = g_malloc(sizeof(*assigned_dev->msi_virq)); + assigned_dev->msi_virq_nr = 1; + assigned_dev->msi_virq[0] = virq; + if (kvm_device_msi_assign(kvm_state, assigned_dev->dev_id, virq) < 0) { + perror("assigned_dev_update_msi: kvm_device_msi_assign"); + } + + assigned_dev->intx_route.mode = PCI_INTX_DISABLED; + assigned_dev->intx_route.irq = -1; + assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI; + } else { + assign_intx(assigned_dev); + } +} + +static bool assigned_dev_msix_masked(MSIXTableEntry *entry) +{ + return (entry->ctrl & cpu_to_le32(0x1)) != 0; +} + +static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev) +{ + AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev); + uint16_t entries_nr = 0; + int i, r = 0; + MSIXTableEntry *entry = adev->msix_table; + MSIMessage msg; + + /* Get the usable entry number for allocating */ + for (i = 0; i < adev->msix_max; i++, entry++) { + if (assigned_dev_msix_masked(entry)) { + continue; + } + entries_nr++; + } + + DEBUG("MSI-X entries: %d\n", entries_nr); + + /* It's valid to enable MSI-X with all entries masked */ + if (!entries_nr) { + return 0; + } + + r = kvm_device_msix_init_vectors(kvm_state, adev->dev_id, entries_nr); + if (r != 0) { + error_report("fail to set MSI-X entry number for MSIX! %s", + strerror(-r)); + return r; + } + + free_msi_virqs(adev); + + adev->msi_virq_nr = adev->msix_max; + adev->msi_virq = g_malloc(adev->msix_max * sizeof(*adev->msi_virq)); + + entry = adev->msix_table; + for (i = 0; i < adev->msix_max; i++, entry++) { + adev->msi_virq[i] = -1; + + if (assigned_dev_msix_masked(entry)) { + continue; + } + + msg.address = entry->addr_lo | ((uint64_t)entry->addr_hi << 32); + msg.data = entry->data; + r = kvm_irqchip_add_msi_route(kvm_state, msg); + if (r < 0) { + return r; + } + adev->msi_virq[i] = r; + + DEBUG("MSI-X vector %d, gsi %d, addr %08x_%08x, data %08x\n", i, + r, entry->addr_hi, entry->addr_lo, entry->data); + + r = kvm_device_msix_set_vector(kvm_state, adev->dev_id, i, + adev->msi_virq[i]); + if (r) { + error_report("fail to set MSI-X entry! %s", strerror(-r)); + break; + } + } + + return r; +} + +static void assigned_dev_update_msix(PCIDevice *pci_dev) +{ + AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap + + PCI_MSIX_FLAGS); + int r; + + /* Some guests gratuitously disable MSIX even if they're not using it, + * try to catch this by only deassigning irqs if the guest is using + * MSIX or intends to start. */ + if ((assigned_dev->assigned_irq_type == ASSIGNED_IRQ_MSIX) || + (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) { + r = kvm_device_msix_deassign(kvm_state, assigned_dev->dev_id); + /* -ENXIO means no assigned irq */ + if (r && r != -ENXIO) { + perror("assigned_dev_update_msix: deassign irq"); + } + + free_msi_virqs(assigned_dev); + + assigned_dev->assigned_irq_type = ASSIGNED_IRQ_NONE; + pci_device_set_intx_routing_notifier(pci_dev, NULL); + } + + if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) { + if (assigned_dev_update_msix_mmio(pci_dev) < 0) { + perror("assigned_dev_update_msix_mmio"); + return; + } + + if (assigned_dev->msi_virq_nr > 0) { + if (kvm_device_msix_assign(kvm_state, assigned_dev->dev_id) < 0) { + perror("assigned_dev_enable_msix: assign irq"); + return; + } + } + assigned_dev->intx_route.mode = PCI_INTX_DISABLED; + assigned_dev->intx_route.irq = -1; + assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSIX; + } else { + assign_intx(assigned_dev); + } +} + +static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev, + uint32_t address, int len) +{ + AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + uint32_t virt_val = pci_default_read_config(pci_dev, address, len); + uint32_t real_val, emulate_mask, full_emulation_mask; + + emulate_mask = 0; + memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len); + emulate_mask = le32_to_cpu(emulate_mask); + + full_emulation_mask = 0xffffffff >> (32 - len * 8); + + if (emulate_mask != full_emulation_mask) { + real_val = assigned_dev_pci_read(pci_dev, address, len); + return (virt_val & emulate_mask) | (real_val & ~emulate_mask); + } else { + return virt_val; + } +} + +static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address, + uint32_t val, int len) +{ + AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + uint16_t old_cmd = pci_get_word(pci_dev->config + PCI_COMMAND); + uint32_t emulate_mask, full_emulation_mask; + int ret; + + pci_default_write_config(pci_dev, address, val, len); + + if (kvm_has_intx_set_mask() && + range_covers_byte(address, len, PCI_COMMAND + 1)) { + bool intx_masked = (pci_get_word(pci_dev->config + PCI_COMMAND) & + PCI_COMMAND_INTX_DISABLE); + + if (intx_masked != !!(old_cmd & PCI_COMMAND_INTX_DISABLE)) { + ret = kvm_device_intx_set_mask(kvm_state, assigned_dev->dev_id, + intx_masked); + if (ret) { + perror("assigned_dev_pci_write_config: set intx mask"); + } + } + } + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { + if (range_covers_byte(address, len, + pci_dev->msi_cap + PCI_MSI_FLAGS)) { + assigned_dev_update_msi(pci_dev); + } + } + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { + if (range_covers_byte(address, len, + pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) { + assigned_dev_update_msix(pci_dev); + } + } + + emulate_mask = 0; + memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len); + emulate_mask = le32_to_cpu(emulate_mask); + + full_emulation_mask = 0xffffffff >> (32 - len * 8); + + if (emulate_mask != full_emulation_mask) { + if (emulate_mask) { + val &= ~emulate_mask; + val |= assigned_dev_pci_read(pci_dev, address, len) & emulate_mask; + } + assigned_dev_pci_write(pci_dev, address, val, len); + } +} + +static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset, + uint32_t len) +{ + assigned_dev_direct_config_read(dev, offset, len); + assigned_dev_emulate_config_read(dev, offset + PCI_CAP_LIST_NEXT, 1); +} + +static int assigned_device_pci_cap_init(PCIDevice *pci_dev) +{ + AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + PCIRegion *pci_region = dev->real_device.regions; + int ret, pos; + + /* Clear initial capabilities pointer and status copied from hw */ + pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0); + pci_set_word(pci_dev->config + PCI_STATUS, + pci_get_word(pci_dev->config + PCI_STATUS) & + ~PCI_STATUS_CAP_LIST); + + /* Expose MSI capability + * MSI capability is the 1st capability in capability config */ + pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0); + if (pos != 0 && kvm_check_extension(kvm_state, KVM_CAP_ASSIGN_DEV_IRQ)) { + if (!check_irqchip_in_kernel()) { + return -ENOTSUP; + } + dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI; + /* Only 32-bit/no-mask currently supported */ + ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10); + if (ret < 0) { + return ret; + } + pci_dev->msi_cap = pos; + + pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS, + pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) & + PCI_MSI_FLAGS_QMASK); + pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0); + pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0); + + /* Set writable fields */ + pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS, + PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE); + pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc); + pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff); + } + /* Expose MSI-X capability */ + pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0); + if (pos != 0 && kvm_device_msix_supported(kvm_state)) { + int bar_nr; + uint32_t msix_table_entry; + + if (!check_irqchip_in_kernel()) { + return -ENOTSUP; + } + dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX; + ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12); + if (ret < 0) { + return ret; + } + pci_dev->msix_cap = pos; + + pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS, + pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) & + PCI_MSIX_FLAGS_QSIZE); + + /* Only enable and function mask bits are writable */ + pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS, + PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL); + + msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE); + bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK; + msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK; + dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry; + dev->msix_max = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS); + dev->msix_max &= PCI_MSIX_FLAGS_QSIZE; + dev->msix_max += 1; + } + + /* Minimal PM support, nothing writable, device appears to NAK changes */ + pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0); + if (pos) { + uint16_t pmc; + + ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos, PCI_PM_SIZEOF); + if (ret < 0) { + return ret; + } + + assigned_dev_setup_cap_read(dev, pos, PCI_PM_SIZEOF); + + pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS); + pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI); + pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc); + + /* assign_device will bring the device up to D0, so we don't need + * to worry about doing that ourselves here. */ + pci_set_word(pci_dev->config + pos + PCI_PM_CTRL, + PCI_PM_CTRL_NO_SOFT_RESET); + + pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0); + pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0); + } + + pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0); + if (pos) { + uint8_t version, size = 0; + uint16_t type, devctl, lnksta; + uint32_t devcap, lnkcap; + + version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS); + version &= PCI_EXP_FLAGS_VERS; + if (version == 1) { + size = 0x14; + } else if (version == 2) { + /* + * Check for non-std size, accept reduced size to 0x34, + * which is what bcm5761 implemented, violating the + * PCIe v3.0 spec that regs should exist and be read as 0, + * not optionally provided and shorten the struct size. + */ + size = MIN(0x3c, PCI_CONFIG_SPACE_SIZE - pos); + if (size < 0x34) { + error_report("%s: Invalid size PCIe cap-id 0x%x", + __func__, PCI_CAP_ID_EXP); + return -EINVAL; + } else if (size != 0x3c) { + error_report("WARNING, %s: PCIe cap-id 0x%x has " + "non-standard size 0x%x; std size should be 0x3c", + __func__, PCI_CAP_ID_EXP, size); + } + } else if (version == 0) { + uint16_t vid, did; + vid = pci_get_word(pci_dev->config + PCI_VENDOR_ID); + did = pci_get_word(pci_dev->config + PCI_DEVICE_ID); + if (vid == PCI_VENDOR_ID_INTEL && did == 0x10ed) { + /* + * quirk for Intel 82599 VF with invalid PCIe capability + * version, should really be version 2 (same as PF) + */ + size = 0x3c; + } + } + + if (size == 0) { + error_report("%s: Unsupported PCI express capability version %d", + __func__, version); + return -EINVAL; + } + + ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP, pos, size); + if (ret < 0) { + return ret; + } + + assigned_dev_setup_cap_read(dev, pos, size); + + type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS); + type = (type & PCI_EXP_FLAGS_TYPE) >> 4; + if (type != PCI_EXP_TYPE_ENDPOINT && + type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) { + error_report("Device assignment only supports endpoint assignment," + " device type %d", type); + return -EINVAL; + } + + /* capabilities, pass existing read-only copy + * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */ + + /* device capabilities: hide FLR */ + devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP); + devcap &= ~PCI_EXP_DEVCAP_FLR; + pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap); + + /* device control: clear all error reporting enable bits, leaving + * only a few host values. Note, these are + * all writable, but not passed to hw. + */ + devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL); + devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) | + PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN; + pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl); + devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME; + pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl); + + /* Clear device status */ + pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0); + + /* Link capabilities, expose links and latencues, clear reporting */ + lnkcap = pci_get_long(pci_dev->config + pos + PCI_EXP_LNKCAP); + lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW | + PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL | + PCI_EXP_LNKCAP_L1EL); + pci_set_long(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap); + + /* Link control, pass existing read-only copy. Should be writable? */ + + /* Link status, only expose current speed and width */ + lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA); + lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW); + pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta); + + if (version >= 2) { + /* Slot capabilities, control, status - not needed for endpoints */ + pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0); + pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0); + pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0); + + /* Root control, capabilities, status - not needed for endpoints */ + pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0); + pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0); + pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0); + + /* Device capabilities/control 2, pass existing read-only copy */ + /* Link control 2, pass existing read-only copy */ + } + } + + pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0); + if (pos) { + uint16_t cmd; + uint32_t status; + + /* Only expose the minimum, 8 byte capability */ + ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8); + if (ret < 0) { + return ret; + } + + assigned_dev_setup_cap_read(dev, pos, 8); + + /* Command register, clear upper bits, including extended modes */ + cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD); + cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ | + PCI_X_CMD_MAX_SPLIT); + pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd); + + /* Status register, update with emulated PCI bus location, clear + * error bits, leave the rest. */ + status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS); + status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN); + status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn; + status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL | + PCI_X_STATUS_SPL_ERR); + pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status); + } + + pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0); + if (pos) { + /* Direct R/W passthrough */ + ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8); + if (ret < 0) { + return ret; + } + + assigned_dev_setup_cap_read(dev, pos, 8); + + /* direct write for cap content */ + assigned_dev_direct_config_write(dev, pos + 2, 6); + } + + /* Devices can have multiple vendor capabilities, get them all */ + for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos)); + pos += PCI_CAP_LIST_NEXT) { + uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS); + /* Direct R/W passthrough */ + ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR, pos, len); + if (ret < 0) { + return ret; + } + + assigned_dev_setup_cap_read(dev, pos, len); + + /* direct write for cap content */ + assigned_dev_direct_config_write(dev, pos + 2, len - 2); + } + + /* If real and virtual capability list status bits differ, virtualize the + * access. */ + if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_LIST) != + (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) { + dev->emulate_config_read[PCI_STATUS] |= PCI_STATUS_CAP_LIST; + } + + return 0; +} + +static uint64_t +assigned_dev_msix_mmio_read(void *opaque, target_phys_addr_t addr, + unsigned size) +{ + AssignedDevice *adev = opaque; + uint64_t val; + + memcpy(&val, (void *)((uint8_t *)adev->msix_table + addr), size); + + return val; +} + +static void assigned_dev_msix_mmio_write(void *opaque, target_phys_addr_t addr, + uint64_t val, unsigned size) +{ + AssignedDevice *adev = opaque; + PCIDevice *pdev = &adev->dev; + uint16_t ctrl; + MSIXTableEntry orig; + int i = addr >> 4; + + if (i >= adev->msix_max) { + return; /* Drop write */ + } + + ctrl = pci_get_word(pdev->config + pdev->msix_cap + PCI_MSIX_FLAGS); + + DEBUG("write to MSI-X table offset 0x%lx, val 0x%lx\n", addr, val); + + if (ctrl & PCI_MSIX_FLAGS_ENABLE) { + orig = adev->msix_table[i]; + } + + memcpy((uint8_t *)adev->msix_table + addr, &val, size); + + if (ctrl & PCI_MSIX_FLAGS_ENABLE) { + MSIXTableEntry *entry = &adev->msix_table[i]; + + if (!assigned_dev_msix_masked(&orig) && + assigned_dev_msix_masked(entry)) { + /* + * Vector masked, disable it + * + * XXX It's not clear if we can or should actually attempt + * to mask or disable the interrupt. KVM doesn't have + * support for pending bits and kvm_assign_set_msix_entry + * doesn't modify the device hardware mask. Interrupts + * while masked are simply not injected to the guest, so + * are lost. Can we get away with always injecting an + * interrupt on unmask? + */ + } else if (assigned_dev_msix_masked(&orig) && + !assigned_dev_msix_masked(entry)) { + /* Vector unmasked */ + if (i >= adev->msi_virq_nr || adev->msi_virq[i] < 0) { + /* Previously unassigned vector, start from scratch */ + assigned_dev_update_msix(pdev); + return; + } else { + /* Update an existing, previously masked vector */ + MSIMessage msg; + int ret; + + msg.address = entry->addr_lo | + ((uint64_t)entry->addr_hi << 32); + msg.data = entry->data; + + ret = kvm_irqchip_update_msi_route(kvm_state, + adev->msi_virq[i], msg); + if (ret) { + error_report("Error updating irq routing entry (%d)", ret); + } + } + } + } +} + +static const MemoryRegionOps assigned_dev_msix_mmio_ops = { + .read = assigned_dev_msix_mmio_read, + .write = assigned_dev_msix_mmio_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .valid = { + .min_access_size = 4, + .max_access_size = 8, + }, + .impl = { + .min_access_size = 4, + .max_access_size = 8, + }, +}; + +static void assigned_dev_msix_reset(AssignedDevice *dev) +{ + MSIXTableEntry *entry; + int i; + + if (!dev->msix_table) { + return; + } + + memset(dev->msix_table, 0, MSIX_PAGE_SIZE); + + for (i = 0, entry = dev->msix_table; i < dev->msix_max; i++, entry++) { + entry->ctrl = cpu_to_le32(0x1); /* Masked */ + } +} + +static int assigned_dev_register_msix_mmio(AssignedDevice *dev) +{ + dev->msix_table = mmap(NULL, MSIX_PAGE_SIZE, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); + if (dev->msix_table == MAP_FAILED) { + error_report("fail allocate msix_table! %s", strerror(errno)); + return -EFAULT; + } + + assigned_dev_msix_reset(dev); + + memory_region_init_io(&dev->mmio, &assigned_dev_msix_mmio_ops, dev, + "assigned-dev-msix", MSIX_PAGE_SIZE); + return 0; +} + +static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev) +{ + if (!dev->msix_table) { + return; + } + + memory_region_destroy(&dev->mmio); + + if (munmap(dev->msix_table, MSIX_PAGE_SIZE) == -1) { + error_report("error unmapping msix_table! %s", strerror(errno)); + } + dev->msix_table = NULL; +} + +static const VMStateDescription vmstate_assigned_device = { + .name = "pci-assign", + .unmigratable = 1, +}; + +static void reset_assigned_device(DeviceState *dev) +{ + PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, dev); + AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev); + char reset_file[64]; + const char reset[] = "1"; + int fd, ret; + + /* + * If a guest is reset without being shutdown, MSI/MSI-X can still + * be running. We want to return the device to a known state on + * reset, so disable those here. We especially do not want MSI-X + * enabled since it lives in MMIO space, which is about to get + * disabled. + */ + if (adev->assigned_irq_type == ASSIGNED_IRQ_MSIX) { + uint16_t ctrl = pci_get_word(pci_dev->config + + pci_dev->msix_cap + PCI_MSIX_FLAGS); + + pci_set_word(pci_dev->config + pci_dev->msix_cap + PCI_MSIX_FLAGS, + ctrl & ~PCI_MSIX_FLAGS_ENABLE); + assigned_dev_update_msix(pci_dev); + } else if (adev->assigned_irq_type == ASSIGNED_IRQ_MSI) { + uint8_t ctrl = pci_get_byte(pci_dev->config + + pci_dev->msi_cap + PCI_MSI_FLAGS); + + pci_set_byte(pci_dev->config + pci_dev->msi_cap + PCI_MSI_FLAGS, + ctrl & ~PCI_MSI_FLAGS_ENABLE); + assigned_dev_update_msi(pci_dev); + } + + snprintf(reset_file, sizeof(reset_file), + "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset", + adev->host.domain, adev->host.bus, adev->host.slot, + adev->host.function); + + /* + * Issue a device reset via pci-sysfs. Note that we use write(2) here + * and ignore the return value because some kernels have a bug that + * returns 0 rather than bytes written on success, sending us into an + * infinite retry loop using other write mechanisms. + */ + fd = open(reset_file, O_WRONLY); + if (fd != -1) { + ret = write(fd, reset, strlen(reset)); + (void)ret; + close(fd); + } + + /* + * When a 0 is written to the bus master register, the device is logically + * disconnected from the PCI bus. This avoids further DMA transfers. + */ + assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 1); +} + +static int assigned_initfn(struct PCIDevice *pci_dev) +{ + AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + uint8_t e_intx; + int r; + + if (!kvm_enabled()) { + error_report("pci-assign: error: requires KVM support"); + return -1; + } + + if (!dev->host.domain && !dev->host.bus && !dev->host.slot && + !dev->host.function) { + error_report("pci-assign: error: no host device specified"); + return -1; + } + + /* + * Set up basic config space access control. Will be further refined during + * device initialization. + */ + assigned_dev_emulate_config_read(dev, 0, PCI_CONFIG_SPACE_SIZE); + assigned_dev_direct_config_read(dev, PCI_STATUS, 2); + assigned_dev_direct_config_read(dev, PCI_REVISION_ID, 1); + assigned_dev_direct_config_read(dev, PCI_CLASS_PROG, 3); + assigned_dev_direct_config_read(dev, PCI_CACHE_LINE_SIZE, 1); + assigned_dev_direct_config_read(dev, PCI_LATENCY_TIMER, 1); + assigned_dev_direct_config_read(dev, PCI_BIST, 1); + assigned_dev_direct_config_read(dev, PCI_CARDBUS_CIS, 4); + assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_VENDOR_ID, 2); + assigned_dev_direct_config_read(dev, PCI_SUBSYSTEM_ID, 2); + assigned_dev_direct_config_read(dev, PCI_CAPABILITY_LIST + 1, 7); + assigned_dev_direct_config_read(dev, PCI_MIN_GNT, 1); + assigned_dev_direct_config_read(dev, PCI_MAX_LAT, 1); + memcpy(dev->emulate_config_write, dev->emulate_config_read, + sizeof(dev->emulate_config_read)); + + if (get_real_device(dev, dev->host.domain, dev->host.bus, + dev->host.slot, dev->host.function)) { + error_report("pci-assign: Error: Couldn't get real device (%s)!", + dev->dev.qdev.id); + goto out; + } + + if (assigned_device_pci_cap_init(pci_dev) < 0) { + goto out; + } + + /* intercept MSI-X entry page in the MMIO */ + if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { + if (assigned_dev_register_msix_mmio(dev)) { + goto out; + } + } + + /* handle real device's MMIO/PIO BARs */ + if (assigned_dev_register_regions(dev->real_device.regions, + dev->real_device.region_number, + dev)) { + goto out; + } + + /* handle interrupt routing */ + e_intx = dev->dev.config[PCI_INTERRUPT_PIN] - 1; + dev->intpin = e_intx; + dev->intx_route.mode = PCI_INTX_DISABLED; + dev->intx_route.irq = -1; + + /* assign device to guest */ + r = assign_device(dev); + if (r < 0) { + goto out; + } + + /* assign legacy INTx to the device */ + r = assign_intx(dev); + if (r < 0) { + goto assigned_out; + } + + assigned_dev_load_option_rom(dev); + + add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL); + + return 0; + +assigned_out: + deassign_device(dev); +out: + free_assigned_device(dev); + return -1; +} + +static void assigned_exitfn(struct PCIDevice *pci_dev) +{ + AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + + deassign_device(dev); + free_assigned_device(dev); +} + +static Property assigned_dev_properties[] = { + DEFINE_PROP_PCI_HOST_DEVADDR("host", AssignedDevice, host), + DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features, + ASSIGNED_DEVICE_PREFER_MSI_BIT, false), + DEFINE_PROP_BIT("share_intx", AssignedDevice, features, + ASSIGNED_DEVICE_SHARE_INTX_BIT, true), + DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1), + DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name), + DEFINE_PROP_END_OF_LIST(), +}; + +static void assign_class_init(ObjectClass *klass, void *data) +{ + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + k->init = assigned_initfn; + k->exit = assigned_exitfn; + k->config_read = assigned_dev_pci_read_config; + k->config_write = assigned_dev_pci_write_config; + dc->props = assigned_dev_properties; + dc->vmsd = &vmstate_assigned_device; + dc->reset = reset_assigned_device; + dc->desc = "KVM-based PCI passthrough"; +} + +static const TypeInfo assign_info = { + .name = "kvm-pci-assign", + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(AssignedDevice), + .class_init = assign_class_init, +}; + +static void assign_register_types(void) +{ + type_register_static(&assign_info); +} + +type_init(assign_register_types) + +/* + * Scan the assigned devices for the devices that have an option ROM, and then + * load the corresponding ROM data to RAM. If an error occurs while loading an + * option ROM, we just ignore that option ROM and continue with the next one. + */ +static void assigned_dev_load_option_rom(AssignedDevice *dev) +{ + char name[32], rom_file[64]; + FILE *fp; + uint8_t val; + struct stat st; + void *ptr; + + /* If loading ROM from file, pci handles it */ + if (dev->dev.romfile || !dev->dev.rom_bar) { + return; + } + + snprintf(rom_file, sizeof(rom_file), + "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom", + dev->host.domain, dev->host.bus, dev->host.slot, + dev->host.function); + + if (stat(rom_file, &st)) { + return; + } + + if (access(rom_file, F_OK)) { + error_report("pci-assign: Insufficient privileges for %s", rom_file); + return; + } + + /* Write "1" to the ROM file to enable it */ + fp = fopen(rom_file, "r+"); + if (fp == NULL) { + return; + } + val = 1; + if (fwrite(&val, 1, 1, fp) != 1) { + goto close_rom; + } + fseek(fp, 0, SEEK_SET); + + snprintf(name, sizeof(name), "%s.rom", + object_get_typename(OBJECT(dev))); + memory_region_init_ram(&dev->dev.rom, name, st.st_size); + vmstate_register_ram(&dev->dev.rom, &dev->dev.qdev); + ptr = memory_region_get_ram_ptr(&dev->dev.rom); + memset(ptr, 0xff, st.st_size); + + if (!fread(ptr, 1, st.st_size, fp)) { + error_report("pci-assign: Cannot read from host %s\n" + "\tDevice option ROM contents are probably invalid " + "(check dmesg).\n\tSkip option ROM probe with rombar=0, " + "or load from file with romfile=", rom_file); + memory_region_destroy(&dev->dev.rom); + goto close_rom; + } + + pci_register_bar(&dev->dev, PCI_ROM_SLOT, 0, &dev->dev.rom); + dev->dev.has_rom = true; +close_rom: + /* Write "0" to disable ROM */ + fseek(fp, 0, SEEK_SET); + val = 0; + if (!fwrite(&val, 1, 1, fp)) { + DEBUG("%s\n", "Failed to disable pci-sysfs rom file"); + } + fclose(fp); +} diff --git a/hw/qdev-monitor.c b/hw/qdev-monitor.c index 33b7f79a94..479eecda31 100644 --- a/hw/qdev-monitor.c +++ b/hw/qdev-monitor.c @@ -44,6 +44,7 @@ static const QDevAlias qdev_alias_table[] = { { "virtio-serial-s390", "virtio-serial", QEMU_ARCH_S390X }, { "lsi53c895a", "lsi" }, { "ich9-ahci", "ahci" }, + { "kvm-pci-assign", "pci-assign" }, { } };